library packages
This commit is contained in:
@@ -0,0 +1,98 @@
|
||||
"""A variety of linear models."""
|
||||
|
||||
# See https://scikit-learn.org/stable/modules/sgd.html and
# https://scikit-learn.org/stable/modules/linear_model.html for
# complete documentation.
|
||||
|
||||
from ._base import LinearRegression
|
||||
from ._bayes import ARDRegression, BayesianRidge
|
||||
from ._coordinate_descent import (
|
||||
ElasticNet,
|
||||
ElasticNetCV,
|
||||
Lasso,
|
||||
LassoCV,
|
||||
MultiTaskElasticNet,
|
||||
MultiTaskElasticNetCV,
|
||||
MultiTaskLasso,
|
||||
MultiTaskLassoCV,
|
||||
enet_path,
|
||||
lasso_path,
|
||||
)
|
||||
from ._glm import GammaRegressor, PoissonRegressor, TweedieRegressor
|
||||
from ._huber import HuberRegressor
|
||||
from ._least_angle import (
|
||||
Lars,
|
||||
LarsCV,
|
||||
LassoLars,
|
||||
LassoLarsCV,
|
||||
LassoLarsIC,
|
||||
lars_path,
|
||||
lars_path_gram,
|
||||
)
|
||||
from ._logistic import LogisticRegression, LogisticRegressionCV
|
||||
from ._omp import (
|
||||
OrthogonalMatchingPursuit,
|
||||
OrthogonalMatchingPursuitCV,
|
||||
orthogonal_mp,
|
||||
orthogonal_mp_gram,
|
||||
)
|
||||
from ._passive_aggressive import PassiveAggressiveClassifier, PassiveAggressiveRegressor
|
||||
from ._perceptron import Perceptron
|
||||
from ._quantile import QuantileRegressor
|
||||
from ._ransac import RANSACRegressor
|
||||
from ._ridge import Ridge, RidgeClassifier, RidgeClassifierCV, RidgeCV, ridge_regression
|
||||
from ._sgd_fast import Hinge, Huber, Log, ModifiedHuber, SquaredLoss
|
||||
from ._stochastic_gradient import SGDClassifier, SGDOneClassSVM, SGDRegressor
|
||||
from ._theil_sen import TheilSenRegressor
|
||||
|
||||
__all__ = [
|
||||
"ARDRegression",
|
||||
"BayesianRidge",
|
||||
"ElasticNet",
|
||||
"ElasticNetCV",
|
||||
"Hinge",
|
||||
"Huber",
|
||||
"HuberRegressor",
|
||||
"Lars",
|
||||
"LarsCV",
|
||||
"Lasso",
|
||||
"LassoCV",
|
||||
"LassoLars",
|
||||
"LassoLarsCV",
|
||||
"LassoLarsIC",
|
||||
"LinearRegression",
|
||||
"Log",
|
||||
"LogisticRegression",
|
||||
"LogisticRegressionCV",
|
||||
"ModifiedHuber",
|
||||
"MultiTaskElasticNet",
|
||||
"MultiTaskElasticNetCV",
|
||||
"MultiTaskLasso",
|
||||
"MultiTaskLassoCV",
|
||||
"OrthogonalMatchingPursuit",
|
||||
"OrthogonalMatchingPursuitCV",
|
||||
"PassiveAggressiveClassifier",
|
||||
"PassiveAggressiveRegressor",
|
||||
"Perceptron",
|
||||
"QuantileRegressor",
|
||||
"Ridge",
|
||||
"RidgeCV",
|
||||
"RidgeClassifier",
|
||||
"RidgeClassifierCV",
|
||||
"SGDClassifier",
|
||||
"SGDRegressor",
|
||||
"SGDOneClassSVM",
|
||||
"SquaredLoss",
|
||||
"TheilSenRegressor",
|
||||
"enet_path",
|
||||
"lars_path",
|
||||
"lars_path_gram",
|
||||
"lasso_path",
|
||||
"orthogonal_mp",
|
||||
"orthogonal_mp_gram",
|
||||
"ridge_regression",
|
||||
"RANSACRegressor",
|
||||
"PoissonRegressor",
|
||||
"GammaRegressor",
|
||||
"TweedieRegressor",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
850
.venv/lib/python3.12/site-packages/sklearn/linear_model/_base.py
Normal file
850
.venv/lib/python3.12/site-packages/sklearn/linear_model/_base.py
Normal file
@@ -0,0 +1,850 @@
|
||||
"""
|
||||
Generalized Linear Models.
|
||||
"""
|
||||
|
||||
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
|
||||
# Fabian Pedregosa <fabian.pedregosa@inria.fr>
|
||||
# Olivier Grisel <olivier.grisel@ensta.org>
|
||||
# Vincent Michel <vincent.michel@inria.fr>
|
||||
# Peter Prettenhofer <peter.prettenhofer@gmail.com>
|
||||
# Mathieu Blondel <mathieu@mblondel.org>
|
||||
# Lars Buitinck
|
||||
# Maryan Morel <maryan.morel@polytechnique.edu>
|
||||
# Giorgio Patrini <giorgio.patrini@anu.edu.au>
|
||||
# Maria Telenczuk <https://github.com/maikia>
|
||||
# License: BSD 3 clause
|
||||
|
||||
import numbers
|
||||
import warnings
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from numbers import Integral
|
||||
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
from scipy import linalg, optimize, sparse
|
||||
from scipy.sparse.linalg import lsqr
|
||||
from scipy.special import expit
|
||||
|
||||
from ..base import (
|
||||
BaseEstimator,
|
||||
ClassifierMixin,
|
||||
MultiOutputMixin,
|
||||
RegressorMixin,
|
||||
_fit_context,
|
||||
)
|
||||
from ..utils import check_array, check_random_state
|
||||
from ..utils._array_api import (
|
||||
_asarray_with_order,
|
||||
_average,
|
||||
get_namespace,
|
||||
get_namespace_and_device,
|
||||
indexing_dtype,
|
||||
supported_float_dtypes,
|
||||
)
|
||||
from ..utils._seq_dataset import (
|
||||
ArrayDataset32,
|
||||
ArrayDataset64,
|
||||
CSRDataset32,
|
||||
CSRDataset64,
|
||||
)
|
||||
from ..utils.extmath import safe_sparse_dot
|
||||
from ..utils.parallel import Parallel, delayed
|
||||
from ..utils.sparsefuncs import mean_variance_axis
|
||||
from ..utils.validation import _check_sample_weight, check_is_fitted
|
||||
|
||||
# TODO: bayesian_ridge_regression and bayesian_regression_ard
# should be squashed into their respective objects.
|
||||
|
||||
SPARSE_INTERCEPT_DECAY = 0.01
|
||||
# For sparse data intercept updates are scaled by this decay factor to avoid
|
||||
# intercept oscillation.
|
||||
|
||||
|
||||
def make_dataset(X, y, sample_weight, random_state=None):
    """Wrap dense or sparse training data in a ``Dataset`` object.

    Besides the dataset, the matching ``intercept_decay`` is returned, whose
    value depends on whether ``X`` is sparse.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data.

    y : array-like, shape (n_samples, )
        Target values.

    sample_weight : numpy array of shape (n_samples,)
        The weight of each sample.

    random_state : int, RandomState instance or None (default)
        Determines random number generation for dataset random sampling. It is
        not used for dataset shuffling.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    dataset
        The ``Dataset`` abstraction.
    intercept_decay
        ``SPARSE_INTERCEPT_DECAY`` for sparse ``X``, ``1.0`` otherwise.
    """
    # The sequential datasets must never be seeded with 0, hence the lower
    # bound of 1 on the drawn seed.
    seed = check_random_state(random_state).randint(1, np.iinfo(np.int32).max)

    # float32 input gets the 32-bit dataset classes; everything else is
    # handled by the 64-bit ones.
    single_precision = X.dtype == np.float32

    if sp.issparse(X):
        csr_cls = CSRDataset32 if single_precision else CSRDataset64
        sparse_dataset = csr_cls(
            X.data, X.indptr, X.indices, y, sample_weight, seed=seed
        )
        return sparse_dataset, SPARSE_INTERCEPT_DECAY

    array_cls = ArrayDataset32 if single_precision else ArrayDataset64
    contiguous_X = np.ascontiguousarray(X)
    return array_cls(contiguous_X, y, sample_weight, seed=seed), 1.0
|
||||
|
||||
|
||||
def _preprocess_data(
    X,
    y,
    *,
    fit_intercept,
    copy=True,
    copy_y=True,
    sample_weight=None,
    check_input=True,
):
    """Common data preprocessing for fitting linear models.

    This helper is in charge of the following steps:

    - Ensure that `sample_weight` is an array or `None`.
    - If `check_input=True`, perform standard input validation of `X`, `y`.
    - Perform copies if requested to avoid side-effects in case of inplace
      modifications of the input.

    Then, if `fit_intercept=True` this preprocessing centers both `X` and `y` as
    follows:
        - if `X` is dense, center the data and
        store the mean vector in `X_offset`.
        - if `X` is sparse, store the mean in `X_offset`
        without centering `X`. The centering is expected to be handled by the
        linear solver where appropriate.
        - in either case, always center `y` and store the mean in `y_offset`.
        - both `X_offset` and `y_offset` are always weighted by `sample_weight`
          if not set to `None`.

    If `fit_intercept=False`, no centering is performed and `X_offset`, `y_offset`
    are set to zero.

    Parameters
    ----------
    X : {ndarray, sparse matrix} of shape (n_samples, n_features)
        Training data. Must be 2-dimensional.
    y : ndarray of shape (n_samples,) or (n_samples, n_targets)
        Target values; cast to the dtype of `X`.
    fit_intercept : bool
        Whether to compute the offsets and center the data.
    copy : bool, default=True
        Whether `X` is copied before any inplace modification.
    copy_y : bool, default=True
        Whether `y` is copied before any inplace modification.
    sample_weight : array-like, number or None, default=None
        Weights used when averaging. A plain number is treated like `None`.
    check_input : bool, default=True
        If True, `X` and `y` go through `check_array`; otherwise only the
        dtype cast of `y` and the requested copy of `X` are performed.

    Returns
    -------
    X_out : {ndarray, sparse matrix} of shape (n_samples, n_features)
        If copy=True a copy of the input X is triggered, otherwise operations are
        inplace.
        If input X is dense, then X_out is centered.
    y_out : {ndarray, sparse matrix} of shape (n_samples,) or (n_samples, n_targets)
        Centered version of y. Possibly performed inplace on input y depending
        on the copy_y parameter.
    X_offset : ndarray of shape (n_features,)
        The mean per column of input X.
    y_offset : float or ndarray of shape (n_targets,)
        The (weighted) mean of y, or zero(s) when `fit_intercept=False`.
    X_scale : ndarray of shape (n_features,)
        Always an array of ones. TODO: refactor the code base to make it
        possible to remove this unused variable.
    """
    xp, _, device_ = get_namespace_and_device(X, y, sample_weight)
    # Only the number of features is needed below; the unpacking still
    # requires X to be 2-dimensional.
    _, n_features = X.shape
    X_is_sparse = sp.issparse(X)

    # A scalar weight is the same for every sample, so it is dropped.
    if isinstance(sample_weight, numbers.Number):
        sample_weight = None
    if sample_weight is not None:
        sample_weight = xp.asarray(sample_weight)

    if check_input:
        X = check_array(
            X, copy=copy, accept_sparse=["csr", "csc"], dtype=supported_float_dtypes(xp)
        )
        y = check_array(y, dtype=X.dtype, copy=copy_y, ensure_2d=False)
    else:
        y = xp.astype(y, X.dtype, copy=copy_y)
        if copy:
            if X_is_sparse:
                X = X.copy()
            else:
                X = _asarray_with_order(X, order="K", copy=True, xp=xp)

    dtype_ = X.dtype

    if fit_intercept:
        if X_is_sparse:
            # Only the mean is kept; sparse X itself is left uncentered.
            X_offset, _ = mean_variance_axis(X, axis=0, weights=sample_weight)
        else:
            X_offset = _average(X, axis=0, weights=sample_weight, xp=xp)

            X_offset = xp.astype(X_offset, X.dtype, copy=False)
            X -= X_offset

        y_offset = _average(y, axis=0, weights=sample_weight, xp=xp)
        y -= y_offset
    else:
        X_offset = xp.zeros(n_features, dtype=X.dtype, device=device_)
        if y.ndim == 1:
            y_offset = xp.asarray(0.0, dtype=dtype_, device=device_)
        else:
            y_offset = xp.zeros(y.shape[1], dtype=dtype_, device=device_)

    # XXX: X_scale is no longer needed. It is an historic artifact from the
    # time where linear model exposed the normalize parameter.
    X_scale = xp.ones(n_features, dtype=X.dtype, device=device_)
    return X, y, X_offset, y_offset, X_scale
|
||||
|
||||
|
||||
# TODO: _rescale_data should be factored into _preprocess_data.
|
||||
# Currently, the fact that sag implements its own way to deal with
|
||||
# sample_weight makes the refactoring tricky.
|
||||
|
||||
|
||||
def _rescale_data(X, y, sample_weight, inplace=False):
    """Scale each sample of X and y by the square root of its weight.

    This gives many linear models cheap `sample_weight` support, because the
    weighted objective

        (y - X w)' S (y - X w)

    with S = diag(sample_weight) is equal to the unweighted one

        ||y_rescaled - X_rescaled w||_2^2

    once we set

        y_rescaled = sqrt(S) y
        X_rescaled = sqrt(S) X

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Data to rescale row-wise.

    y : {array-like, sparse matrix}
        Targets to rescale row-wise.

    sample_weight : array-like
        Per-sample weights; their square root is applied.

    inplace : bool, default=False
        If True, dense `X` and `y` are multiplied in place. Sparse inputs
        always yield a new object.

    Returns
    -------
    X_rescaled : {array-like, sparse matrix}

    y_rescaled : {array-like, sparse matrix}

    sample_weight_sqrt : array-like
        The square root of `sample_weight` that was applied.
    """
    # The caller is expected to have run _validate_data and
    # _check_sample_weight already.
    xp, _ = get_namespace(X, y, sample_weight)
    n_samples = X.shape[0]
    sample_weight_sqrt = xp.sqrt(sample_weight)

    X_is_sparse = sp.issparse(X)
    y_is_sparse = sp.issparse(y)

    # Sparse inputs are rescaled by left-multiplication with a diagonal matrix.
    if X_is_sparse or y_is_sparse:
        sw_matrix = sparse.dia_matrix(
            (sample_weight_sqrt, 0), shape=(n_samples, n_samples)
        )

    if X_is_sparse:
        X = safe_sparse_dot(sw_matrix, X)
    elif inplace:
        X *= sample_weight_sqrt[:, None]
    else:
        X = X * sample_weight_sqrt[:, None]

    if y_is_sparse:
        y = safe_sparse_dot(sw_matrix, y)
    else:
        # A 2-D y needs the weights as a column so they broadcast per row.
        y_factor = sample_weight_sqrt if y.ndim == 1 else sample_weight_sqrt[:, None]
        if inplace:
            y *= y_factor
        else:
            y = y * y_factor
    return X, y, sample_weight_sqrt
|
||||
|
||||
|
||||
class LinearModel(BaseEstimator, metaclass=ABCMeta):
    """Abstract base class shared by the linear models."""

    @abstractmethod
    def fit(self, X, y):
        """Fit model."""

    def _decision_function(self, X):
        """Return ``X @ coef_ + intercept_`` for a fitted model."""
        check_is_fitted(self)

        X = self._validate_data(X, accept_sparse=["csr", "csc", "coo"], reset=False)
        weights = self.coef_
        # A 2-D coef_ is stored as (n_targets, n_features) and has to be
        # transposed before the product.
        projection = weights if weights.ndim == 1 else weights.T
        return X @ projection + self.intercept_

    def predict(self, X):
        """
        Predict using the linear model.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            Samples.

        Returns
        -------
        C : array, shape (n_samples,)
            Returns predicted values.
        """
        return self._decision_function(X)

    def _set_intercept(self, X_offset, y_offset, X_scale):
        """Set ``intercept_`` (and rescale ``coef_``) from the fit offsets."""
        xp, _ = get_namespace(X_offset, y_offset, X_scale)

        if not self.fit_intercept:
            self.intercept_ = 0.0
            return

        # coef_.dtype should always equal X.dtype; the two can differ, for
        # instance, when warm_start=True.
        rescaled = xp.divide(
            xp.astype(self.coef_, X_scale.dtype, copy=False), X_scale
        )
        self.coef_ = rescaled

        weights = rescaled if rescaled.ndim == 1 else rescaled.T
        self.intercept_ = y_offset - X_offset @ weights

    def _more_tags(self):
        return {"requires_y": True}
|
||||
|
||||
|
||||
# XXX Should this derive from LinearModel? It should be a mixin, not an ABC.
|
||||
# Maybe the n_features checking can be moved to LinearModel.
|
||||
class LinearClassifierMixin(ClassifierMixin):
    """Mixin providing prediction for linear classifiers.

    Works with both sparse and dense X.
    """

    def decision_function(self, X):
        """
        Predict confidence scores for samples.

        The confidence score for a sample is proportional to the signed
        distance of that sample to the hyperplane.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data matrix for which we want to get the confidence scores.

        Returns
        -------
        scores : ndarray of shape (n_samples,) or (n_samples, n_classes)
            Confidence scores per `(n_samples, n_classes)` combination. In the
            binary case, confidence score for `self.classes_[1]` where >0 means
            this class would be predicted.
        """
        check_is_fitted(self)
        xp, _ = get_namespace(X)

        X = self._validate_data(X, accept_sparse="csr", reset=False)
        scores = safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_
        # A single score column is flattened to a 1-D array.
        if scores.shape[1] == 1:
            return xp.reshape(scores, (-1,))
        return scores

    def predict(self, X):
        """
        Predict class labels for samples in X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data matrix for which we want to get the predictions.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Vector containing the class labels for each sample.
        """
        xp, _ = get_namespace(X)
        scores = self.decision_function(X)

        one_dimensional = len(scores.shape) == 1
        if one_dimensional:
            # Binary case: a positive score selects classes_[1], else classes_[0].
            indices = xp.astype(scores > 0, indexing_dtype(xp))
        else:
            indices = xp.argmax(scores, axis=1)

        return xp.take(self.classes_, indices, axis=0)

    def _predict_proba_lr(self, X):
        """Probability estimation for OvR logistic regression.

        Positive class probabilities are computed as
        1. / (1. + np.exp(-self.decision_function(X)));
        multiclass is handled by normalizing that over all classes.
        """
        prob = self.decision_function(X)
        # The logistic function is applied in place on the score array.
        expit(prob, out=prob)

        if prob.ndim == 1:
            return np.vstack([1 - prob, prob]).T

        # OvR normalization, like LibLinear's predict_probability
        row_totals = prob.sum(axis=1).reshape((prob.shape[0], -1))
        prob /= row_totals
        return prob
|
||||
|
||||
|
||||
class SparseCoefMixin:
    """Mixin that switches ``coef_`` between dense and CSR storage.

    L1-regularizing estimators should inherit this.
    """

    def densify(self):
        """
        Convert coefficient matrix to dense array format.

        Converts the ``coef_`` member (back) to a numpy.ndarray. This is the
        default format of ``coef_`` and is required for fitting, so calling
        this method is only required on models that have previously been
        sparsified; otherwise, it is a no-op.

        Returns
        -------
        self
            Fitted estimator.
        """
        check_is_fitted(
            self, msg="Estimator, %(name)s, must be fitted before densifying."
        )
        coefficients = self.coef_
        if sp.issparse(coefficients):
            self.coef_ = coefficients.toarray()
        return self

    def sparsify(self):
        """
        Convert coefficient matrix to sparse format.

        Converts the ``coef_`` member to a scipy.sparse matrix, which for
        L1-regularized models can be much more memory- and storage-efficient
        than the usual numpy.ndarray representation.

        The ``intercept_`` member is not converted.

        Returns
        -------
        self
            Fitted estimator.

        Notes
        -----
        For non-sparse models, i.e. when there are not many zeros in ``coef_``,
        this may actually *increase* memory usage, so use this method with
        care. A rule of thumb is that the number of zero elements, which can
        be computed with ``(coef_ == 0).sum()``, must be more than 50% for this
        to provide significant benefits.

        After calling this method, further fitting with the partial_fit
        method (if any) will not work until you call densify.
        """
        check_is_fitted(
            self, msg="Estimator, %(name)s, must be fitted before sparsifying."
        )
        self.coef_ = sp.csr_matrix(self.coef_)
        return self
|
||||
|
||||
|
||||
class LinearRegression(MultiOutputMixin, RegressorMixin, LinearModel):
    """
    Ordinary least squares Linear Regression.

    LinearRegression fits a linear model with coefficients w = (w1, ..., wp)
    to minimize the residual sum of squares between the observed targets in
    the dataset, and the targets predicted by the linear approximation.

    Parameters
    ----------
    fit_intercept : bool, default=True
        Whether to calculate the intercept for this model. If set
        to False, no intercept will be used in calculations
        (i.e. data is expected to be centered).

    copy_X : bool, default=True
        If True, X will be copied; else, it may be overwritten.

    n_jobs : int, default=None
        The number of jobs to use for the computation. This will only provide
        speedup in case of sufficiently large problems, that is if firstly
        `n_targets > 1` and secondly `X` is sparse or if `positive` is set
        to `True`. ``None`` means 1 unless in a
        :obj:`joblib.parallel_backend` context. ``-1`` means using all
        processors. See :term:`Glossary <n_jobs>` for more details.

    positive : bool, default=False
        When set to ``True``, forces the coefficients to be positive. This
        option is only supported for dense arrays.

        .. versionadded:: 0.24

    Attributes
    ----------
    coef_ : array of shape (n_features, ) or (n_targets, n_features)
        Estimated coefficients for the linear regression problem.
        If multiple targets are passed during the fit (y 2D), this
        is a 2D array of shape (n_targets, n_features), while if only
        one target is passed, this is a 1D array of length n_features.

    rank_ : int
        Rank of matrix `X`. Only available when `X` is dense.

    singular_ : array of shape (min(X, y),)
        Singular values of `X`. Only available when `X` is dense.

    intercept_ : float or array of shape (n_targets,)
        Independent term in the linear model. Set to 0.0 if
        `fit_intercept = False`.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    Ridge : Ridge regression addresses some of the
        problems of Ordinary Least Squares by imposing a penalty on the
        size of the coefficients with l2 regularization.
    Lasso : The Lasso is a linear model that estimates
        sparse coefficients with l1 regularization.
    ElasticNet : Elastic-Net is a linear regression
        model trained with both l1 and l2 -norm regularization of the
        coefficients.

    Notes
    -----
    From the implementation point of view, this is just plain Ordinary
    Least Squares (scipy.linalg.lstsq) or Non Negative Least Squares
    (scipy.optimize.nnls) wrapped as a predictor object.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.linear_model import LinearRegression
    >>> X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
    >>> # y = 1 * x_0 + 2 * x_1 + 3
    >>> y = np.dot(X, np.array([1, 2])) + 3
    >>> reg = LinearRegression().fit(X, y)
    >>> reg.score(X, y)
    1.0
    >>> reg.coef_
    array([1., 2.])
    >>> reg.intercept_
    np.float64(3.0...)
    >>> reg.predict(np.array([[3, 5]]))
    array([16.])
    """

    _parameter_constraints: dict = {
        "fit_intercept": ["boolean"],
        "copy_X": ["boolean"],
        "n_jobs": [None, Integral],
        "positive": ["boolean"],
    }

    def __init__(
        self,
        *,
        fit_intercept=True,
        copy_X=True,
        n_jobs=None,
        positive=False,
    ):
        self.fit_intercept = fit_intercept
        self.copy_X = copy_X
        self.n_jobs = n_jobs
        self.positive = positive

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, sample_weight=None):
        """
        Fit linear model.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.

        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            Target values. Will be cast to X's dtype if necessary.

        sample_weight : array-like of shape (n_samples,), default=None
            Individual weights for each sample.

            .. versionadded:: 0.17
               parameter *sample_weight* support to LinearRegression.

        Returns
        -------
        self : object
            Fitted Estimator.
        """
        n_workers = self.n_jobs

        # Sparse input is refused when positive coefficients are requested.
        accepted_formats = False if self.positive else ["csr", "csc", "coo"]

        X, y = self._validate_data(
            X,
            y,
            accept_sparse=accepted_formats,
            y_numeric=True,
            multi_output=True,
            force_writeable=True,
        )

        weighted = sample_weight is not None
        if weighted:
            sample_weight = _check_sample_weight(
                sample_weight, X, dtype=X.dtype, only_non_negative=True
            )

        # Neither _rescale_data nor the remainder of this method can take
        # advantage of in-place operations on a sparse matrix, so a sparse X
        # is never copied.
        copy_dense_X = self.copy_X and not sp.issparse(X)

        X, y, X_offset, y_offset, X_scale = _preprocess_data(
            X,
            y,
            fit_intercept=self.fit_intercept,
            copy=copy_dense_X,
            sample_weight=sample_weight,
        )

        if weighted:
            # Sample weights amount to a plain rescaling of the rows. The
            # rescaling is done in place only when _preprocess_data has
            # already copied the data on request.
            X, y, sample_weight_sqrt = _rescale_data(
                X, y, sample_weight, inplace=copy_dense_X
            )

        single_target = y.ndim < 2

        if self.positive:
            if single_target:
                self.coef_ = optimize.nnls(X, y)[0]
            else:
                # scipy.optimize.nnls cannot handle y with shape (M, K)
                per_target = Parallel(n_jobs=n_workers)(
                    delayed(optimize.nnls)(X, y[:, j]) for j in range(y.shape[1])
                )
                self.coef_ = np.vstack([result[0] for result in per_target])
        elif sp.issparse(X):
            # Sparse X was not centered by _preprocess_data; the centering is
            # applied implicitly through a LinearOperator instead.
            centering = X_offset / X_scale

            if weighted:

                def apply_forward(vec):
                    return X.dot(vec) - sample_weight_sqrt * vec.dot(centering)

                def apply_adjoint(vec):
                    return X.T.dot(vec) - centering * vec.dot(sample_weight_sqrt)

            else:

                def apply_forward(vec):
                    return X.dot(vec) - vec.dot(centering)

                def apply_adjoint(vec):
                    return X.T.dot(vec) - centering * vec.sum()

            centered_operator = sparse.linalg.LinearOperator(
                shape=X.shape, matvec=apply_forward, rmatvec=apply_adjoint
            )

            if single_target:
                self.coef_ = lsqr(centered_operator, y)[0]
            else:
                # sparse_lstsq cannot handle y with shape (M, K)
                per_target = Parallel(n_jobs=n_workers)(
                    delayed(lsqr)(centered_operator, y[:, j].ravel())
                    for j in range(y.shape[1])
                )
                self.coef_ = np.vstack([result[0] for result in per_target])
        else:
            solution, _, self.rank_, self.singular_ = linalg.lstsq(X, y)
            self.coef_ = solution.T

        if y.ndim == 1:
            self.coef_ = np.ravel(self.coef_)
        self._set_intercept(X_offset, y_offset, X_scale)
        return self
|
||||
|
||||
|
||||
def _check_precomputed_gram_matrix(
    X, precompute, X_offset, X_scale, rtol=None, atol=1e-5
):
    """Spot-check one entry of a user-supplied Gram matrix against X.

    A single element of the Gram matrix is recomputed from two (centered and
    scaled) columns of `X` and compared with the corresponding element of
    `precompute`.

    If the values do not match a ValueError will be thrown.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Data array.

    precompute : array-like of shape (n_features, n_features)
        User-supplied gram matrix.

    X_offset : ndarray of shape (n_features,)
        Array of feature means used to center design matrix.

    X_scale : ndarray of shape (n_features,)
        Array of feature scale factors used to normalize design matrix.

    rtol : float, default=None
        Relative tolerance; see numpy.allclose
        If None, it is set to 1e-4 for arrays of dtype numpy.float32 and 1e-7
        otherwise.

    atol : float, default=1e-5
        absolute tolerance; see :func:`numpy.allclose`. Note that the default
        here is more tolerant than the default for
        :func:`numpy.testing.assert_allclose`, where `atol=0`.

    Raises
    ------
    ValueError
        Raised when the provided Gram matrix is not consistent.
    """
    feature_count = X.shape[1]
    # Check the middle feature against its right-hand neighbour, clamped to
    # the last column.
    f1 = feature_count // 2
    f2 = min(f1 + 1, feature_count - 1)

    def _prepared_column(index):
        return (X[:, index] - X_offset[index]) * X_scale[index]

    expected = np.dot(_prepared_column(f1), _prepared_column(f2))
    actual = precompute[f1, f2]

    involved_dtypes = (precompute.dtype, expected.dtype)
    if rtol is None:
        # The looser float32 tolerance applies as soon as either side is float32.
        has_float32 = any(dtype == np.float32 for dtype in involved_dtypes)
        rtol = 1e-4 if has_float32 else 1e-7

    if np.isclose(expected, actual, rtol=rtol, atol=atol):
        return

    raise ValueError(
        "Gram matrix passed in via 'precompute' parameter "
        "did not pass validation when a single element was "
        "checked - please check that it was computed "
        f"properly. For element ({f1},{f2}) we computed "
        f"{expected} but the user-supplied value was "
        f"{actual}."
    )
|
||||
|
||||
|
||||
def _pre_fit(
    X,
    y,
    Xy,
    precompute,
    fit_intercept,
    copy,
    check_input=True,
    sample_weight=None,
):
    """Function used at beginning of fit in linear models with L1 or L0 penalty.

    Runs `_preprocess_data` and, where needed, also builds the Gram matrix
    `precompute` and the product `Xy`.
    """
    n_samples, n_features = X.shape
    X_is_sparse = sparse.issparse(X)

    if X_is_sparse:
        # A Gram matrix is never used with sparse X.
        precompute = False

    # Sparse X is not modified inplace, so no copy is needed for it. For dense
    # X the copy was done in fit if necessary.
    X, y, X_offset, y_offset, X_scale = _preprocess_data(
        X,
        y,
        fit_intercept=fit_intercept,
        copy=False if X_is_sparse else copy,
        check_input=check_input,
        sample_weight=sample_weight,
    )

    # Rescale only in dense case. Sparse cd solver directly deals with
    # sample_weight.
    if not X_is_sparse and sample_weight is not None:
        # This triggers copies anyway.
        X, y, _ = _rescale_data(X, y, sample_weight=sample_weight)

    if hasattr(precompute, "__array__"):
        if fit_intercept and not np.allclose(X_offset, np.zeros(n_features)):
            warnings.warn(
                (
                    "Gram matrix was provided but X was centered to fit "
                    "intercept: recomputing Gram matrix."
                ),
                UserWarning,
            )
            # TODO: instead of warning and recomputing, we could just center
            # the user provided Gram matrix a-posteriori (after making a copy
            # when `copy=True`).
            # recompute Gram
            precompute = "auto"
            Xy = None
        elif check_input:
            # If we're going to use the user's precomputed gram matrix, we
            # do a quick check to make sure its not totally bogus.
            _check_precomputed_gram_matrix(X, precompute, X_offset, X_scale)

    # precompute if n_samples > n_features
    if isinstance(precompute, str) and precompute == "auto":
        precompute = n_samples > n_features

    if precompute is True:
        # make sure that the 'precompute' array is contiguous.
        gram = np.empty(shape=(n_features, n_features), dtype=X.dtype, order="C")
        np.dot(X.T, X, out=gram)
        precompute = gram

    if not hasattr(precompute, "__array__"):
        Xy = None  # cannot use Xy if precompute is not Gram
    elif Xy is None:
        common_dtype = np.result_type(X.dtype, y.dtype)
        if y.ndim == 1:
            # Xy is 1d, make sure it is contiguous.
            Xy = np.empty(shape=n_features, dtype=common_dtype, order="C")
            np.dot(X.T, y, out=Xy)
        else:
            # Make sure that Xy is always F contiguous even if X or y are not
            # contiguous: the goal is to make it fast to extract the data for a
            # specific target.
            n_targets = y.shape[1]
            Xy = np.empty(shape=(n_features, n_targets), dtype=common_dtype, order="F")
            np.dot(y.T, X, out=Xy.T)

    return X, y, X_offset, y_offset, X_scale, precompute, Xy
|
||||
@@ -0,0 +1,791 @@
|
||||
"""
|
||||
Various bayesian regression
|
||||
"""
|
||||
|
||||
# Authors: V. Michel, F. Pedregosa, A. Gramfort
|
||||
# License: BSD 3 clause
|
||||
|
||||
from math import log
|
||||
from numbers import Integral, Real
|
||||
|
||||
import numpy as np
|
||||
from scipy import linalg
|
||||
from scipy.linalg import pinvh
|
||||
|
||||
from ..base import RegressorMixin, _fit_context
|
||||
from ..utils import _safe_indexing
|
||||
from ..utils._param_validation import Interval
|
||||
from ..utils.extmath import fast_logdet
|
||||
from ..utils.validation import _check_sample_weight
|
||||
from ._base import LinearModel, _preprocess_data, _rescale_data
|
||||
|
||||
###############################################################################
|
||||
# BayesianRidge regression
|
||||
|
||||
|
||||
class BayesianRidge(RegressorMixin, LinearModel):
|
||||
"""Bayesian ridge regression.
|
||||
|
||||
Fit a Bayesian ridge model. See the Notes section for details on this
|
||||
implementation and the optimization of the regularization parameters
|
||||
lambda (precision of the weights) and alpha (precision of the noise).
|
||||
|
||||
Read more in the :ref:`User Guide <bayesian_regression>`.
|
||||
For an intuitive visualization of how the sinusoid is approximated by
|
||||
a polynomial using different pairs of initial values, see
|
||||
:ref:`sphx_glr_auto_examples_linear_model_plot_bayesian_ridge_curvefit.py`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
max_iter : int, default=300
|
||||
Maximum number of iterations over the complete dataset before
|
||||
stopping independently of any early stopping criterion.
|
||||
|
||||
.. versionchanged:: 1.3
|
||||
|
||||
tol : float, default=1e-3
|
||||
Stop the algorithm if w has converged.
|
||||
|
||||
alpha_1 : float, default=1e-6
|
||||
Hyper-parameter : shape parameter for the Gamma distribution prior
|
||||
over the alpha parameter.
|
||||
|
||||
alpha_2 : float, default=1e-6
|
||||
Hyper-parameter : inverse scale parameter (rate parameter) for the
|
||||
Gamma distribution prior over the alpha parameter.
|
||||
|
||||
lambda_1 : float, default=1e-6
|
||||
Hyper-parameter : shape parameter for the Gamma distribution prior
|
||||
over the lambda parameter.
|
||||
|
||||
lambda_2 : float, default=1e-6
|
||||
Hyper-parameter : inverse scale parameter (rate parameter) for the
|
||||
Gamma distribution prior over the lambda parameter.
|
||||
|
||||
alpha_init : float, default=None
|
||||
Initial value for alpha (precision of the noise).
|
||||
If not set, alpha_init is 1/Var(y).
|
||||
|
||||
.. versionadded:: 0.22
|
||||
|
||||
lambda_init : float, default=None
|
||||
Initial value for lambda (precision of the weights).
|
||||
If not set, lambda_init is 1.
|
||||
|
||||
.. versionadded:: 0.22
|
||||
|
||||
compute_score : bool, default=False
|
||||
If True, compute the log marginal likelihood at each iteration of the
|
||||
optimization.
|
||||
|
||||
fit_intercept : bool, default=True
|
||||
Whether to calculate the intercept for this model.
|
||||
The intercept is not treated as a probabilistic parameter
|
||||
and thus has no associated variance. If set
|
||||
to False, no intercept will be used in calculations
|
||||
(i.e. data is expected to be centered).
|
||||
|
||||
copy_X : bool, default=True
|
||||
If True, X will be copied; else, it may be overwritten.
|
||||
|
||||
verbose : bool, default=False
|
||||
Verbose mode when fitting the model.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
coef_ : array-like of shape (n_features,)
|
||||
Coefficients of the regression model (mean of distribution)
|
||||
|
||||
intercept_ : float
|
||||
Independent term in decision function. Set to 0.0 if
|
||||
`fit_intercept = False`.
|
||||
|
||||
alpha_ : float
|
||||
Estimated precision of the noise.
|
||||
|
||||
lambda_ : float
|
||||
Estimated precision of the weights.
|
||||
|
||||
sigma_ : array-like of shape (n_features, n_features)
|
||||
Estimated variance-covariance matrix of the weights
|
||||
|
||||
scores_ : array-like of shape (n_iter_+1,)
|
||||
If `compute_score` is True, value of the log marginal likelihood (to be
|
||||
maximized) at each iteration of the optimization. The array starts
|
||||
with the value of the log marginal likelihood obtained for the initial
|
||||
values of alpha and lambda and ends with the value obtained for the
|
||||
estimated alpha and lambda.
|
||||
|
||||
n_iter_ : int
|
||||
The actual number of iterations to reach the stopping criterion.
|
||||
|
||||
X_offset_ : ndarray of shape (n_features,)
|
||||
If `fit_intercept=True`, offset subtracted for centering data to a
|
||||
zero mean. Set to np.zeros(n_features) otherwise.
|
||||
|
||||
X_scale_ : ndarray of shape (n_features,)
|
||||
Set to np.ones(n_features).
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
See Also
|
||||
--------
|
||||
ARDRegression : Bayesian ARD regression.
|
||||
|
||||
Notes
|
||||
-----
|
||||
There exist several strategies to perform Bayesian ridge regression. This
|
||||
implementation is based on the algorithm described in Appendix A of
|
||||
(Tipping, 2001) where updates of the regularization parameters are done as
|
||||
suggested in (MacKay, 1992). Note that according to A New
|
||||
View of Automatic Relevance Determination (Wipf and Nagarajan, 2008) these
|
||||
update rules do not guarantee that the marginal likelihood is increasing
|
||||
between two consecutive iterations of the optimization.
|
||||
|
||||
References
|
||||
----------
|
||||
D. J. C. MacKay, Bayesian Interpolation, Computation and Neural Systems,
|
||||
Vol. 4, No. 3, 1992.
|
||||
|
||||
M. E. Tipping, Sparse Bayesian Learning and the Relevance Vector Machine,
|
||||
Journal of Machine Learning Research, Vol. 1, 2001.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn import linear_model
|
||||
>>> clf = linear_model.BayesianRidge()
|
||||
>>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])
|
||||
BayesianRidge()
|
||||
>>> clf.predict([[1, 1]])
|
||||
array([1.])
|
||||
"""
|
||||
|
||||
_parameter_constraints: dict = {
|
||||
"max_iter": [Interval(Integral, 1, None, closed="left")],
|
||||
"tol": [Interval(Real, 0, None, closed="neither")],
|
||||
"alpha_1": [Interval(Real, 0, None, closed="left")],
|
||||
"alpha_2": [Interval(Real, 0, None, closed="left")],
|
||||
"lambda_1": [Interval(Real, 0, None, closed="left")],
|
||||
"lambda_2": [Interval(Real, 0, None, closed="left")],
|
||||
"alpha_init": [None, Interval(Real, 0, None, closed="left")],
|
||||
"lambda_init": [None, Interval(Real, 0, None, closed="left")],
|
||||
"compute_score": ["boolean"],
|
||||
"fit_intercept": ["boolean"],
|
||||
"copy_X": ["boolean"],
|
||||
"verbose": ["verbose"],
|
||||
}
|
||||
|
||||
def __init__(
    self,
    *,
    max_iter=300,
    tol=1.0e-3,
    alpha_1=1.0e-6,
    alpha_2=1.0e-6,
    lambda_1=1.0e-6,
    lambda_2=1.0e-6,
    alpha_init=None,
    lambda_init=None,
    compute_score=False,
    fit_intercept=True,
    copy_X=True,
    verbose=False,
):
    """Record the hyper-parameters on the estimator without modifying them.

    All arguments are keyword-only and are stored under attributes of the
    same name; no validation or computation happens here.
    """
    # Stopping criteria of the iterative optimisation.
    self.max_iter, self.tol = max_iter, tol

    # Gamma-prior hyper-parameters for alpha (noise) and lambda (weights).
    self.alpha_1, self.alpha_2 = alpha_1, alpha_2
    self.lambda_1, self.lambda_2 = lambda_1, lambda_2

    # Optional starting values for the two precisions.
    self.alpha_init, self.lambda_init = alpha_init, lambda_init

    # Behavioural switches.
    self.compute_score = compute_score
    self.fit_intercept = fit_intercept
    self.copy_X = copy_X
    self.verbose = verbose
|
||||
|
||||
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y, sample_weight=None):
    """Fit the model.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Training data.
    y : ndarray of shape (n_samples,)
        Target values. Will be cast to X's dtype if necessary.

    sample_weight : ndarray of shape (n_samples,), default=None
        Individual weights for each sample.

        .. versionadded:: 0.20
           parameter *sample_weight* support to BayesianRidge.

    Returns
    -------
    self : object
        Returns the instance itself.
    """
    X, y = self._validate_data(
        X, y, dtype=[np.float64, np.float32], force_writeable=True, y_numeric=True
    )
    dtype = X.dtype

    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X, dtype=dtype)

    X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data(
        X,
        y,
        fit_intercept=self.fit_intercept,
        copy=self.copy_X,
        sample_weight=sample_weight,
    )

    if sample_weight is not None:
        # Sample weight can be implemented via a simple rescaling.
        X, y, _ = _rescale_data(X, y, sample_weight)

    self.X_offset_ = X_offset_
    self.X_scale_ = X_scale_
    n_samples, n_features = X.shape

    # Initialization of the values of the parameters
    eps = np.finfo(np.float64).eps
    # Add `eps` in the denominator to omit division by zero if `np.var(y)`
    # is zero
    alpha_ = self.alpha_init
    lambda_ = self.lambda_init
    if alpha_ is None:
        alpha_ = 1.0 / (np.var(y) + eps)
    if lambda_ is None:
        lambda_ = 1.0

    # Avoid unintended type promotion to float64 with numpy 2
    alpha_ = np.asarray(alpha_, dtype=dtype)
    lambda_ = np.asarray(lambda_, dtype=dtype)

    verbose = self.verbose
    lambda_1 = self.lambda_1
    lambda_2 = self.lambda_2
    alpha_1 = self.alpha_1
    alpha_2 = self.alpha_2

    self.scores_ = list()
    coef_old_ = None

    # Quantities that do not depend on alpha_/lambda_ are computed once.
    XT_y = np.dot(X.T, y)
    U, S, Vh = linalg.svd(X, full_matrices=False)
    eigen_vals_ = S**2

    # Convergence loop of the bayesian ridge regression
    for iter_ in range(self.max_iter):
        # update posterior mean coef_ based on alpha_ and lambda_ and
        # compute corresponding rmse (the helper returns the *sum* of
        # squared residuals under that name)
        coef_, rmse_ = self._update_coef_(
            X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
        )
        if self.compute_score:
            # compute the log marginal likelihood
            s = self._log_marginal_likelihood(
                n_samples, n_features, eigen_vals_, alpha_, lambda_, coef_, rmse_
            )
            self.scores_.append(s)

        # Update alpha and lambda according to (MacKay, 1992)
        gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_))
        lambda_ = (gamma_ + 2 * lambda_1) / (np.sum(coef_**2) + 2 * lambda_2)
        alpha_ = (n_samples - gamma_ + 2 * alpha_1) / (rmse_ + 2 * alpha_2)

        # Check for convergence
        if iter_ != 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:
            if verbose:
                print("Convergence after ", str(iter_), " iterations")
            break
        coef_old_ = np.copy(coef_)

    self.n_iter_ = iter_ + 1

    # return regularization parameters and corresponding posterior mean,
    # log marginal likelihood and posterior covariance
    self.alpha_ = alpha_
    self.lambda_ = lambda_
    self.coef_, rmse_ = self._update_coef_(
        X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
    )
    if self.compute_score:
        # compute the log marginal likelihood.
        # The score must be evaluated at the coefficients that belong to
        # the final alpha_/lambda_ (self.coef_); the loop-local `coef_` was
        # computed from the previous hyper-parameters and would not match
        # the freshly computed `rmse_`.
        s = self._log_marginal_likelihood(
            n_samples, n_features, eigen_vals_, alpha_, lambda_, self.coef_, rmse_
        )
        self.scores_.append(s)
        self.scores_ = np.array(self.scores_)

    # posterior covariance is given by 1/alpha_ * scaled_sigma_
    scaled_sigma_ = np.dot(
        Vh.T, Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis]
    )
    self.sigma_ = (1.0 / alpha_) * scaled_sigma_

    self._set_intercept(X_offset_, y_offset_, X_scale_)

    return self
|
||||
|
||||
def predict(self, X, return_std=False):
    """Predict using the linear model.

    Besides the mean of the predictive distribution, its standard
    deviation can optionally be returned.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Samples.

    return_std : bool, default=False
        Whether to return the standard deviation of posterior prediction.

    Returns
    -------
    y_mean : array-like of shape (n_samples,)
        Mean of predictive distribution of query points.

    y_std : array-like of shape (n_samples,)
        Standard deviation of predictive distribution of query points.
        Only returned when `return_std` is truthy.
    """
    mean_prediction = self._decision_function(X)
    if return_std:
        # Row-wise quadratic form x^T sigma_ x: uncertainty coming from the
        # weights, to which the noise variance 1 / alpha_ is added.
        weight_variance = (np.dot(X, self.sigma_) * X).sum(axis=1)
        spread = np.sqrt(weight_variance + (1.0 / self.alpha_))
        return mean_prediction, spread
    return mean_prediction
|
||||
|
||||
def _update_coef_(
    self, X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
):
    """Update posterior mean and compute the corresponding squared error.

    Posterior mean is given by coef_ = scaled_sigma_ * X.T * y where
    scaled_sigma_ = (lambda_/alpha_ * np.eye(n_features)
                     + np.dot(X.T, X))^-1

    The inverse is never formed explicitly: it is applied through the
    factors `U`, `Vh` and `eigen_vals_` supplied by the caller.

    Returns
    -------
    coef_ : ndarray
        Updated posterior mean of the weights.
    rmse_ : float
        Sum of squared residuals ``y - X @ coef_`` (despite the name, no
        mean or square root is taken).
    """
    # Spectrum of X.T @ X shifted by the effective ridge penalty.
    shifted_spectrum = eigen_vals_ + lambda_ / alpha_

    if n_samples > n_features:
        factors = [Vh.T, Vh / shifted_spectrum[:, np.newaxis], XT_y]
    else:
        factors = [X.T, U / shifted_spectrum[None, :], U.T, y]
    coef_ = np.linalg.multi_dot(factors)

    residual = y - np.dot(X, coef_)
    rmse_ = np.sum(residual**2)

    return coef_, rmse_
|
||||
|
||||
def _log_marginal_likelihood(
    self, n_samples, n_features, eigen_vals, alpha_, lambda_, coef, rmse
):
    """Log marginal likelihood.

    Evaluated for the given precisions `alpha_` / `lambda_`, coefficients
    `coef` and squared error `rmse`, using the Gamma-prior hyper-parameters
    stored on the estimator (`alpha_1`, `alpha_2`, `lambda_1`, `lambda_2`).
    """
    alpha_1, alpha_2 = self.alpha_1, self.alpha_2
    lambda_1, lambda_2 = self.lambda_1, self.lambda_2

    # Log-determinant of the posterior covariance
    #   sigma = (lambda_ * np.eye(n_features) + alpha_ * np.dot(X.T, X))^-1
    # obtained from the eigenvalues of its inverse.
    if n_samples > n_features:
        posterior_precisions = lambda_ + alpha_ * eigen_vals
    else:
        # Only the first n_samples eigenvalues are provided; the remaining
        # directions carry the prior precision lambda_ alone.
        posterior_precisions = np.full(
            n_features, lambda_, dtype=np.array(lambda_).dtype
        )
        posterior_precisions[:n_samples] += alpha_ * eigen_vals
    logdet_sigma = -np.sum(np.log(posterior_precisions))

    # Contribution of the Gamma priors over lambda_ and alpha_.
    prior_part = lambda_1 * log(lambda_) - lambda_2 * lambda_
    prior_part += alpha_1 * log(alpha_) - alpha_2 * alpha_

    # Contribution of the Gaussian evidence.
    evidence_part = 0.5 * (
        n_features * log(lambda_)
        + n_samples * log(alpha_)
        - alpha_ * rmse
        - lambda_ * np.sum(coef**2)
        + logdet_sigma
        - n_samples * log(2 * np.pi)
    )

    return prior_part + evidence_part
|
||||
|
||||
|
||||
###############################################################################
|
||||
# ARD (Automatic Relevance Determination) regression
|
||||
|
||||
|
||||
class ARDRegression(RegressorMixin, LinearModel):
|
||||
"""Bayesian ARD regression.
|
||||
|
||||
Fit the weights of a regression model, using an ARD prior. The weights of
|
||||
the regression model are assumed to be in Gaussian distributions.
|
||||
Also estimate the parameters lambda (precisions of the distributions of the
|
||||
weights) and alpha (precision of the distribution of the noise).
|
||||
The estimation is done by an iterative procedure (Evidence Maximization).
|
||||
|
||||
Read more in the :ref:`User Guide <bayesian_regression>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
max_iter : int, default=300
|
||||
Maximum number of iterations.
|
||||
|
||||
.. versionchanged:: 1.3
|
||||
|
||||
tol : float, default=1e-3
|
||||
Stop the algorithm if w has converged.
|
||||
|
||||
alpha_1 : float, default=1e-6
|
||||
Hyper-parameter : shape parameter for the Gamma distribution prior
|
||||
over the alpha parameter.
|
||||
|
||||
alpha_2 : float, default=1e-6
|
||||
Hyper-parameter : inverse scale parameter (rate parameter) for the
|
||||
Gamma distribution prior over the alpha parameter.
|
||||
|
||||
lambda_1 : float, default=1e-6
|
||||
Hyper-parameter : shape parameter for the Gamma distribution prior
|
||||
over the lambda parameter.
|
||||
|
||||
lambda_2 : float, default=1e-6
|
||||
Hyper-parameter : inverse scale parameter (rate parameter) for the
|
||||
Gamma distribution prior over the lambda parameter.
|
||||
|
||||
compute_score : bool, default=False
|
||||
If True, compute the objective function at each step of the model.
|
||||
|
||||
threshold_lambda : float, default=10 000
|
||||
Threshold for removing (pruning) weights with high precision from
|
||||
the computation.
|
||||
|
||||
fit_intercept : bool, default=True
|
||||
Whether to calculate the intercept for this model. If set
|
||||
to false, no intercept will be used in calculations
|
||||
(i.e. data is expected to be centered).
|
||||
|
||||
copy_X : bool, default=True
|
||||
If True, X will be copied; else, it may be overwritten.
|
||||
|
||||
verbose : bool, default=False
|
||||
Verbose mode when fitting the model.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
coef_ : array-like of shape (n_features,)
|
||||
Coefficients of the regression model (mean of distribution)
|
||||
|
||||
alpha_ : float
|
||||
estimated precision of the noise.
|
||||
|
||||
lambda_ : array-like of shape (n_features,)
|
||||
estimated precisions of the weights.
|
||||
|
||||
sigma_ : array-like of shape (n_features, n_features)
|
||||
estimated variance-covariance matrix of the weights
|
||||
|
||||
scores_ : list of float
|
||||
if computed, value of the objective function (to be maximized)
|
||||
|
||||
n_iter_ : int
|
||||
The actual number of iterations to reach the stopping criterion.
|
||||
|
||||
.. versionadded:: 1.3
|
||||
|
||||
intercept_ : float
|
||||
Independent term in decision function. Set to 0.0 if
|
||||
``fit_intercept = False``.
|
||||
|
||||
X_offset_ : ndarray of shape (n_features,)
|
||||
If `fit_intercept=True`, offset subtracted for centering data to a
|
||||
zero mean. Set to np.zeros(n_features) otherwise.
|
||||
|
||||
X_scale_ : ndarray of shape (n_features,)
|
||||
Set to np.ones(n_features).
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
See Also
|
||||
--------
|
||||
BayesianRidge : Bayesian ridge regression.
|
||||
|
||||
Notes
|
||||
-----
|
||||
For an example, see :ref:`examples/linear_model/plot_ard.py
|
||||
<sphx_glr_auto_examples_linear_model_plot_ard.py>`.
|
||||
|
||||
References
|
||||
----------
|
||||
D. J. C. MacKay, Bayesian nonlinear modeling for the prediction
|
||||
competition, ASHRAE Transactions, 1994.
|
||||
|
||||
R. Salakhutdinov, Lecture notes on Statistical Machine Learning,
|
||||
http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf#page=15
|
||||
Their beta is our ``self.alpha_``
|
||||
Their alpha is our ``self.lambda_``
|
||||
ARD is a little different than the slide: only dimensions/features for
|
||||
which ``self.lambda_ < self.threshold_lambda`` are kept and the rest are
|
||||
discarded.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn import linear_model
|
||||
>>> clf = linear_model.ARDRegression()
|
||||
>>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])
|
||||
ARDRegression()
|
||||
>>> clf.predict([[1, 1]])
|
||||
array([1.])
|
||||
"""
|
||||
|
||||
_parameter_constraints: dict = {
|
||||
"max_iter": [Interval(Integral, 1, None, closed="left")],
|
||||
"tol": [Interval(Real, 0, None, closed="left")],
|
||||
"alpha_1": [Interval(Real, 0, None, closed="left")],
|
||||
"alpha_2": [Interval(Real, 0, None, closed="left")],
|
||||
"lambda_1": [Interval(Real, 0, None, closed="left")],
|
||||
"lambda_2": [Interval(Real, 0, None, closed="left")],
|
||||
"compute_score": ["boolean"],
|
||||
"threshold_lambda": [Interval(Real, 0, None, closed="left")],
|
||||
"fit_intercept": ["boolean"],
|
||||
"copy_X": ["boolean"],
|
||||
"verbose": ["verbose"],
|
||||
}
|
||||
|
||||
def __init__(
    self,
    *,
    max_iter=300,
    tol=1.0e-3,
    alpha_1=1.0e-6,
    alpha_2=1.0e-6,
    lambda_1=1.0e-6,
    lambda_2=1.0e-6,
    compute_score=False,
    threshold_lambda=1.0e4,
    fit_intercept=True,
    copy_X=True,
    verbose=False,
):
    """Record the hyper-parameters on the estimator without modifying them.

    All arguments are keyword-only and are stored under attributes of the
    same name; no validation or computation happens here.
    """
    # Stopping criteria of the iterative optimisation.
    self.max_iter, self.tol = max_iter, tol

    # Gamma-prior hyper-parameters for alpha (noise) and lambda (weights).
    self.alpha_1, self.alpha_2 = alpha_1, alpha_2
    self.lambda_1, self.lambda_2 = lambda_1, lambda_2

    # Precision above which a weight is pruned during fitting.
    self.threshold_lambda = threshold_lambda

    # Behavioural switches.
    self.compute_score = compute_score
    self.fit_intercept = fit_intercept
    self.copy_X = copy_X
    self.verbose = verbose
|
||||
|
||||
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y):
    """Fit the model according to the given training data and parameters.

    The evidence is maximized iteratively: the posterior covariance and
    mean of the weights are recomputed, the precisions `alpha_` and
    `lambda_` are re-estimated, and weights whose precision reaches
    `threshold_lambda` are pruned (their coefficient is set to 0).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training vector, where `n_samples` is the number of samples and
        `n_features` is the number of features.
    y : array-like of shape (n_samples,)
        Target values. Will be cast to X's dtype if necessary.

    Returns
    -------
    self : object
        Fitted estimator.
    """
    X, y = self._validate_data(
        X,
        y,
        dtype=[np.float64, np.float32],
        force_writeable=True,
        y_numeric=True,
        ensure_min_samples=2,
    )
    dtype = X.dtype
    n_samples, n_features = X.shape

    # Coefficient buffer; it is updated in place for the whole fit.
    coef_ = np.zeros(n_features, dtype=dtype)

    X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data(
        X, y, fit_intercept=self.fit_intercept, copy=self.copy_X
    )
    self.X_offset_, self.X_scale_ = X_offset_, X_scale_

    alpha_1, alpha_2 = self.alpha_1, self.alpha_2
    lambda_1, lambda_2 = self.lambda_1, self.lambda_2
    verbose = self.verbose

    # Initial precisions. `eps` in the denominator avoids a division by
    # zero when `np.var(y)` is zero; the dtype is set explicitly to avoid
    # unintended type promotion with numpy 2.
    eps = np.finfo(np.float64).eps
    alpha_ = np.asarray(1.0 / (np.var(y) + eps), dtype=dtype)
    lambda_ = np.ones(n_features, dtype=dtype)

    # Mask of the features still taking part in the computation.
    active = np.ones(n_features, dtype=bool)

    self.scores_ = list()
    previous_coef = None

    def refresh_coef(noise_precision, mask, covariance):
        # In-place update of the posterior mean on the active features.
        coef_[mask] = noise_precision * np.linalg.multi_dot(
            [covariance, X[:, mask].T, y]
        )

    # Pick the covariance update depending on the shape of the problem.
    if n_samples >= n_features:
        compute_sigma = self._update_sigma
    else:
        compute_sigma = self._update_sigma_woodbury

    # Iterative procedure of ARDRegression
    for iter_ in range(self.max_iter):
        sigma_ = compute_sigma(X, alpha_, lambda_, active)
        refresh_coef(alpha_, active, sigma_)

        # Re-estimate alpha and lambda from the current fit.
        residual = y - np.dot(X, coef_)
        rmse_ = np.sum(residual**2)
        gamma_ = 1.0 - lambda_[active] * np.diag(sigma_)
        lambda_[active] = (gamma_ + 2.0 * lambda_1) / (
            (coef_[active]) ** 2 + 2.0 * lambda_2
        )
        alpha_ = (n_samples - gamma_.sum() + 2.0 * alpha_1) / (
            rmse_ + 2.0 * alpha_2
        )

        # Prune the weights with a precision over a threshold
        active = lambda_ < self.threshold_lambda
        coef_[~active] = 0

        # Objective function, recorded only on request.
        if self.compute_score:
            s = (lambda_1 * np.log(lambda_) - lambda_2 * lambda_).sum()
            s += alpha_1 * log(alpha_) - alpha_2 * alpha_
            s += 0.5 * (
                fast_logdet(sigma_)
                + n_samples * log(alpha_)
                + np.sum(np.log(lambda_))
            )
            s -= 0.5 * (alpha_ * rmse_ + (lambda_ * coef_**2).sum())
            self.scores_.append(s)

        # Stop once the coefficients no longer move.
        if iter_ > 0 and np.sum(np.abs(previous_coef - coef_)) < self.tol:
            if verbose:
                print("Converged after %s iterations" % iter_)
            break
        previous_coef = np.copy(coef_)

        # Nothing left to estimate once every weight has been pruned.
        if not active.any():
            break

    self.n_iter_ = iter_ + 1

    if active.any():
        # update sigma and mu using updated params from the last iteration
        sigma_ = compute_sigma(X, alpha_, lambda_, active)
        refresh_coef(alpha_, active, sigma_)
    else:
        sigma_ = np.array([]).reshape(0, 0)

    self.coef_ = coef_
    self.alpha_ = alpha_
    self.sigma_ = sigma_
    self.lambda_ = lambda_
    self._set_intercept(X_offset_, y_offset_, X_scale_)
    return self
|
||||
|
||||
def _update_sigma_woodbury(self, X, alpha_, lambda_, keep_lambda):
    """Posterior covariance of the kept weights via the Woodbury identity.

    Used when n_samples < n_features: only a matrix of shape
    (n_samples, n_samples) is (pseudo-)inverted. See
    https://en.wikipedia.org/wiki/Woodbury_matrix_identity and the slides
    referenced in the class docstring.

    Returns an array of shape (n_kept, n_kept), where n_kept is the number
    of True entries in `keep_lambda`.
    """
    X_active = X[:, keep_lambda]
    # Prior variances of the kept weights, as a row vector.
    prior_variance = 1 / lambda_[keep_lambda].reshape(1, -1)
    X_scaled = X_active * prior_variance

    small_inverse = pinvh(
        np.eye(X.shape[0], dtype=X.dtype) / alpha_ + np.dot(X_scaled, X_active.T)
    )
    correction = np.dot(small_inverse, X_scaled)
    sigma_ = -np.dot(prior_variance.reshape(-1, 1) * X_active.T, correction)

    # Add the diagonal prior-variance term of the identity.
    sigma_[np.diag_indices(sigma_.shape[1])] += 1.0 / lambda_[keep_lambda]
    return sigma_
|
||||
|
||||
def _update_sigma(self, X, alpha_, lambda_, keep_lambda):
    """Posterior covariance of the kept weights by direct inversion.

    Used when n_samples >= n_features: a matrix of shape
    (n_kept, n_kept) is (pseudo-)inverted, where n_kept is the number of
    True entries in `keep_lambda`. See the slides referenced in the class
    docstring.
    """
    X_active = X[:, keep_lambda]
    gram = np.dot(X_active.T, X_active)
    identity = np.eye(gram.shape[0], dtype=X.dtype)
    # Posterior precision: prior precisions on the diagonal plus the
    # noise-weighted Gram matrix.
    posterior_precision = lambda_[keep_lambda] * identity + alpha_ * gram
    return pinvh(posterior_precision)
|
||||
|
||||
def predict(self, X, return_std=False):
    """Predict using the linear model.

    In addition to the mean of the predictive distribution, also its
    standard deviation can be returned.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Samples.

    return_std : bool, default=False
        Whether to return the standard deviation of posterior prediction.

    Returns
    -------
    y_mean : array-like of shape (n_samples,)
        Mean of predictive distribution of query points.

    y_std : array-like of shape (n_samples,)
        Standard deviation of predictive distribution of query points.
        Only returned when `return_std` is truthy.
    """
    y_mean = self._decision_function(X)
    # Truthiness test rather than `is False`: an identity check sends falsy
    # non-singleton values such as `np.bool_(False)` or `0` down the
    # standard-deviation branch.
    if not return_std:
        return y_mean

    # sigma_ only covers the features that survived pruning, so X is
    # restricted to the same columns before the quadratic form.
    col_index = self.lambda_ < self.threshold_lambda
    X = _safe_indexing(X, indices=col_index, axis=1)
    sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
    y_std = np.sqrt(sigmas_squared_data + (1.0 / self.alpha_))
    return y_mean, y_std
|
||||
Binary file not shown.
@@ -0,0 +1,961 @@
|
||||
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
|
||||
# Fabian Pedregosa <fabian.pedregosa@inria.fr>
|
||||
# Olivier Grisel <olivier.grisel@ensta.org>
|
||||
# Alexis Mignon <alexis.mignon@gmail.com>
|
||||
# Manoj Kumar <manojkumarsivaraj334@gmail.com>
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
from libc.math cimport fabs
|
||||
import numpy as np
|
||||
|
||||
from cython cimport floating
|
||||
import warnings
|
||||
from ..exceptions import ConvergenceWarning
|
||||
|
||||
from ..utils._cython_blas cimport (
|
||||
_axpy, _dot, _asum, _gemv, _nrm2, _copy, _scal
|
||||
)
|
||||
from ..utils._cython_blas cimport ColMajor, Trans, NoTrans
|
||||
from ..utils._typedefs cimport uint32_t
|
||||
from ..utils._random cimport our_rand_r
|
||||
|
||||
|
||||
# The following two functions are shamelessly copied from the tree code.
|
||||
|
||||
cdef enum:
|
||||
# Max value for our rand_r replacement (`our_rand_r`, cimported above).
|
||||
# We don't use RAND_MAX because it's different across platforms and
|
||||
# particularly tiny on Windows/MSVC.
|
||||
# It corresponds to the maximum representable value for
|
||||
# 32-bit signed integers (i.e. 2^31 - 1).
|
||||
RAND_R_MAX = 2147483647
|
||||
|
||||
|
||||
cdef inline uint32_t rand_int(uint32_t end, uint32_t* random_state) noexcept nogil:
    """Generate a random integer in [0; end)."""
    # The draw from our_rand_r is reduced modulo `end`.
    # NOTE(review): `end` is used as a divisor with no guard here, so callers
    # presumably always pass end > 0 - confirm at the call sites.
    return our_rand_r(random_state) % end
|
||||
|
||||
|
||||
cdef inline floating fmax(floating x, floating y) noexcept nogil:
    # Larger of the two values; `y` is returned whenever `x > y` is false
    # (which includes ties).
    return x if x > y else y
|
||||
|
||||
|
||||
cdef inline floating fsign(floating f) noexcept nogil:
    # Sign of `f`: 1.0 for positive values, 0 for zero, -1.0 for anything
    # else.
    if f > 0:
        return 1.0
    if f == 0:
        return 0
    return -1.0
|
||||
|
||||
|
||||
cdef floating abs_max(int n, const floating* a) noexcept nogil:
    """np.max(np.abs(a))"""
    cdef int idx
    cdef floating candidate
    # The first element seeds the running maximum, so a[0] is always read.
    cdef floating largest = fabs(a[0])
    for idx in range(1, n):
        candidate = fabs(a[idx])
        largest = candidate if candidate > largest else largest
    return largest
|
||||
|
||||
|
||||
cdef floating max(int n, floating* a) noexcept nogil:
    """np.max(a)"""
    # Note: within this module the name shadows Python's builtin `max`.
    cdef int idx
    cdef floating candidate
    # The first element seeds the running maximum, so a[0] is always read.
    cdef floating largest = a[0]
    for idx in range(1, n):
        candidate = a[idx]
        largest = candidate if candidate > largest else largest
    return largest
|
||||
|
||||
|
||||
cdef floating diff_abs_max(int n, const floating* a, floating* b) noexcept nogil:
    """np.max(np.abs(a - b))"""
    cdef int idx
    cdef floating gap
    # The first pair seeds the running maximum, so a[0] and b[0] are always
    # read.
    cdef floating largest = fabs(a[0] - b[0])
    for idx in range(1, n):
        gap = fabs(a[idx] - b[idx])
        largest = gap if gap > largest else largest
    return largest
|
||||
|
||||
|
||||
def enet_coordinate_descent(
    floating[::1] w,
    floating alpha,
    floating beta,
    const floating[::1, :] X,
    const floating[::1] y,
    unsigned int max_iter,
    floating tol,
    object rng,
    bint random=0,
    bint positive=0
):
    """Cython version of the coordinate descent algorithm
        for Elastic-Net regression

        We minimize

        (1/2) * norm(y - X w, 2)^2 + alpha norm(w, 1) + (beta/2) norm(w, 2)^2

    Parameters
    ----------
    w : contiguous memoryview of shape (n_features,)
        Initial coefficients. Used as the starting point and overwritten in
        place with the solution.
    alpha : float
        Weight of the L1 term of the objective above.
    beta : float
        Weight of the squared L2 term of the objective above.
    X : Fortran-contiguous memoryview of shape (n_samples, n_features)
        Design matrix.
    y : contiguous memoryview of shape (n_samples,)
        Target vector.
    max_iter : unsigned int
        Maximum number of passes over the coordinates.
    tol : float
        Used unscaled as the threshold on the relative coefficient change
        ``d_w_max / w_max`` and, multiplied by ``np.dot(y, y)``, as the
        threshold on the duality gap.
    rng : object
        Object exposing ``randint``; called once to seed the internal
        random state used when ``random`` is true.
    random : bint, default=0
        If true, the coordinate updated at each inner step is drawn with
        ``rand_int`` instead of being visited in order.
    positive : bint, default=0
        If true, a coordinate is set to zero whenever its correlation with
        the residual is negative.

    Returns
    -------
    w : ndarray of shape (n_features,)
        ElasticNet coefficients.
    gap : float
        Achieved dual gap.
    tol : float
        Equals input `tol` times `np.dot(y, y)`. The tolerance used for the dual gap.
    n_iter : int
        Number of coordinate descent iterations.
    """

    if floating is float:
        dtype = np.float32
    else:
        dtype = np.float64

    # get the data information into easy vars
    cdef unsigned int n_samples = X.shape[0]
    cdef unsigned int n_features = X.shape[1]

    # compute the squared Euclidean norms of the columns of X
    cdef floating[::1] norm_cols_X = np.square(X).sum(axis=0)

    # initial value of the residuals
    cdef floating[::1] R = np.empty(n_samples, dtype=dtype)
    cdef floating[::1] XtA = np.empty(n_features, dtype=dtype)

    cdef floating tmp
    cdef floating w_ii
    cdef floating d_w_max
    cdef floating w_max
    cdef floating d_w_ii
    # gap starts above tol; this value is what gets returned if the duality
    # gap is never evaluated (the loop below does not run when max_iter == 0)
    cdef floating gap = tol + 1.0
    # keep the unscaled tolerance: `tol` itself is rescaled by y.y below
    cdef floating d_w_tol = tol
    cdef floating dual_norm_XtA
    cdef floating R_norm2
    cdef floating w_norm2
    cdef floating l1_norm
    cdef floating const
    cdef floating A_norm2
    cdef unsigned int ii
    cdef unsigned int n_iter = 0
    cdef unsigned int f_iter
    cdef uint32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX)
    cdef uint32_t* rand_r_state = &rand_r_state_seed

    if alpha == 0 and beta == 0:
        warnings.warn("Coordinate descent with no regularization may lead to "
                      "unexpected results and is discouraged.")

    with nogil:
        # R = y - np.dot(X, w)
        _copy(n_samples, &y[0], 1, &R[0], 1)
        _gemv(ColMajor, NoTrans, n_samples, n_features, -1.0, &X[0, 0],
              n_samples, &w[0], 1, 1.0, &R[0], 1)

        # tol *= np.dot(y, y)
        tol *= _dot(n_samples, &y[0], 1, &y[0], 1)

        for n_iter in range(max_iter):
            w_max = 0.0
            d_w_max = 0.0
            for f_iter in range(n_features):  # Loop over coordinates
                if random:
                    ii = rand_int(n_features, rand_r_state)
                else:
                    ii = f_iter

                # an all-zero column cannot change the residual: skip it
                if norm_cols_X[ii] == 0.0:
                    continue

                w_ii = w[ii]  # Store previous value

                if w_ii != 0.0:
                    # R += w_ii * X[:,ii]
                    _axpy(n_samples, w_ii, &X[0, ii], 1, &R[0], 1)

                # tmp = (X[:,ii]*R).sum()
                tmp = _dot(n_samples, &X[0, ii], 1, &R[0], 1)

                if positive and tmp < 0:
                    w[ii] = 0.0
                else:
                    # soft-thresholding of tmp by alpha, then scaling
                    w[ii] = (fsign(tmp) * fmax(fabs(tmp) - alpha, 0)
                             / (norm_cols_X[ii] + beta))

                if w[ii] != 0.0:
                    # R -= w[ii] * X[:,ii] # Update residual
                    _axpy(n_samples, -w[ii], &X[0, ii], 1, &R[0], 1)

                # update the maximum absolute coefficient update
                d_w_ii = fabs(w[ii] - w_ii)
                d_w_max = fmax(d_w_max, d_w_ii)

                w_max = fmax(w_max, fabs(w[ii]))

            if (
                w_max == 0.0
                or d_w_max / w_max < d_w_tol
                or n_iter == max_iter - 1
            ):
                # the biggest coordinate update of this iteration was smaller
                # than the tolerance (or this is the last allowed iteration):
                # check the duality gap as ultimate stopping criterion

                # XtA = np.dot(X.T, R) - beta * w
                _copy(n_features, &w[0], 1, &XtA[0], 1)
                _gemv(ColMajor, Trans,
                      n_samples, n_features, 1.0, &X[0, 0], n_samples,
                      &R[0], 1,
                      -beta, &XtA[0], 1)

                if positive:
                    dual_norm_XtA = max(n_features, &XtA[0])
                else:
                    dual_norm_XtA = abs_max(n_features, &XtA[0])

                # R_norm2 = np.dot(R, R)
                R_norm2 = _dot(n_samples, &R[0], 1, &R[0], 1)

                # w_norm2 = np.dot(w, w)
                w_norm2 = _dot(n_features, &w[0], 1, &w[0], 1)

                if (dual_norm_XtA > alpha):
                    const = alpha / dual_norm_XtA
                    A_norm2 = R_norm2 * (const ** 2)
                    gap = 0.5 * (R_norm2 + A_norm2)
                else:
                    const = 1.0
                    gap = R_norm2

                l1_norm = _asum(n_features, &w[0], 1)

                # the _dot call below is np.dot(R.T, y)
                gap += (alpha * l1_norm
                        - const * _dot(n_samples, &R[0], 1, &y[0], 1)
                        + 0.5 * beta * (1 + const ** 2) * (w_norm2))

                if gap < tol:
                    # return if we reached desired tolerance
                    break

        else:
            # for/else, runs if for doesn't end with a `break`
            with gil:
                message = (
                    "Objective did not converge. You might want to increase "
                    "the number of iterations, check the scale of the "
                    "features or consider increasing regularisation. "
                    f"Duality gap: {gap:.3e}, tolerance: {tol:.3e}"
                )
                if alpha < np.finfo(np.float64).eps:
                    message += (
                        " Linear regression models with null weight for the "
                        "l1 regularization term are more efficiently fitted "
                        "using one of the solvers implemented in "
                        "sklearn.linear_model.Ridge/RidgeCV instead."
                    )
                warnings.warn(message, ConvergenceWarning)

    # n_iter is the zero-based index of the last pass, hence the + 1
    return np.asarray(w), gap, tol, n_iter + 1
|
||||
|
||||
|
||||
def sparse_enet_coordinate_descent(
    floating[::1] w,
    floating alpha,
    floating beta,
    const floating[::1] X_data,
    const int[::1] X_indices,
    const int[::1] X_indptr,
    const floating[::1] y,
    const floating[::1] sample_weight,
    const floating[::1] X_mean,
    unsigned int max_iter,
    floating tol,
    object rng,
    bint random=0,
    bint positive=0,
):
    """Cython version of the coordinate descent algorithm for Elastic-Net

    We minimize:

        1/2 * norm(y - Z w, 2)^2 + alpha * norm(w, 1) + (beta/2) * norm(w, 2)^2

    where Z = X - X_mean.
    With sample weights sw, this becomes

        1/2 * sum(sw * (y - Z w)^2, axis=0) + alpha * norm(w, 1)
        + (beta/2) * norm(w, 2)^2

    and X_mean is the weighted average of X (per column).

    Parameters
    ----------
    w : contiguous memoryview of shape (n_features,)
        Initial coefficients. Used as the starting point and overwritten in
        place with the solution.
    alpha : float
        Weight of the L1 term of the objective above.
    beta : float
        Weight of the squared L2 term of the objective above.
    X_data, X_indices, X_indptr : contiguous memoryviews
        Sparse representation of X. The entries of feature ``ii`` are
        ``X_data[X_indptr[ii]:X_indptr[ii + 1]]`` and the matching entries of
        ``X_indices`` are their sample (row) indices.
    y : contiguous memoryview of shape (n_samples,)
        Target vector.
    sample_weight : contiguous memoryview of shape (n_samples,) or None
        Per-sample weights; ``None`` selects the unweighted code path.
    X_mean : contiguous memoryview of shape (n_features,)
        Per-column offset subtracted implicitly from X. If every entry is
        zero, no centering is performed.
    max_iter : unsigned int
        Maximum number of passes over the coordinates.
    tol : float
        Used unscaled as the threshold on the relative coefficient change
        ``d_w_max / w_max`` and, after scaling (see Returns), as the
        threshold on the duality gap.
    rng : object
        Object exposing ``randint``; called once to seed the internal
        random state used when ``random`` is true.
    random : bint, default=0
        If true, the coordinate updated at each inner step is drawn with
        ``rand_int`` instead of being visited in order.
    positive : bint, default=0
        If true, a coordinate is set to zero whenever its correlation with
        the residual is negative.

    Returns
    -------
    w : ndarray of shape (n_features,)
        ElasticNet coefficients.
    gap : float
        Achieved dual gap.
    tol : float
        Equals input `tol` times `np.dot(y, y)`, or times
        `np.dot(y, sample_weight * y)` when sample weights are given.
        The tolerance used for the dual gap.
    n_iter : int
        Number of coordinate descent iterations.
    """
    # Notes for sample_weight:
    # For dense X, one centers X and y and then rescales them by sqrt(sample_weight).
    # Here, for sparse X, we get the sample_weight averaged center X_mean. We take care
    # that every calculation results as if we had rescaled y and X (and therefore also
    # X_mean) by sqrt(sample_weight) without actually calculating the square root.
    # We work with:
    #     yw = sample_weight * y
    #     R = sample_weight * residual
    #     norm_cols_X = np.sum(sample_weight * (X - X_mean)**2, axis=0)

    # get the data information into easy vars
    cdef unsigned int n_samples = y.shape[0]
    cdef unsigned int n_features = w.shape[0]

    # compute norms of the columns of X
    cdef unsigned int ii
    cdef floating[:] norm_cols_X

    cdef unsigned int startptr = X_indptr[0]
    cdef unsigned int endptr

    # initial value of the residuals
    # R = y - Zw, weighted version R = sample_weight * (y - Zw)
    cdef floating[::1] R
    cdef floating[::1] XtA
    cdef const floating[::1] yw

    if floating is float:
        dtype = np.float32
    else:
        dtype = np.float64

    norm_cols_X = np.zeros(n_features, dtype=dtype)
    XtA = np.zeros(n_features, dtype=dtype)

    cdef floating tmp
    cdef floating w_ii
    cdef floating d_w_max
    cdef floating w_max
    cdef floating d_w_ii
    cdef floating X_mean_ii
    cdef floating R_sum = 0.0
    cdef floating R_norm2
    cdef floating w_norm2
    cdef floating A_norm2
    cdef floating l1_norm
    cdef floating normalize_sum
    # gap starts above tol; this value is what gets returned if the duality
    # gap is never evaluated (the loop below does not run when max_iter == 0)
    cdef floating gap = tol + 1.0
    # keep the unscaled tolerance: `tol` itself is rescaled below
    cdef floating d_w_tol = tol
    cdef floating dual_norm_XtA
    cdef unsigned int jj
    cdef unsigned int n_iter = 0
    cdef unsigned int f_iter
    cdef uint32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX)
    cdef uint32_t* rand_r_state = &rand_r_state_seed
    cdef bint center = False
    cdef bint no_sample_weights = sample_weight is None
    cdef int kk

    if no_sample_weights:
        yw = y
        R = y.copy()
    else:
        yw = np.multiply(sample_weight, y)
        R = yw.copy()

    with nogil:
        # center = (X_mean != 0).any()
        for ii in range(n_features):
            if X_mean[ii]:
                center = True
                break

        # One pass over the features to fill norm_cols_X and to turn R from
        # (weighted) y into the (weighted) residual for the initial w.
        for ii in range(n_features):
            X_mean_ii = X_mean[ii]
            endptr = X_indptr[ii + 1]
            normalize_sum = 0.0
            w_ii = w[ii]

            if no_sample_weights:
                for jj in range(startptr, endptr):
                    normalize_sum += (X_data[jj] - X_mean_ii) ** 2
                    R[X_indices[jj]] -= X_data[jj] * w_ii
                # the implicit zeros of the column each contribute X_mean_ii ** 2
                norm_cols_X[ii] = normalize_sum + \
                    (n_samples - endptr + startptr) * X_mean_ii ** 2
                if center:
                    for jj in range(n_samples):
                        R[jj] += X_mean_ii * w_ii
            else:
                for jj in range(startptr, endptr):
                    tmp = sample_weight[X_indices[jj]]
                    # second term will be subtracted by loop over range(n_samples)
                    normalize_sum += (tmp * (X_data[jj] - X_mean_ii) ** 2
                                      - tmp * X_mean_ii ** 2)
                    R[X_indices[jj]] -= tmp * X_data[jj] * w_ii
                if center:
                    for jj in range(n_samples):
                        normalize_sum += sample_weight[jj] * X_mean_ii ** 2
                        R[jj] += sample_weight[jj] * X_mean_ii * w_ii
                norm_cols_X[ii] = normalize_sum
            startptr = endptr

        # tol *= np.dot(y, y)
        # with sample weights: tol *= y @ (sw * y)
        tol *= _dot(n_samples, &y[0], 1, &yw[0], 1)

        for n_iter in range(max_iter):

            w_max = 0.0
            d_w_max = 0.0

            for f_iter in range(n_features):  # Loop over coordinates
                if random:
                    ii = rand_int(n_features, rand_r_state)
                else:
                    ii = f_iter

                # a column with zero (weighted, centered) norm is skipped
                if norm_cols_X[ii] == 0.0:
                    continue

                startptr = X_indptr[ii]
                endptr = X_indptr[ii + 1]
                w_ii = w[ii]  # Store previous value
                X_mean_ii = X_mean[ii]

                if w_ii != 0.0:
                    # R += w_ii * X[:,ii]
                    if no_sample_weights:
                        for jj in range(startptr, endptr):
                            R[X_indices[jj]] += X_data[jj] * w_ii
                        if center:
                            for jj in range(n_samples):
                                R[jj] -= X_mean_ii * w_ii
                    else:
                        for jj in range(startptr, endptr):
                            tmp = sample_weight[X_indices[jj]]
                            R[X_indices[jj]] += tmp * X_data[jj] * w_ii
                        if center:
                            for jj in range(n_samples):
                                R[jj] -= sample_weight[jj] * X_mean_ii * w_ii

                # tmp = (X[:,ii] * R).sum()
                tmp = 0.0
                for jj in range(startptr, endptr):
                    tmp += R[X_indices[jj]] * X_data[jj]

                if center:
                    # account for the implicit centering: subtract X_mean_ii * sum(R)
                    R_sum = 0.0
                    for jj in range(n_samples):
                        R_sum += R[jj]
                    tmp -= R_sum * X_mean_ii

                if positive and tmp < 0.0:
                    w[ii] = 0.0
                else:
                    # soft-thresholding of tmp by alpha, then scaling
                    w[ii] = fsign(tmp) * fmax(fabs(tmp) - alpha, 0) \
                            / (norm_cols_X[ii] + beta)

                if w[ii] != 0.0:
                    # R -= w[ii] * X[:,ii] # Update residual
                    if no_sample_weights:
                        for jj in range(startptr, endptr):
                            R[X_indices[jj]] -= X_data[jj] * w[ii]
                        if center:
                            for jj in range(n_samples):
                                R[jj] += X_mean_ii * w[ii]
                    else:
                        for jj in range(startptr, endptr):
                            tmp = sample_weight[X_indices[jj]]
                            R[X_indices[jj]] -= tmp * X_data[jj] * w[ii]
                        if center:
                            for jj in range(n_samples):
                                R[jj] += sample_weight[jj] * X_mean_ii * w[ii]

                # update the maximum absolute coefficient update
                d_w_ii = fabs(w[ii] - w_ii)
                d_w_max = fmax(d_w_max, d_w_ii)

                w_max = fmax(w_max, fabs(w[ii]))

            if w_max == 0.0 or d_w_max / w_max < d_w_tol or n_iter == max_iter - 1:
                # the biggest coordinate update of this iteration was smaller than
                # the tolerance (or this is the last allowed iteration): check the
                # duality gap as ultimate stopping criterion

                # sparse X.T / dense R dot product
                if center:
                    R_sum = 0.0
                    for jj in range(n_samples):
                        R_sum += R[jj]

                # XtA = X.T @ R - beta * w
                for ii in range(n_features):
                    XtA[ii] = 0.0
                    for kk in range(X_indptr[ii], X_indptr[ii + 1]):
                        XtA[ii] += X_data[kk] * R[X_indices[kk]]

                    if center:
                        XtA[ii] -= X_mean[ii] * R_sum
                    XtA[ii] -= beta * w[ii]

                if positive:
                    dual_norm_XtA = max(n_features, &XtA[0])
                else:
                    dual_norm_XtA = abs_max(n_features, &XtA[0])

                # R_norm2 = np.dot(R, R)
                if no_sample_weights:
                    R_norm2 = _dot(n_samples, &R[0], 1, &R[0], 1)
                else:
                    R_norm2 = 0.0
                    for jj in range(n_samples):
                        # R is already multiplied by sample_weight, so divide
                        # once; zero-weight samples are skipped to avoid 0 / 0
                        if sample_weight[jj] != 0:
                            R_norm2 += (R[jj] ** 2) / sample_weight[jj]

                # w_norm2 = np.dot(w, w)
                w_norm2 = _dot(n_features, &w[0], 1, &w[0], 1)
                if (dual_norm_XtA > alpha):
                    const = alpha / dual_norm_XtA
                    A_norm2 = R_norm2 * const**2
                    gap = 0.5 * (R_norm2 + A_norm2)
                else:
                    const = 1.0
                    gap = R_norm2

                l1_norm = _asum(n_features, &w[0], 1)

                gap += (alpha * l1_norm - const * _dot(
                            n_samples,
                            &R[0], 1,
                            &y[0], 1
                            )
                        + 0.5 * beta * (1 + const ** 2) * w_norm2)

                if gap < tol:
                    # return if we reached desired tolerance
                    break

        else:
            # for/else, runs if for doesn't end with a `break`
            with gil:
                warnings.warn("Objective did not converge. You might want to "
                              "increase the number of iterations. Duality "
                              "gap: {}, tolerance: {}".format(gap, tol),
                              ConvergenceWarning)

    # n_iter is the zero-based index of the last pass, hence the + 1
    return np.asarray(w), gap, tol, n_iter + 1
|
||||
|
||||
|
||||
def enet_coordinate_descent_gram(
    floating[::1] w,
    floating alpha,
    floating beta,
    const floating[:, ::1] Q,
    const floating[::1] q,
    const floating[:] y,
    unsigned int max_iter,
    floating tol,
    object rng,
    bint random=0,
    bint positive=0
):
    """Cython version of the coordinate descent algorithm
        for Elastic-Net regression

        We minimize

        (1/2) * w^T Q w - q^T w + alpha norm(w, 1) + (beta/2) * norm(w, 2)^2

        which amounts to the Elastic-Net problem when:
        Q = X^T X (Gram matrix)
        q = X^T y

    Parameters
    ----------
    w : contiguous memoryview of shape (n_features,)
        Initial coefficients. Used as the starting point and overwritten in
        place with the solution.
    alpha : float
        Weight of the L1 term of the objective above.
    beta : float
        Weight of the squared L2 term of the objective above.
    Q : C-contiguous memoryview of shape (n_features, n_features)
        Gram matrix. Row ``ii`` is addressed as ``Q_ptr + ii * n_features``.
    q : contiguous memoryview of shape (n_features,)
        The vector ``X^T y``.
    y : memoryview
        Target vector; only ``np.dot(y, y)`` is used.
    max_iter : unsigned int
        Maximum number of passes over the coordinates.
    tol : float
        Used unscaled as the threshold on the relative coefficient change
        ``d_w_max / w_max`` and, multiplied by ``np.dot(y, y)``, as the
        threshold on the duality gap.
    rng : object
        Object exposing ``randint``; called once to seed the internal
        random state used when ``random`` is true.
    random : bint, default=0
        If true, the coordinate updated at each inner step is drawn with
        ``rand_int`` instead of being visited in order.
    positive : bint, default=0
        If true, a coordinate is set to zero whenever ``q[ii] - H[ii]`` is
        negative.

    Returns
    -------
    w : ndarray of shape (n_features,)
        ElasticNet coefficients.
    gap : float
        Achieved dual gap.
    tol : float
        Equals input `tol` times `np.dot(y, y)`. The tolerance used for the dual gap.
    n_iter : int
        Number of coordinate descent iterations.
    """

    if floating is float:
        dtype = np.float32
    else:
        dtype = np.float64

    # get the data information into easy vars
    cdef unsigned int n_features = Q.shape[0]

    # initial value "Q w" which will be kept up to date in the iterations
    cdef floating[:] H = np.dot(Q, w)

    cdef floating[:] XtA = np.zeros(n_features, dtype=dtype)
    cdef floating tmp
    cdef floating w_ii
    cdef floating d_w_max
    cdef floating w_max
    cdef floating d_w_ii
    cdef floating q_dot_w
    cdef floating w_norm2
    # gap starts above tol; this value is what gets returned if the duality
    # gap is never evaluated (the loop below does not run when max_iter == 0)
    cdef floating gap = tol + 1.0
    # keep the unscaled tolerance: `tol` itself is rescaled by y.y below
    cdef floating d_w_tol = tol
    cdef floating dual_norm_XtA
    cdef unsigned int ii
    cdef unsigned int n_iter = 0
    cdef unsigned int f_iter
    cdef uint32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX)
    cdef uint32_t* rand_r_state = &rand_r_state_seed

    cdef floating y_norm2 = np.dot(y, y)
    # raw pointers handed to the BLAS-style helpers (_axpy, _dot) below
    cdef floating* w_ptr = &w[0]
    cdef const floating* Q_ptr = &Q[0, 0]
    cdef const floating* q_ptr = &q[0]
    cdef floating* H_ptr = &H[0]
    cdef floating* XtA_ptr = &XtA[0]
    tol = tol * y_norm2

    if alpha == 0:
        warnings.warn(
            "Coordinate descent without L1 regularization may "
            "lead to unexpected results and is discouraged. "
            "Set l1_ratio > 0 to add L1 regularization."
        )

    with nogil:
        for n_iter in range(max_iter):
            w_max = 0.0
            d_w_max = 0.0
            for f_iter in range(n_features):  # Loop over coordinates
                if random:
                    ii = rand_int(n_features, rand_r_state)
                else:
                    ii = f_iter

                # a zero diagonal entry means the coordinate is skipped
                if Q[ii, ii] == 0.0:
                    continue

                w_ii = w[ii]  # Store previous value

                if w_ii != 0.0:
                    # H -= w_ii * Q[ii]
                    _axpy(n_features, -w_ii, Q_ptr + ii * n_features, 1,
                          H_ptr, 1)

                tmp = q[ii] - H[ii]

                if positive and tmp < 0:
                    w[ii] = 0.0
                else:
                    # soft-thresholding of tmp by alpha, then scaling
                    w[ii] = fsign(tmp) * fmax(fabs(tmp) - alpha, 0) \
                        / (Q[ii, ii] + beta)

                if w[ii] != 0.0:
                    # H += w[ii] * Q[ii] # Update H = X.T X w
                    _axpy(n_features, w[ii], Q_ptr + ii * n_features, 1,
                          H_ptr, 1)

                # update the maximum absolute coefficient update
                d_w_ii = fabs(w[ii] - w_ii)
                if d_w_ii > d_w_max:
                    d_w_max = d_w_ii

                if fabs(w[ii]) > w_max:
                    w_max = fabs(w[ii])

            if w_max == 0.0 or d_w_max / w_max < d_w_tol or n_iter == max_iter - 1:
                # the biggest coordinate update of this iteration was smaller than
                # the tolerance (or this is the last allowed iteration): check the
                # duality gap as ultimate stopping criterion

                # q_dot_w = np.dot(w, q)
                q_dot_w = _dot(n_features, w_ptr, 1, q_ptr, 1)

                for ii in range(n_features):
                    XtA[ii] = q[ii] - H[ii] - beta * w[ii]
                if positive:
                    dual_norm_XtA = max(n_features, XtA_ptr)
                else:
                    dual_norm_XtA = abs_max(n_features, XtA_ptr)

                # tmp = np.sum(w * H)
                tmp = 0.0
                for ii in range(n_features):
                    tmp += w[ii] * H[ii]
                # squared residual norm expressed through y.y, w.H and q.w only
                R_norm2 = y_norm2 + tmp - 2.0 * q_dot_w

                # w_norm2 = np.dot(w, w)
                w_norm2 = _dot(n_features, &w[0], 1, &w[0], 1)

                if (dual_norm_XtA > alpha):
                    const = alpha / dual_norm_XtA
                    A_norm2 = R_norm2 * (const ** 2)
                    gap = 0.5 * (R_norm2 + A_norm2)
                else:
                    const = 1.0
                    gap = R_norm2

                # The call to asum is equivalent to the L1 norm of w
                gap += (
                    alpha * _asum(n_features, &w[0], 1)
                    - const * y_norm2
                    + const * q_dot_w
                    + 0.5 * beta * (1 + const ** 2) * w_norm2
                )

                if gap < tol:
                    # return if we reached desired tolerance
                    break

        else:
            # for/else, runs if for doesn't end with a `break`
            with gil:
                warnings.warn("Objective did not converge. You might want to "
                              "increase the number of iterations. Duality "
                              "gap: {}, tolerance: {}".format(gap, tol),
                              ConvergenceWarning)

    # n_iter is the zero-based index of the last pass, hence the + 1
    return np.asarray(w), gap, tol, n_iter + 1
|
||||
|
||||
|
||||
def enet_coordinate_descent_multi_task(
    const floating[::1, :] W,
    floating l1_reg,
    floating l2_reg,
    const floating[::1, :] X,
    const floating[::1, :] Y,
    unsigned int max_iter,
    floating tol,
    object rng,
    bint random=0
):
    """Cython version of the coordinate descent algorithm
        for Elastic-Net multi-task regression

        We minimize

        0.5 * norm(Y - X W.T, 2)^2 + l1_reg ||W.T||_21 + 0.5 * l2_reg norm(W.T, 2)^2

    Parameters
    ----------
    W : Fortran-contiguous memoryview of shape (n_tasks, n_features)
        Initial coefficients. Used as the starting point and overwritten in
        place with the solution (see the review note in the body about the
        ``const`` qualifier).
    l1_reg : float
        Weight of the L21 term of the objective above.
    l2_reg : float
        Weight of the squared L2 term of the objective above.
    X : Fortran-contiguous memoryview of shape (n_samples, n_features)
        Design matrix.
    Y : Fortran-contiguous memoryview of shape (n_samples, n_tasks)
        Target matrix.
    max_iter : unsigned int
        Maximum number of passes over the features.
    tol : float
        Used unscaled as the threshold on the relative coefficient change
        ``d_w_max / w_max`` and, after scaling (see Returns), as the
        threshold on the duality gap.
    rng : object
        Object exposing ``randint``; called once to seed the internal
        random state used when ``random`` is true.
    random : bint, default=0
        If true, the feature updated at each inner step is drawn with
        ``rand_int`` instead of being visited in order.

    Returns
    -------
    W : ndarray of shape (n_tasks, n_features)
        ElasticNet coefficients.
    gap : float
        Achieved dual gap.
    tol : float
        Equals input `tol` times the squared Frobenius norm of `Y`.
        The tolerance used for the dual gap.
    n_iter : int
        Number of coordinate descent iterations.
    """

    if floating is float:
        dtype = np.float32
    else:
        dtype = np.float64

    # get the data information into easy vars
    cdef unsigned int n_samples = X.shape[0]
    cdef unsigned int n_features = X.shape[1]
    cdef unsigned int n_tasks = Y.shape[1]

    # to store XtA
    cdef floating[:, ::1] XtA = np.zeros((n_features, n_tasks), dtype=dtype)
    cdef floating XtA_axis1norm
    cdef floating dual_norm_XtA

    # initial value of the residuals
    cdef floating[::1, :] R = np.zeros((n_samples, n_tasks), dtype=dtype, order='F')

    cdef floating[::1] norm_cols_X = np.zeros(n_features, dtype=dtype)
    cdef floating[::1] tmp = np.zeros(n_tasks, dtype=dtype)
    cdef floating[::1] w_ii = np.zeros(n_tasks, dtype=dtype)
    cdef floating d_w_max
    cdef floating w_max
    cdef floating d_w_ii
    cdef floating nn
    cdef floating W_ii_abs_max
    # gap starts above tol; this value is what gets returned if the duality
    # gap is never evaluated (the loop below does not run when max_iter == 0)
    cdef floating gap = tol + 1.0
    # keep the unscaled tolerance: `tol` itself is rescaled below
    cdef floating d_w_tol = tol
    cdef floating R_norm
    cdef floating w_norm
    cdef floating ry_sum
    cdef floating l21_norm
    cdef unsigned int ii
    cdef unsigned int jj
    cdef unsigned int n_iter = 0
    cdef unsigned int f_iter
    cdef uint32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX)
    cdef uint32_t* rand_r_state = &rand_r_state_seed

    # column ii of X starts at X_ptr + ii * n_samples (Fortran layout)
    cdef const floating* X_ptr = &X[0, 0]
    cdef const floating* Y_ptr = &Y[0, 0]

    if l1_reg == 0:
        warnings.warn(
            "Coordinate descent with l1_reg=0 may lead to unexpected"
            " results and is discouraged."
        )

    with nogil:
        # norm_cols_X = (np.asarray(X) ** 2).sum(axis=0)
        for ii in range(n_features):
            norm_cols_X[ii] = _nrm2(n_samples, X_ptr + ii * n_samples, 1) ** 2

        # R = Y - np.dot(X, W.T)
        _copy(n_samples * n_tasks, Y_ptr, 1, &R[0, 0], 1)
        for ii in range(n_features):
            for jj in range(n_tasks):
                if W[jj, ii] != 0:
                    _axpy(n_samples, -W[jj, ii], X_ptr + ii * n_samples, 1,
                          &R[0, jj], 1)

        # tol = tol * linalg.norm(Y, ord='fro') ** 2
        tol = tol * _nrm2(n_samples * n_tasks, Y_ptr, 1) ** 2

        for n_iter in range(max_iter):
            w_max = 0.0
            d_w_max = 0.0
            for f_iter in range(n_features):  # Loop over coordinates
                if random:
                    ii = rand_int(n_features, rand_r_state)
                else:
                    ii = f_iter

                # an all-zero column cannot change the residual: skip it
                if norm_cols_X[ii] == 0.0:
                    continue

                # w_ii = W[:, ii] # Store previous value
                _copy(n_tasks, &W[0, ii], 1, &w_ii[0], 1)

                # Using Numpy:
                # R += np.dot(X[:, ii][:, None], w_ii[None, :]) # rank 1 update
                # Using Blas Level2:
                # _ger(RowMajor, n_samples, n_tasks, 1.0,
                #      &X[0, ii], 1,
                #      &w_ii[0], 1, &R[0, 0], n_tasks)
                # Using Blas Level1 and for loop to avoid slower threads
                # for such small vectors
                for jj in range(n_tasks):
                    if w_ii[jj] != 0:
                        _axpy(n_samples, w_ii[jj], X_ptr + ii * n_samples, 1,
                              &R[0, jj], 1)

                # Using numpy:
                # tmp = np.dot(X[:, ii][None, :], R).ravel()
                # Using BLAS Level 2:
                # _gemv(RowMajor, Trans, n_samples, n_tasks, 1.0, &R[0, 0],
                #       n_tasks, &X[0, ii], 1, 0.0, &tmp[0], 1)
                # Using BLAS Level 1 (faster for small vectors like here):
                for jj in range(n_tasks):
                    tmp[jj] = _dot(n_samples, X_ptr + ii * n_samples, 1,
                                   &R[0, jj], 1)

                # nn = sqrt(np.sum(tmp ** 2))
                nn = _nrm2(n_tasks, &tmp[0], 1)

                # W[:, ii] = tmp * fmax(1. - l1_reg / nn, 0) / (norm_cols_X[ii] + l2_reg)
                # NOTE(review): W is declared `const` in the signature, yet
                # it is written here through the pointer &W[0, ii] -- confirm
                # that the qualifier is intended.
                _copy(n_tasks, &tmp[0], 1, &W[0, ii], 1)
                _scal(n_tasks, fmax(1. - l1_reg / nn, 0) / (norm_cols_X[ii] + l2_reg),
                      &W[0, ii], 1)

                # Using numpy:
                # R -= np.dot(X[:, ii][:, None], W[:, ii][None, :])
                # Using BLAS Level 2:
                # Update residual : rank 1 update
                # _ger(RowMajor, n_samples, n_tasks, -1.0,
                #      &X[0, ii], 1, &W[0, ii], 1,
                #      &R[0, 0], n_tasks)
                # Using BLAS Level 1 (faster for small vectors like here):
                for jj in range(n_tasks):
                    if W[jj, ii] != 0:
                        _axpy(n_samples, -W[jj, ii], X_ptr + ii * n_samples, 1,
                              &R[0, jj], 1)

                # update the maximum absolute coefficient update
                d_w_ii = diff_abs_max(n_tasks, &W[0, ii], &w_ii[0])

                if d_w_ii > d_w_max:
                    d_w_max = d_w_ii

                W_ii_abs_max = abs_max(n_tasks, &W[0, ii])
                if W_ii_abs_max > w_max:
                    w_max = W_ii_abs_max

            if w_max == 0.0 or d_w_max / w_max < d_w_tol or n_iter == max_iter - 1:
                # the biggest coordinate update of this iteration was smaller than
                # the tolerance (or this is the last allowed iteration): check the
                # duality gap as ultimate stopping criterion

                # XtA = np.dot(X.T, R) - l2_reg * W.T
                for ii in range(n_features):
                    for jj in range(n_tasks):
                        XtA[ii, jj] = _dot(
                            n_samples, X_ptr + ii * n_samples, 1, &R[0, jj], 1
                        ) - l2_reg * W[jj, ii]

                # dual_norm_XtA = np.max(np.sqrt(np.sum(XtA ** 2, axis=1)))
                dual_norm_XtA = 0.0
                for ii in range(n_features):
                    # np.sqrt(np.sum(XtA ** 2, axis=1))
                    XtA_axis1norm = _nrm2(n_tasks, &XtA[ii, 0], 1)
                    if XtA_axis1norm > dual_norm_XtA:
                        dual_norm_XtA = XtA_axis1norm

                # TODO: use squared L2 norm directly
                # R_norm = linalg.norm(R, ord='fro')
                # w_norm = linalg.norm(W, ord='fro')
                R_norm = _nrm2(n_samples * n_tasks, &R[0, 0], 1)
                w_norm = _nrm2(n_features * n_tasks, &W[0, 0], 1)
                if (dual_norm_XtA > l1_reg):
                    const = l1_reg / dual_norm_XtA
                    A_norm = R_norm * const
                    gap = 0.5 * (R_norm ** 2 + A_norm ** 2)
                else:
                    const = 1.0
                    gap = R_norm ** 2

                # ry_sum = np.sum(R * y)
                ry_sum = _dot(n_samples * n_tasks, &R[0, 0], 1, &Y[0, 0], 1)

                # l21_norm = np.sqrt(np.sum(W ** 2, axis=0)).sum()
                l21_norm = 0.0
                for ii in range(n_features):
                    l21_norm += _nrm2(n_tasks, &W[0, ii], 1)

                gap += (
                    l1_reg * l21_norm
                    - const * ry_sum
                    + 0.5 * l2_reg * (1 + const ** 2) * (w_norm ** 2)
                )

                if gap < tol:
                    # return if we reached desired tolerance
                    break
        else:
            # for/else, runs if for doesn't end with a `break`
            with gil:
                warnings.warn("Objective did not converge. You might want to "
                              "increase the number of iterations. Duality "
                              "gap: {}, tolerance: {}".format(gap, tol),
                              ConvergenceWarning)

    # n_iter is the zero-based index of the last pass, hence the + 1
    return np.asarray(W), gap, tol, n_iter + 1
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,15 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
from .glm import (
|
||||
GammaRegressor,
|
||||
PoissonRegressor,
|
||||
TweedieRegressor,
|
||||
_GeneralizedLinearRegressor,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"_GeneralizedLinearRegressor",
|
||||
"PoissonRegressor",
|
||||
"GammaRegressor",
|
||||
"TweedieRegressor",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,523 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
"""
|
||||
Newton solver for Generalized Linear Models
|
||||
"""
|
||||
|
||||
import warnings
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
import numpy as np
|
||||
import scipy.linalg
|
||||
import scipy.optimize
|
||||
|
||||
from ..._loss.loss import HalfSquaredError
|
||||
from ...exceptions import ConvergenceWarning
|
||||
from ...utils.optimize import _check_optimize_result
|
||||
from .._linear_loss import LinearModelLoss
|
||||
|
||||
|
||||
class NewtonSolver(ABC):
|
||||
"""Newton solver for GLMs.
|
||||
|
||||
This class implements Newton/2nd-order optimization routines for GLMs. Each Newton
|
||||
iteration aims at finding the Newton step which is done by the inner solver. With
|
||||
Hessian H, gradient g and coefficients coef, one step solves:
|
||||
|
||||
H @ coef_newton = -g
|
||||
|
||||
For our GLM / LinearModelLoss, we have gradient g and Hessian H:
|
||||
|
||||
g = X.T @ loss.gradient + l2_reg_strength * coef
|
||||
H = X.T @ diag(loss.hessian) @ X + l2_reg_strength * identity
|
||||
|
||||
Backtracking line search updates coef = coef_old + t * coef_newton for some t in
|
||||
(0, 1].
|
||||
|
||||
This is a base class, actual implementations (child classes) may deviate from the
|
||||
above pattern and use structure specific tricks.
|
||||
|
||||
Usage pattern:
|
||||
- initialize solver: sol = NewtonSolver(...)
|
||||
- solve the problem: sol.solve(X, y, sample_weight)
|
||||
|
||||
References
|
||||
----------
|
||||
- Jorge Nocedal, Stephen J. Wright. (2006) "Numerical Optimization"
|
||||
2nd edition
|
||||
https://doi.org/10.1007/978-0-387-40065-5
|
||||
|
||||
- Stephen P. Boyd, Lieven Vandenberghe. (2004) "Convex Optimization."
|
||||
Cambridge University Press, 2004.
|
||||
https://web.stanford.edu/~boyd/cvxbook/bv_cvxbook.pdf
|
||||
|
||||
Parameters
|
||||
----------
|
||||
coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
|
||||
Initial coefficients of a linear model.
|
||||
If shape (n_classes * n_dof,), the classes of one feature are contiguous,
|
||||
i.e. one reconstructs the 2d-array via
|
||||
coef.reshape((n_classes, -1), order="F").
|
||||
|
||||
linear_loss : LinearModelLoss
|
||||
The loss to be minimized.
|
||||
|
||||
l2_reg_strength : float, default=0.0
|
||||
L2 regularization strength.
|
||||
|
||||
tol : float, default=1e-4
|
||||
The optimization problem is solved when each of the following conditions is
|
||||
fulfilled:
|
||||
1. maximum |gradient| <= tol
|
||||
2. Newton decrement d: 1/2 * d^2 <= tol
|
||||
|
||||
max_iter : int, default=100
|
||||
Maximum number of Newton steps allowed.
|
||||
|
||||
n_threads : int, default=1
|
||||
Number of OpenMP threads to use for the computation of the Hessian and gradient
|
||||
of the loss function.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
coef_old : ndarray of shape coef.shape
|
||||
Coefficient of previous iteration.
|
||||
|
||||
coef_newton : ndarray of shape coef.shape
|
||||
Newton step.
|
||||
|
||||
gradient : ndarray of shape coef.shape
|
||||
Gradient of the loss w.r.t. the coefficients.
|
||||
|
||||
gradient_old : ndarray of shape coef.shape
|
||||
Gradient of previous iteration.
|
||||
|
||||
loss_value : float
|
||||
Value of objective function = loss + penalty.
|
||||
|
||||
loss_value_old : float
|
||||
Value of objective function of previous iteration.
|
||||
|
||||
raw_prediction : ndarray of shape (n_samples,) or (n_samples, n_classes)
|
||||
|
||||
converged : bool
|
||||
Indicator for convergence of the solver.
|
||||
|
||||
iteration : int
|
||||
Number of Newton steps, i.e. calls to inner_solve
|
||||
|
||||
use_fallback_lbfgs_solve : bool
|
||||
If set to True, the solver will resort to call LBFGS to finish the optimisation
|
||||
procedure in case of convergence issues.
|
||||
|
||||
gradient_times_newton : float
|
||||
gradient @ coef_newton, set in inner_solve and used by line_search. If the
|
||||
Newton step is a descent direction, this is negative.
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    *,
    coef,
    linear_loss=LinearModelLoss(base_loss=HalfSquaredError(), fit_intercept=True),
    l2_reg_strength=0.0,
    tol=1e-4,
    max_iter=100,
    n_threads=1,
    verbose=0,
):
    """Record the solver configuration.

    Nothing is computed here; all arguments are stored unchanged under
    attributes of the same name.
    """
    # Problem definition: start values, objective and penalty.
    self.coef = coef
    self.linear_loss = linear_loss
    self.l2_reg_strength = l2_reg_strength

    # Stopping rules.
    self.tol = tol
    self.max_iter = max_iter

    # Runtime behaviour.
    self.n_threads = n_threads
    self.verbose = verbose
|
||||
|
||||
def setup(self, X, y, sample_weight):
|
||||
"""Precomputations
|
||||
|
||||
If None, initializes:
|
||||
- self.coef
|
||||
Sets:
|
||||
- self.raw_prediction
|
||||
- self.loss_value
|
||||
"""
|
||||
_, _, self.raw_prediction = self.linear_loss.weight_intercept_raw(self.coef, X)
|
||||
self.loss_value = self.linear_loss.loss(
|
||||
coef=self.coef,
|
||||
X=X,
|
||||
y=y,
|
||||
sample_weight=sample_weight,
|
||||
l2_reg_strength=self.l2_reg_strength,
|
||||
n_threads=self.n_threads,
|
||||
raw_prediction=self.raw_prediction,
|
||||
)
|
||||
|
||||
@abstractmethod
def update_gradient_hessian(self, X, y, sample_weight):
    """Refresh gradient and Hessian for the current coefficients.

    Abstract hook: concrete solvers must implement it. ``solve`` calls it as
    the first step of every Newton iteration.
    """
|
||||
|
||||
@abstractmethod
def inner_solve(self, X, y, sample_weight):
    """Compute the Newton step.

    Abstract hook: concrete solvers must implement it. ``solve`` calls it
    right after ``update_gradient_hessian``.

    Sets:
        - self.coef_newton
        - self.gradient_times_newton
    """
|
||||
|
||||
def fallback_lbfgs_solve(self, X, y, sample_weight):
    """Fallback solver in case of emergency.

    If a solver detects convergence problems, it may fall back to this method
    in the hope to exit with success instead of raising an error.

    L-BFGS-B is started from the current ``self.coef``. It is granted only the
    part of the ``max_iter`` budget that the Newton iterations have not used,
    and the iterations it performs are added to ``self.iteration`` so that the
    iteration count read after ``solve`` covers the whole optimisation.

    This method expects ``self.iteration`` to exist, i.e. to be called from
    ``solve`` (or after it initialised the counter).

    Sets:
        - self.coef
        - self.converged
        - self.iteration
        - self.n_iter_ (number of L-BFGS iterations only)
    """
    # Budget left for L-BFGS; the Newton iteration that triggered the fallback
    # counts as used.
    remaining_iter = self.max_iter - self.iteration

    opt_res = scipy.optimize.minimize(
        self.linear_loss.loss_gradient,
        self.coef,
        method="L-BFGS-B",
        jac=True,  # loss_gradient returns (loss, gradient)
        options={
            "maxiter": remaining_iter,
            "maxls": 50,  # default is 20
            "iprint": self.verbose - 1,
            "gtol": self.tol,
            "ftol": 64 * np.finfo(np.float64).eps,
        },
        args=(X, y, sample_weight, self.l2_reg_strength, self.n_threads),
    )
    n_lbfgs_iter = _check_optimize_result(
        "lbfgs", opt_res, max_iter=remaining_iter
    )

    # Kept for backward compatibility with code that read this attribute.
    self.n_iter_ = n_lbfgs_iter
    # Make the L-BFGS work visible through the counter that callers read.
    self.iteration += n_lbfgs_iter

    self.coef = opt_res.x
    # scipy reports status 0 on successful termination.
    self.converged = opt_res.status == 0
|
||||
|
||||
def line_search(self, X, y, sample_weight):
    """Backtracking line search along the Newton direction.

    Starting with step size 1, the step is halved until one of the acceptance
    tests passes, for at most 21 trials. If none passes, a ConvergenceWarning
    is issued, ``self.use_fallback_lbfgs_solve`` is set to True and
    ``self.raw_prediction`` is left untouched.

    Sets:
        - self.coef_old
        - self.coef
        - self.loss_value_old
        - self.loss_value
        - self.gradient_old
        - self.gradient
        - self.raw_prediction
    """
    # Line search parameters.
    shrink = 0.5  # factor applied to the step size after a rejected trial
    sigma = 0.00048828125  # 1/2**11, slope fraction of the Armijo condition
    eps = 16 * np.finfo(self.loss_value.dtype).eps
    step = 1  # step size

    # gradient_times_newton = self.gradient @ self.coef_newton
    # was computed in inner_solve.
    armijo_term = sigma * self.gradient_times_newton
    _, _, raw_prediction_newton = self.linear_loss.weight_intercept_raw(
        self.coef_newton, X
    )

    # Remember the starting point of this search.
    self.coef_old = self.coef
    self.loss_value_old = self.loss_value
    self.gradient_old = self.gradient

    # Lazily computed L1 norm of the old gradient; negative means "not yet".
    sum_abs_grad_old = -1

    is_verbose = self.verbose >= 2
    if is_verbose:
        print(" Backtracking Line Search")
        print(f" eps=16 * finfo.eps={eps}")

    for attempt in range(21):  # until and including step = shrink**20 ~ 1e-6
        self.coef = self.coef_old + step * self.coef_newton
        raw = self.raw_prediction + step * raw_prediction_newton
        self.loss_value, self.gradient = self.linear_loss.loss_gradient(
            coef=self.coef,
            X=X,
            y=y,
            sample_weight=sample_weight,
            l2_reg_strength=self.l2_reg_strength,
            n_threads=self.n_threads,
            raw_prediction=raw,
        )
        # Note: If coef_newton is too large, loss_gradient may produce inf
        # values, potentially accompanied by a RuntimeWarning.
        # This case will be captured by the Armijo condition.

        # 1. Armijo / sufficient decrease condition.
        # The smaller (more negative) the improvement, the better.
        loss_improvement = self.loss_value - self.loss_value_old
        armijo_bound = step * armijo_term
        armijo_ok = loss_improvement <= armijo_bound
        if is_verbose:
            print(
                f" line search iteration={attempt + 1}, step size={step}\n"
                f" check loss improvement <= armijo term: {loss_improvement} "
                f"<= {armijo_bound} {armijo_ok}"
            )
        if armijo_ok:
            break

        # 2. Relative loss differences around machine precision.
        tiny_loss = np.abs(self.loss_value_old * eps)
        abs_improvement = np.abs(loss_improvement)
        loss_is_flat = abs_improvement <= tiny_loss
        if is_verbose:
            print(
                " check loss |improvement| <= eps * |loss_old|:"
                f" {abs_improvement} <= {tiny_loss} {loss_is_flat}"
            )
        if loss_is_flat:
            if sum_abs_grad_old < 0:
                sum_abs_grad_old = scipy.linalg.norm(self.gradient_old, ord=1)
            # 2.1 Sum of absolute gradients as alternative condition.
            sum_abs_grad = scipy.linalg.norm(self.gradient, ord=1)
            grad_shrunk = sum_abs_grad < sum_abs_grad_old
            if is_verbose:
                print(
                    " check sum(|gradient|) < sum(|gradient_old|): "
                    f"{sum_abs_grad} < {sum_abs_grad_old} {grad_shrunk}"
                )
            if grad_shrunk:
                break

        step *= shrink
    else:
        # No trial was accepted: hand over to the fallback solver.
        warnings.warn(
            (
                f"Line search of Newton solver {self.__class__.__name__} at"
                f" iteration #{self.iteration} did no converge after 21 line search"
                " refinement iterations. It will now resort to lbfgs instead."
            ),
            ConvergenceWarning,
        )
        if self.verbose:
            print(" Line search did not converge and resorts to lbfgs instead.")
        self.use_fallback_lbfgs_solve = True
        return

    # Accepted: keep the raw prediction that belongs to the new coefficients.
    self.raw_prediction = raw
|
||||
|
||||
def check_convergence(self, X, y, sample_weight):
    """Check for convergence.

    ``self.converged`` is set to True only when both criteria hold; otherwise
    the method returns early and leaves the attribute untouched.
    """
    if self.verbose:
        print(" Check Convergence")
    # Note: The maximum relative change of the coefficients is deliberately
    # not used as criterion, because even a large step could have brought us
    # close to the true minimum.

    # 1. Criterion: maximum |gradient| <= tol
    # The gradient was already updated in line_search().
    max_abs_gradient = np.max(np.abs(self.gradient))
    if self.verbose:
        print(f" 1. max |gradient| {max_abs_gradient} <= {self.tol}")
    if max_abs_gradient > self.tol:
        return

    # 2. Criterion: For Newton decrement d, check 1/2 * d^2 <= tol
    #     d = sqrt(grad @ hessian^-1 @ grad)
    #       = sqrt(coef_newton @ hessian @ coef_newton)
    # See Boyd, Vandenberghe (2009) "Convex Optimization" Chapter 9.5.1.
    half_decrement_sq = 0.5 * (self.coef_newton @ self.hessian @ self.coef_newton)
    if self.verbose:
        print(f" 2. Newton decrement {half_decrement_sq} <= {self.tol}")
    if half_decrement_sq > self.tol:
        return

    if self.verbose:
        # Evaluated for reporting only.
        loss_value = self.linear_loss.loss(
            coef=self.coef,
            X=X,
            y=y,
            sample_weight=sample_weight,
            l2_reg_strength=self.l2_reg_strength,
            n_threads=self.n_threads,
        )
        print(f" Solver did converge at loss = {loss_value}.")
    self.converged = True
|
||||
|
||||
def finalize(self, X, y, sample_weight):
    """Finalize the solver's results.

    Hook called by ``solve`` just before it returns. This base implementation
    does nothing; some solvers may need to override it, others not.
    """
    return None
|
||||
|
||||
def solve(self, X, y, sample_weight):
    """Solve the optimization problem.

    This is the main routine.

    Order of calls:
        self.setup()
        while iteration:
            self.update_gradient_hessian()
            self.inner_solve()
            self.line_search()
            self.check_convergence()
        self.finalize()

    If ``inner_solve`` or ``line_search`` sets
    ``self.use_fallback_lbfgs_solve``, the loop is left immediately and
    ``fallback_lbfgs_solve`` is run instead.

    Returns
    -------
    coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
        Solution of the optimization problem.
    """
    # Every hook takes the same three keyword arguments.
    data = {"X": X, "y": y, "sample_weight": sample_weight}

    # setup usually:
    #   - initializes self.coef if needed
    #   - initializes and calculates self.raw_prediction, self.loss_value
    self.setup(**data)

    self.iteration = 1
    self.converged = False
    self.use_fallback_lbfgs_solve = False

    while self.iteration <= self.max_iter and not self.converged:
        if self.verbose:
            print(f"Newton iter={self.iteration}")

        # Reset the fallback flag at the start of each iteration.
        self.use_fallback_lbfgs_solve = False

        # 1. Update Hessian and gradient.
        self.update_gradient_hessian(**data)

        # TODO:
        # if iteration == 1:
        # We might stop early, e.g. we already are close to the optimum,
        # usually detected by zero gradients at this stage.

        # 2. Inner solver: Newton step/direction.
        # This usually sets self.coef_newton and self.gradient_times_newton.
        self.inner_solve(**data)
        if self.use_fallback_lbfgs_solve:
            break

        # 3. Backtracking line search.
        # This usually sets self.coef_old, self.coef, self.loss_value_old,
        # self.loss_value, self.gradient_old, self.gradient,
        # self.raw_prediction.
        self.line_search(**data)
        if self.use_fallback_lbfgs_solve:
            break

        # 4. Check convergence; sets self.converged.
        self.check_convergence(**data)

        # 5. Next iteration.
        self.iteration += 1

    if not self.converged:
        if not self.use_fallback_lbfgs_solve:
            warnings.warn(
                (
                    f"Newton solver did not converge after {self.iteration - 1} "
                    "iterations."
                ),
                ConvergenceWarning,
            )
        else:
            # Note: The fallback solver circumvents check_convergence and
            # relies on the convergence checks of lbfgs instead. Enough
            # warnings have been raised on the way.
            self.fallback_lbfgs_solve(**data)

    # The counter is always decremented by one before finalising.
    self.iteration -= 1
    self.finalize(**data)
    return self.coef
|
||||
|
||||
|
||||
class NewtonCholeskySolver(NewtonSolver):
    """Cholesky based Newton solver.

    The Newton step is obtained by solving the linear system
    H w_newton = -g with a direct solver for symmetric matrices. Whenever
    that is not possible or not useful, the solver requests the lbfgs
    fallback through ``self.use_fallback_lbfgs_solve``.
    """

    def setup(self, X, y, sample_weight):
        """Run the base setup and allocate gradient and Hessian buffers."""
        super().setup(X=X, y=y, sample_weight=sample_weight)

        # One degree of freedom per feature, plus one for the intercept.
        n_dof = X.shape[1] + (1 if self.linear_loss.fit_intercept else 0)

        # Uninitialised buffers; they are passed as output arrays later on.
        self.gradient = np.empty_like(self.coef)
        self.hessian = np.empty_like(self.coef, shape=(n_dof, n_dof))

    def update_gradient_hessian(self, X, y, sample_weight):
        """Recompute gradient and Hessian into the preallocated buffers.

        Only the third return value, a warning flag, is stored
        (``self.hessian_warning``); gradient and Hessian are passed as output
        buffers.
        """
        _, _, self.hessian_warning = self.linear_loss.gradient_hessian(
            coef=self.coef,
            X=X,
            y=y,
            sample_weight=sample_weight,
            l2_reg_strength=self.l2_reg_strength,
            n_threads=self.n_threads,
            gradient_out=self.gradient,
            hessian_out=self.hessian,
            raw_prediction=self.raw_prediction,  # this was updated in line_search
        )

    def inner_solve(self, X, y, sample_weight):
        """Solve for the Newton step or request the lbfgs fallback.

        On success sets ``self.coef_newton`` and
        ``self.gradient_times_newton``. Sets
        ``self.use_fallback_lbfgs_solve = True`` if the Hessian was flagged,
        if the linear solve fails or warns, or if the step is not a descent
        direction.
        """
        if self.hessian_warning:
            warnings.warn(
                (
                    f"The inner solver of {self.__class__.__name__} detected a "
                    "pointwise hessian with many negative values at iteration "
                    f"#{self.iteration}. It will now resort to lbfgs instead."
                ),
                ConvergenceWarning,
            )
            if self.verbose:
                print(
                    " The inner solver detected a pointwise Hessian with many "
                    "negative values and resorts to lbfgs instead."
                )
            self.use_fallback_lbfgs_solve = True
            return

        try:
            with warnings.catch_warnings():
                # Turn ill-conditioning warnings into exceptions so that they
                # are handled by the except clause below.
                warnings.simplefilter("error", scipy.linalg.LinAlgWarning)
                newton_step = scipy.linalg.solve(
                    self.hessian, -self.gradient, check_finite=False, assume_a="sym"
                )
                self.coef_newton = newton_step
                self.gradient_times_newton = self.gradient @ self.coef_newton
                if self.gradient_times_newton > 0:
                    # A positive slope means the step does not descend.
                    if self.verbose:
                        print(
                            " The inner solver found a Newton step that is not a "
                            "descent direction and resorts to LBFGS steps instead."
                        )
                    self.use_fallback_lbfgs_solve = True
                    return
        except (np.linalg.LinAlgError, scipy.linalg.LinAlgWarning) as err:
            warnings.warn(
                f"The inner solver of {self.__class__.__name__} stumbled upon a "
                "singular or very ill-conditioned Hessian matrix at iteration "
                f"#{self.iteration}. It will now resort to lbfgs instead.\n"
                "Further options are to use another solver or to avoid such situation "
                "in the first place. Possible remedies are removing collinear features"
                " of X or increasing the penalization strengths.\n"
                "The original Linear Algebra message was:\n" + str(err),
                scipy.linalg.LinAlgWarning,
            )
            # Possible causes:
            # 1. hess_pointwise is negative. But this is already taken care in
            #    LinearModelLoss.gradient_hessian.
            # 2. X is singular or ill-conditioned.
            #    This might be the most probable cause.
            #
            # There are many possible ways to deal with this situation. Most
            # of them add, explicitly or implicitly, a matrix to the hessian
            # to make it positive definite, confer to Chapter 3.4 of Nocedal &
            # Wright 2nd ed. Instead, we resort to lbfgs.
            if self.verbose:
                print(
                    " The inner solver stumbled upon an singular or ill-conditioned "
                    "Hessian matrix and resorts to LBFGS instead."
                )
            self.use_fallback_lbfgs_solve = True
|
||||
@@ -0,0 +1,902 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
"""
|
||||
Generalized Linear Models with Exponential Dispersion Family
|
||||
"""
|
||||
|
||||
from numbers import Integral, Real
|
||||
|
||||
import numpy as np
|
||||
import scipy.optimize
|
||||
|
||||
from ..._loss.loss import (
|
||||
HalfGammaLoss,
|
||||
HalfPoissonLoss,
|
||||
HalfSquaredError,
|
||||
HalfTweedieLoss,
|
||||
HalfTweedieLossIdentity,
|
||||
)
|
||||
from ...base import BaseEstimator, RegressorMixin, _fit_context
|
||||
from ...utils import check_array
|
||||
from ...utils._openmp_helpers import _openmp_effective_n_threads
|
||||
from ...utils._param_validation import Hidden, Interval, StrOptions
|
||||
from ...utils.optimize import _check_optimize_result
|
||||
from ...utils.validation import _check_sample_weight, check_is_fitted
|
||||
from .._linear_loss import LinearModelLoss
|
||||
from ._newton_solver import NewtonCholeskySolver, NewtonSolver
|
||||
|
||||
|
||||
class _GeneralizedLinearRegressor(RegressorMixin, BaseEstimator):
|
||||
"""Regression via a penalized Generalized Linear Model (GLM).
|
||||
|
||||
GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at fitting and
|
||||
predicting the mean of the target y as y_pred=h(X*w) with coefficients w.
|
||||
Therefore, the fit minimizes the following objective function with L2 priors as
|
||||
regularizer::
|
||||
|
||||
1/(2*sum(s_i)) * sum(s_i * deviance(y_i, h(x_i*w))) + 1/2 * alpha * ||w||_2^2
|
||||
|
||||
with inverse link function h, s=sample_weight and per observation (unit) deviance
|
||||
deviance(y_i, h(x_i*w)). Note that for an EDM, 1/2 * deviance is the negative
|
||||
log-likelihood up to a constant (in w) term.
|
||||
The parameter ``alpha`` corresponds to the lambda parameter in glmnet.
|
||||
|
||||
Instead of implementing the EDM family and a link function separately, we directly
|
||||
use the loss functions from `sklearn._loss`, which have the link functions included
|
||||
in them for performance reasons. We pick the loss functions that implement
|
||||
(1/2 times) EDM deviances.
|
||||
|
||||
Read more in the :ref:`User Guide <Generalized_linear_models>`.
|
||||
|
||||
.. versionadded:: 0.23
|
||||
|
||||
Parameters
|
||||
----------
|
||||
alpha : float, default=1
|
||||
Constant that multiplies the penalty term and thus determines the
|
||||
regularization strength. ``alpha = 0`` is equivalent to unpenalized
|
||||
GLMs. In this case, the design matrix `X` must have full column rank
|
||||
(no collinearities).
|
||||
Values must be in the range `[0.0, inf)`.
|
||||
|
||||
fit_intercept : bool, default=True
|
||||
Specifies if a constant (a.k.a. bias or intercept) should be
|
||||
added to the linear predictor (X @ coef + intercept).
|
||||
|
||||
solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs'
|
||||
Algorithm to use in the optimization problem:
|
||||
|
||||
'lbfgs'
|
||||
Calls scipy's L-BFGS-B optimizer.
|
||||
|
||||
'newton-cholesky'
|
||||
Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to
|
||||
iterated reweighted least squares) with an inner Cholesky based solver.
|
||||
This solver is a good choice for `n_samples` >> `n_features`, especially
|
||||
with one-hot encoded categorical features with rare categories. Be aware
|
||||
that the memory usage of this solver has a quadratic dependency on
|
||||
`n_features` because it explicitly computes the Hessian matrix.
|
||||
|
||||
.. versionadded:: 1.2
|
||||
|
||||
max_iter : int, default=100
|
||||
The maximal number of iterations for the solver.
|
||||
Values must be in the range `[1, inf)`.
|
||||
|
||||
tol : float, default=1e-4
|
||||
Stopping criterion. For the lbfgs solver,
|
||||
the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
|
||||
where ``g_j`` is the j-th component of the gradient (derivative) of
|
||||
the objective function.
|
||||
Values must be in the range `(0.0, inf)`.
|
||||
|
||||
warm_start : bool, default=False
|
||||
If set to ``True``, reuse the solution of the previous call to ``fit``
|
||||
as initialization for ``coef_`` and ``intercept_``.
|
||||
|
||||
verbose : int, default=0
|
||||
For the lbfgs solver set verbose to any positive number for verbosity.
|
||||
Values must be in the range `[0, inf)`.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
coef_ : array of shape (n_features,)
|
||||
Estimated coefficients for the linear predictor (`X @ coef_ +
|
||||
intercept_`) in the GLM.
|
||||
|
||||
intercept_ : float
|
||||
Intercept (a.k.a. bias) added to linear predictor.
|
||||
|
||||
n_iter_ : int
|
||||
Actual number of iterations used in the solver.
|
||||
|
||||
_base_loss : BaseLoss, default=HalfSquaredError()
|
||||
This is set during fit via `self._get_loss()`.
|
||||
A `_base_loss` contains a specific loss function as well as the link
|
||||
function. The loss to be minimized specifies the distributional assumption of
|
||||
the GLM, i.e. the distribution from the EDM. Here are some examples:
|
||||
|
||||
======================= ======== ==========================
|
||||
_base_loss Link Target Domain
|
||||
======================= ======== ==========================
|
||||
HalfSquaredError identity y any real number
|
||||
HalfPoissonLoss log 0 <= y
|
||||
HalfGammaLoss log 0 < y
|
||||
HalfTweedieLoss log dependent on tweedie power
|
||||
HalfTweedieLossIdentity identity dependent on tweedie power
|
||||
======================= ======== ==========================
|
||||
|
||||
The link function of the GLM, i.e. mapping from linear predictor
|
||||
`X @ coeff + intercept` to prediction `y_pred`. For instance, with a log link,
|
||||
we have `y_pred = exp(X @ coeff + intercept)`.
|
||||
"""
|
||||
|
||||
# We allow for NewtonSolver classes for the "solver" parameter but do not
|
||||
# make them public in the docstrings. This facilitates testing and
|
||||
# benchmarking.
|
||||
_parameter_constraints: dict = {
|
||||
"alpha": [Interval(Real, 0.0, None, closed="left")],
|
||||
"fit_intercept": ["boolean"],
|
||||
"solver": [
|
||||
StrOptions({"lbfgs", "newton-cholesky"}),
|
||||
Hidden(type),
|
||||
],
|
||||
"max_iter": [Interval(Integral, 1, None, closed="left")],
|
||||
"tol": [Interval(Real, 0.0, None, closed="neither")],
|
||||
"warm_start": ["boolean"],
|
||||
"verbose": ["verbose"],
|
||||
}
|
||||
|
||||
def __init__(
    self,
    *,
    alpha=1.0,
    fit_intercept=True,
    solver="lbfgs",
    max_iter=100,
    tol=1e-4,
    warm_start=False,
    verbose=0,
):
    # Hyper-parameters are stored unchanged under attributes of the same name.

    # Model definition.
    self.alpha = alpha
    self.fit_intercept = fit_intercept

    # Optimisation settings.
    self.solver = solver
    self.max_iter = max_iter
    self.tol = tol
    self.warm_start = warm_start

    # Reporting.
    self.verbose = verbose
|
||||
|
||||
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y, sample_weight=None):
    """Fit a Generalized Linear Model.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training data.

    y : array-like of shape (n_samples,)
        Target values.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    self : object
        Fitted model.
    """
    X, y = self._validate_data(
        X,
        y,
        accept_sparse=["csc", "csr"],
        dtype=[np.float64, np.float32],
        y_numeric=True,
        multi_output=False,
    )

    # The base loss needs y, X @ coef and sample_weight all of the same dtype
    # (and contiguous). lbfgs forces coef, and therefore raw_prediction, to
    # be float64.
    if self.solver == "lbfgs":
        loss_dtype = np.float64
    else:
        loss_dtype = min(max(y.dtype, X.dtype), np.float64)
    y = check_array(y, dtype=loss_dtype, order="C", ensure_2d=False)

    if sample_weight is not None:
        # Note that _check_sample_weight calls check_array(order="C")
        # required by losses.
        sample_weight = _check_sample_weight(sample_weight, X, dtype=loss_dtype)

    _n_samples, _n_features = X.shape  # not used further below
    self._base_loss = self._get_loss()

    linear_loss = LinearModelLoss(
        base_loss=self._base_loss,
        fit_intercept=self.fit_intercept,
    )

    if not linear_loss.base_loss.in_y_true_range(y):
        raise ValueError(
            "Some value(s) of y are out of the valid range of the loss"
            f" {self._base_loss.__class__.__name__!r}."
        )

    # TODO: if alpha=0 check that X is not rank deficient

    # NOTE: Rescaling of sample_weight:
    # We want to minimize
    #     obj = 1/(2 * sum(sample_weight)) * sum(sample_weight * deviance)
    #         + 1/2 * alpha * L2,
    # with
    #     deviance = 2 * loss.
    # The objective is invariant to multiplying sample_weight by a constant.
    # We could choose this constant such that sum(sample_weight) = 1 in order
    # to end up with
    #     obj = sum(sample_weight * loss) + 1/2 * alpha * L2.
    # But LinearModelLoss.loss() already computes
    #     average(loss, weights=sample_weight)
    # Thus, without rescaling, we have
    #     obj = LinearModelLoss.loss(...)

    # Starting point of the optimisation.
    reuse_previous_fit = self.warm_start and hasattr(self, "coef_")
    if reuse_previous_fit:
        if self.fit_intercept:
            # LinearModelLoss needs intercept at the end of coefficient array.
            coef = np.concatenate((self.coef_, np.array([self.intercept_])))
        else:
            coef = self.coef_
        coef = coef.astype(loss_dtype, copy=False)
    else:
        coef = linear_loss.init_zero_coef(X, dtype=loss_dtype)
        if self.fit_intercept:
            # Start the intercept at the link of the weighted mean of y.
            coef[-1] = linear_loss.base_loss.link.link(
                np.average(y, weights=sample_weight)
            )

    l2_reg_strength = self.alpha
    n_threads = _openmp_effective_n_threads()

    # Arguments shared by both Newton-type branches below.
    newton_kwargs = {
        "coef": coef,
        "linear_loss": linear_loss,
        "l2_reg_strength": l2_reg_strength,
        "tol": self.tol,
        "max_iter": self.max_iter,
        "n_threads": n_threads,
    }

    # Algorithms for optimization:
    # Note again that our losses implement 1/2 * deviance.
    if self.solver == "lbfgs":
        lbfgs_options = {
            "maxiter": self.max_iter,
            "maxls": 50,  # default is 20
            "iprint": self.verbose - 1,
            "gtol": self.tol,
            # The constant 64 was found empirically to pass the test suite.
            # The point is that ftol is very small, but a bit larger than
            # machine precision for float64, which is the dtype used by lbfgs.
            "ftol": 64 * np.finfo(float).eps,
        }
        result = scipy.optimize.minimize(
            linear_loss.loss_gradient,
            coef,
            method="L-BFGS-B",
            jac=True,
            options=lbfgs_options,
            args=(X, y, sample_weight, l2_reg_strength, n_threads),
        )
        self.n_iter_ = _check_optimize_result("lbfgs", result)
        coef = result.x
    elif self.solver == "newton-cholesky":
        newton = NewtonCholeskySolver(**newton_kwargs, verbose=self.verbose)
        coef = newton.solve(X, y, sample_weight)
        self.n_iter_ = newton.iteration
    elif issubclass(self.solver, NewtonSolver):
        # Non-public option: a NewtonSolver subclass passed as ``solver``.
        newton = self.solver(**newton_kwargs)
        coef = newton.solve(X, y, sample_weight)
        self.n_iter_ = newton.iteration
    else:
        raise ValueError(f"Invalid solver={self.solver}.")

    if self.fit_intercept:
        # The intercept is stored as the last entry of the coefficient array.
        self.intercept_ = coef[-1]
        self.coef_ = coef[:-1]
    else:
        # set intercept to zero as the other linear models do
        self.intercept_ = 0.0
        self.coef_ = coef

    return self
|
||||
|
||||
def _linear_predictor(self, X):
    """Compute the linear_predictor = `X @ coef_ + intercept_`.

    Note that we often use the term raw_prediction instead of linear predictor.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Samples.

    Returns
    -------
    y_pred : array of shape (n_samples,)
        Returns predicted values of linear predictor.
    """
    check_is_fitted(self)
    # reset=False: validate against the fitted estimator's state instead of
    # overwriting it.
    validated = self._validate_data(
        X,
        accept_sparse=["csr", "csc", "coo"],
        dtype=[np.float64, np.float32],
        ensure_2d=True,
        allow_nd=False,
        reset=False,
    )
    weighted_sum = validated @ self.coef_
    return weighted_sum + self.intercept_
|
||||
|
||||
def predict(self, X):
    """Predict using GLM with feature matrix X.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Samples.

    Returns
    -------
    y_pred : array of shape (n_samples,)
        Returns predicted values.
    """
    # Input validation happens inside _linear_predictor. It is called first
    # so that its errors surface before _base_loss is accessed.
    linear_predictor = self._linear_predictor(X)
    inverse_link = self._base_loss.link.inverse
    return inverse_link(linear_predictor)
|
||||
|
||||
def score(self, X, y, sample_weight=None):
    """Compute D^2, the percentage of deviance explained.

    D^2 is a generalization of the coefficient of determination R^2.
    R^2 uses squared error and D^2 uses the deviance of this GLM, see the
    :ref:`User Guide <regression_metrics>`.

    D^2 is defined as
    :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`,
    :math:`D_{null}` is the null deviance, i.e. the deviance of a model
    with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`.
    The mean :math:`\\bar{y}` is averaged by sample_weight.
    Best possible score is 1.0 and it can be negative (because the model
    can be arbitrarily worse).

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Test samples.

    y : array-like of shape (n_samples,)
        True values of target.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    score : float
        D^2 of self.predict(X) w.r.t. y.

    Raises
    ------
    ValueError
        If some value of `y` is outside the valid target range of the loss.
    """
    # TODO: Adapt link to User Guide in the docstring, once
    # https://github.com/scikit-learn/scikit-learn/pull/22118 is merged.
    #
    # Note, default score defined in RegressorMixin is R^2 score.
    # TODO: make D^2 a score function in module metrics (and thereby get
    #       input validation and so on)
    raw_prediction = self._linear_predictor(X)  # validates X
    # required by losses
    y = check_array(y, dtype=raw_prediction.dtype, order="C", ensure_2d=False)

    if sample_weight is not None:
        # Note that _check_sample_weight calls check_array(order="C") required
        # by losses.
        sample_weight = _check_sample_weight(sample_weight, X, dtype=y.dtype)

    base_loss = self._base_loss

    if not base_loss.in_y_true_range(y):
        # base_loss is an instance, so the class name has to be looked up on
        # its class; `base_loss.__name__` would raise AttributeError instead
        # of the intended ValueError.
        raise ValueError(
            "Some value(s) of y are out of the valid range of the loss"
            f" {base_loss.__class__.__name__!r}."
        )

    # Weighted average of the per-sample constant; it is added to both
    # deviances below.
    constant = np.average(
        base_loss.constant_to_optimal_zero(y_true=y, sample_weight=None),
        weights=sample_weight,
    )

    # Missing factor of 2 in deviance cancels out.
    deviance = base_loss(
        y_true=y,
        raw_prediction=raw_prediction,
        sample_weight=sample_weight,
        n_threads=1,
    )
    # Null model: constant prediction at the link of the weighted mean of y.
    y_mean = base_loss.link.link(np.average(y, weights=sample_weight))
    deviance_null = base_loss(
        y_true=y,
        raw_prediction=np.tile(y_mean, y.shape[0]),
        sample_weight=sample_weight,
        n_threads=1,
    )
    return 1 - (deviance + constant) / (deviance_null + constant)
|
||||
|
||||
def _more_tags(self):
    """Return the `requires_positive_y` tag derived from the loss, if any.

    The tag is True when the loss rejects -1.0 as a target value. An empty
    dict is returned when the loss cannot be created or queried.
    """
    tags = {}
    try:
        # Create instance of BaseLoss if fit wasn't called yet. This is
        # necessary as TweedieRegressor might set the used loss during fit
        # different from self._base_loss.
        accepts_negative_target = self._get_loss().in_y_true_range(-1.0)
        tags["requires_positive_y"] = not accepts_negative_target
    except (ValueError, AttributeError, TypeError):
        # This happens when the link or power parameter of TweedieRegressor
        # is invalid. We fall back on the default tags in that case.
        pass
    return tags
|
||||
|
||||
def _get_loss(self):
    """Return the base loss used by this estimator.

    This hook is only necessary because of the link and power arguments of
    the TweedieRegressor. The default is a fresh ``HalfSquaredError``.

    Note that we do not need to pass sample_weight to the loss class as this
    is only needed to set loss.constant_hessian on which GLMs do not rely.
    """
    default_loss = HalfSquaredError()
    return default_loss
|
||||
|
||||
|
||||
class PoissonRegressor(_GeneralizedLinearRegressor):
    """Generalized Linear Model with a Poisson distribution.

    This regressor uses the 'log' link function.

    Read more in the :ref:`User Guide <Generalized_linear_models>`.

    .. versionadded:: 0.23

    Parameters
    ----------
    alpha : float, default=1
        Constant that multiplies the L2 penalty term and determines the
        regularization strength. ``alpha = 0`` is equivalent to unpenalized
        GLMs. In this case, the design matrix `X` must have full column rank
        (no collinearities).
        Values of `alpha` must be in the range `[0.0, inf)`.

    fit_intercept : bool, default=True
        Specifies if a constant (a.k.a. bias or intercept) should be
        added to the linear predictor (`X @ coef + intercept`).

    solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs'
        Algorithm to use in the optimization problem:

        'lbfgs'
            Calls scipy's L-BFGS-B optimizer.

        'newton-cholesky'
            Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to
            iterated reweighted least squares) with an inner Cholesky based solver.
            This solver is a good choice for `n_samples` >> `n_features`, especially
            with one-hot encoded categorical features with rare categories. Be aware
            that the memory usage of this solver has a quadratic dependency on
            `n_features` because it explicitly computes the Hessian matrix.

            .. versionadded:: 1.2

    max_iter : int, default=100
        The maximal number of iterations for the solver.
        Values must be in the range `[1, inf)`.

    tol : float, default=1e-4
        Stopping criterion. For the lbfgs solver,
        the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
        where ``g_j`` is the j-th component of the gradient (derivative) of
        the objective function.
        Values must be in the range `(0.0, inf)`.

    warm_start : bool, default=False
        If set to ``True``, reuse the solution of the previous call to ``fit``
        as initialization for ``coef_`` and ``intercept_``.

    verbose : int, default=0
        For the lbfgs solver set verbose to any positive number for verbosity.
        Values must be in the range `[0, inf)`.

    Attributes
    ----------
    coef_ : array of shape (n_features,)
        Estimated coefficients for the linear predictor (`X @ coef_ +
        intercept_`) in the GLM.

    intercept_ : float
        Intercept (a.k.a. bias) added to linear predictor.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_iter_ : int
        Actual number of iterations used in the solver.

    See Also
    --------
    GammaRegressor : Generalized Linear Model with a Gamma distribution.
    TweedieRegressor : Generalized Linear Model with a Tweedie distribution.

    Examples
    --------
    >>> from sklearn import linear_model
    >>> clf = linear_model.PoissonRegressor()
    >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]]
    >>> y = [12, 17, 22, 21]
    >>> clf.fit(X, y)
    PoissonRegressor()
    >>> clf.score(X, y)
    np.float64(0.990...)
    >>> clf.coef_
    array([0.121..., 0.158...])
    >>> clf.intercept_
    np.float64(2.088...)
    >>> clf.predict([[1, 1], [3, 4]])
    array([10.676..., 21.875...])
    """

    # Shallow copy of the base-class constraints; no parameter is added here.
    _parameter_constraints: dict = dict(
        _GeneralizedLinearRegressor._parameter_constraints
    )

    def __init__(
        self,
        *,
        alpha=1.0,
        fit_intercept=True,
        solver="lbfgs",
        max_iter=100,
        tol=1e-4,
        warm_start=False,
        verbose=0,
    ):
        # Every constructor argument is forwarded unchanged to the base GLM.
        shared_params = {
            "alpha": alpha,
            "fit_intercept": fit_intercept,
            "solver": solver,
            "max_iter": max_iter,
            "tol": tol,
            "warm_start": warm_start,
            "verbose": verbose,
        }
        super().__init__(**shared_params)

    def _get_loss(self):
        """Return the loss instance for this estimator (``HalfPoissonLoss``)."""
        poisson_loss = HalfPoissonLoss()
        return poisson_loss
|
||||
|
||||
|
||||
class GammaRegressor(_GeneralizedLinearRegressor):
    """Generalized Linear Model with a Gamma distribution.

    This regressor uses the 'log' link function.

    Read more in the :ref:`User Guide <Generalized_linear_models>`.

    .. versionadded:: 0.23

    Parameters
    ----------
    alpha : float, default=1
        Constant that multiplies the L2 penalty term and determines the
        regularization strength. ``alpha = 0`` is equivalent to unpenalized
        GLMs. In this case, the design matrix `X` must have full column rank
        (no collinearities).
        Values of `alpha` must be in the range `[0.0, inf)`.

    fit_intercept : bool, default=True
        Specifies if a constant (a.k.a. bias or intercept) should be
        added to the linear predictor `X @ coef_ + intercept_`.

    solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs'
        Algorithm to use in the optimization problem:

        'lbfgs'
            Calls scipy's L-BFGS-B optimizer.

        'newton-cholesky'
            Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to
            iterated reweighted least squares) with an inner Cholesky based solver.
            This solver is a good choice for `n_samples` >> `n_features`, especially
            with one-hot encoded categorical features with rare categories. Be aware
            that the memory usage of this solver has a quadratic dependency on
            `n_features` because it explicitly computes the Hessian matrix.

            .. versionadded:: 1.2

    max_iter : int, default=100
        The maximal number of iterations for the solver.
        Values must be in the range `[1, inf)`.

    tol : float, default=1e-4
        Stopping criterion. For the lbfgs solver,
        the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
        where ``g_j`` is the j-th component of the gradient (derivative) of
        the objective function.
        Values must be in the range `(0.0, inf)`.

    warm_start : bool, default=False
        If set to ``True``, reuse the solution of the previous call to ``fit``
        as initialization for `coef_` and `intercept_`.

    verbose : int, default=0
        For the lbfgs solver set verbose to any positive number for verbosity.
        Values must be in the range `[0, inf)`.

    Attributes
    ----------
    coef_ : array of shape (n_features,)
        Estimated coefficients for the linear predictor (`X @ coef_ +
        intercept_`) in the GLM.

    intercept_ : float
        Intercept (a.k.a. bias) added to linear predictor.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    n_iter_ : int
        Actual number of iterations used in the solver.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    PoissonRegressor : Generalized Linear Model with a Poisson distribution.
    TweedieRegressor : Generalized Linear Model with a Tweedie distribution.

    Examples
    --------
    >>> from sklearn import linear_model
    >>> clf = linear_model.GammaRegressor()
    >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]]
    >>> y = [19, 26, 33, 30]
    >>> clf.fit(X, y)
    GammaRegressor()
    >>> clf.score(X, y)
    np.float64(0.773...)
    >>> clf.coef_
    array([0.072..., 0.066...])
    >>> clf.intercept_
    np.float64(2.896...)
    >>> clf.predict([[1, 0], [2, 8]])
    array([19.483..., 35.795...])
    """

    # Shallow copy of the base-class constraints; no parameter is added here.
    _parameter_constraints: dict = dict(
        _GeneralizedLinearRegressor._parameter_constraints
    )

    def __init__(
        self,
        *,
        alpha=1.0,
        fit_intercept=True,
        solver="lbfgs",
        max_iter=100,
        tol=1e-4,
        warm_start=False,
        verbose=0,
    ):
        # Every constructor argument is forwarded unchanged to the base GLM.
        shared_params = {
            "alpha": alpha,
            "fit_intercept": fit_intercept,
            "solver": solver,
            "max_iter": max_iter,
            "tol": tol,
            "warm_start": warm_start,
            "verbose": verbose,
        }
        super().__init__(**shared_params)

    def _get_loss(self):
        """Return the loss instance for this estimator (``HalfGammaLoss``)."""
        gamma_loss = HalfGammaLoss()
        return gamma_loss
|
||||
|
||||
|
||||
class TweedieRegressor(_GeneralizedLinearRegressor):
    """Generalized Linear Model with a Tweedie distribution.

    This estimator can be used to model different GLMs depending on the
    ``power`` parameter, which determines the underlying distribution.

    Read more in the :ref:`User Guide <Generalized_linear_models>`.

    .. versionadded:: 0.23

    Parameters
    ----------
    power : float, default=0
        The power determines the underlying target distribution according
        to the following table:

        +-------+------------------------+
        | Power | Distribution           |
        +=======+========================+
        | 0     | Normal                 |
        +-------+------------------------+
        | 1     | Poisson                |
        +-------+------------------------+
        | (1,2) | Compound Poisson Gamma |
        +-------+------------------------+
        | 2     | Gamma                  |
        +-------+------------------------+
        | 3     | Inverse Gaussian       |
        +-------+------------------------+

        For ``0 < power < 1``, no distribution exists.

    alpha : float, default=1
        Constant that multiplies the L2 penalty term and determines the
        regularization strength. ``alpha = 0`` is equivalent to unpenalized
        GLMs. In this case, the design matrix `X` must have full column rank
        (no collinearities).
        Values of `alpha` must be in the range `[0.0, inf)`.

    fit_intercept : bool, default=True
        Specifies if a constant (a.k.a. bias or intercept) should be
        added to the linear predictor (`X @ coef + intercept`).

    link : {'auto', 'identity', 'log'}, default='auto'
        The link function of the GLM, i.e. mapping from linear predictor
        `X @ coef + intercept` to prediction `y_pred`. Option 'auto' sets
        the link depending on the chosen `power` parameter as follows:

        - 'identity' for ``power <= 0``, e.g. for the Normal distribution
        - 'log' for ``power > 0``, e.g. for Poisson, Gamma and Inverse Gaussian
          distributions

    solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs'
        Algorithm to use in the optimization problem:

        'lbfgs'
            Calls scipy's L-BFGS-B optimizer.

        'newton-cholesky'
            Uses Newton-Raphson steps (in arbitrary precision arithmetic equivalent to
            iterated reweighted least squares) with an inner Cholesky based solver.
            This solver is a good choice for `n_samples` >> `n_features`, especially
            with one-hot encoded categorical features with rare categories. Be aware
            that the memory usage of this solver has a quadratic dependency on
            `n_features` because it explicitly computes the Hessian matrix.

            .. versionadded:: 1.2

    max_iter : int, default=100
        The maximal number of iterations for the solver.
        Values must be in the range `[1, inf)`.

    tol : float, default=1e-4
        Stopping criterion. For the lbfgs solver,
        the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
        where ``g_j`` is the j-th component of the gradient (derivative) of
        the objective function.
        Values must be in the range `(0.0, inf)`.

    warm_start : bool, default=False
        If set to ``True``, reuse the solution of the previous call to ``fit``
        as initialization for ``coef_`` and ``intercept_``.

    verbose : int, default=0
        For the lbfgs solver set verbose to any positive number for verbosity.
        Values must be in the range `[0, inf)`.

    Attributes
    ----------
    coef_ : array of shape (n_features,)
        Estimated coefficients for the linear predictor (`X @ coef_ +
        intercept_`) in the GLM.

    intercept_ : float
        Intercept (a.k.a. bias) added to linear predictor.

    n_iter_ : int
        Actual number of iterations used in the solver.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    PoissonRegressor : Generalized Linear Model with a Poisson distribution.
    GammaRegressor : Generalized Linear Model with a Gamma distribution.

    Examples
    --------
    >>> from sklearn import linear_model
    >>> clf = linear_model.TweedieRegressor()
    >>> X = [[1, 2], [2, 3], [3, 4], [4, 3]]
    >>> y = [2, 3.5, 5, 5.5]
    >>> clf.fit(X, y)
    TweedieRegressor()
    >>> clf.score(X, y)
    np.float64(0.839...)
    >>> clf.coef_
    array([0.599..., 0.299...])
    >>> clf.intercept_
    np.float64(1.600...)
    >>> clf.predict([[1, 1], [3, 4]])
    array([2.500..., 4.599...])
    """

    # Base-class constraints extended with the two Tweedie-specific parameters.
    _parameter_constraints: dict = dict(
        _GeneralizedLinearRegressor._parameter_constraints,
        power=[Interval(Real, None, None, closed="neither")],
        link=[StrOptions({"auto", "identity", "log"})],
    )

    def __init__(
        self,
        *,
        power=0.0,
        alpha=1.0,
        fit_intercept=True,
        link="auto",
        solver="lbfgs",
        max_iter=100,
        tol=1e-4,
        warm_start=False,
        verbose=0,
    ):
        # Parameters shared with every GLM are handled by the base class.
        shared_params = {
            "alpha": alpha,
            "fit_intercept": fit_intercept,
            "solver": solver,
            "max_iter": max_iter,
            "tol": tol,
            "warm_start": warm_start,
            "verbose": verbose,
        }
        super().__init__(**shared_params)
        self.link = link
        self.power = power

    def _get_loss(self):
        """Return the Tweedie loss matching ``self.link`` and ``self.power``.

        ``link="auto"`` resolves to the identity link for ``power <= 0`` and
        to the log link otherwise.
        """
        resolved_link = self.link
        if resolved_link == "auto":
            resolved_link = "identity" if self.power <= 0 else "log"

        if resolved_link == "log":
            return HalfTweedieLoss(power=self.power)

        if resolved_link == "identity":
            return HalfTweedieLossIdentity(power=self.power)
|
||||
@@ -0,0 +1,2 @@
|
||||
# Authors: The scikit-learn developers
|
||||
# SPDX-License-Identifier: BSD-3-Clause
|
||||
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,352 @@
|
||||
# Authors: Manoj Kumar mks542@nyu.edu
|
||||
# License: BSD 3 clause
|
||||
|
||||
from numbers import Integral, Real
|
||||
|
||||
import numpy as np
|
||||
from scipy import optimize
|
||||
|
||||
from ..base import BaseEstimator, RegressorMixin, _fit_context
|
||||
from ..utils._mask import axis0_safe_slice
|
||||
from ..utils._param_validation import Interval
|
||||
from ..utils.extmath import safe_sparse_dot
|
||||
from ..utils.optimize import _check_optimize_result
|
||||
from ..utils.validation import _check_sample_weight
|
||||
from ._base import LinearModel
|
||||
|
||||
|
||||
def _huber_loss_and_gradient(w, X, y, epsilon, alpha, sample_weight=None):
    """Return the Huber loss and its gradient.

    Parameters
    ----------
    w : ndarray, shape (n_features + 1,) or (n_features + 2,)
        Parameter vector.
        w[:n_features] gives the coefficients,
        w[-1] gives the scale factor and, if the intercept is fit, w[-2]
        gives the intercept. Whether an intercept is fit is inferred from
        the length of `w`.

    X : ndarray of shape (n_samples, n_features)
        Input data.

    y : ndarray of shape (n_samples,)
        Target vector.

    epsilon : float
        Robustness of the Huber estimator.

    alpha : float
        Regularization parameter.

    sample_weight : ndarray of shape (n_samples,), default=None
        Weight assigned to each sample. If None, every sample gets a weight
        of one.

    Returns
    -------
    loss : float
        Huber loss.

    gradient : ndarray, shape (len(w))
        Returns the derivative of the Huber loss with respect to each
        coefficient, intercept and the scale as a vector.
    """
    _, n_features = X.shape
    if sample_weight is None:
        # The signature allows None, but the masking below needs an array.
        sample_weight = np.ones(X.shape[0])

    fit_intercept = n_features + 2 == w.shape[0]
    if fit_intercept:
        intercept = w[-2]
    sigma = w[-1]
    w = w[:n_features]
    # Despite its name, this is the sum of the sample weights.
    n_samples = np.sum(sample_weight)

    # Calculate the values where |(y - X'w - c) / sigma| > epsilon
    # The values above this threshold are outliers.
    linear_loss = y - safe_sparse_dot(X, w)
    if fit_intercept:
        linear_loss -= intercept
    abs_linear_loss = np.abs(linear_loss)
    outliers_mask = abs_linear_loss > epsilon * sigma

    # Calculate the linear loss due to the outliers.
    # This is equal to
    # (2 * epsilon * |(y - X'w - c) / sigma| - epsilon**2) * sigma
    outliers = abs_linear_loss[outliers_mask]
    num_outliers = np.count_nonzero(outliers_mask)
    n_non_outliers = X.shape[0] - num_outliers

    # n_sw_outliers includes the weight given to the outliers while
    # num_outliers is just the number of outliers.
    outliers_sw = sample_weight[outliers_mask]
    n_sw_outliers = np.sum(outliers_sw)
    outlier_loss = (
        2.0 * epsilon * np.sum(outliers_sw * outliers)
        - sigma * n_sw_outliers * epsilon**2
    )

    # Calculate the quadratic loss due to the non-outliers.
    # This is equal to |(y - X'w - c)**2 / sigma**2| * sigma
    non_outliers = linear_loss[~outliers_mask]
    weighted_non_outliers = sample_weight[~outliers_mask] * non_outliers
    weighted_loss = np.dot(weighted_non_outliers.T, non_outliers)
    squared_loss = weighted_loss / sigma

    if fit_intercept:
        grad = np.zeros(n_features + 2)
    else:
        grad = np.zeros(n_features + 1)

    # Gradient due to the squared loss.
    X_non_outliers = -axis0_safe_slice(X, ~outliers_mask, n_non_outliers)
    grad[:n_features] = (
        2.0 / sigma * safe_sparse_dot(weighted_non_outliers, X_non_outliers)
    )

    # Gradient due to the linear loss.
    signed_outliers = np.ones_like(outliers)
    signed_outliers_mask = linear_loss[outliers_mask] < 0
    signed_outliers[signed_outliers_mask] = -1.0
    X_outliers = axis0_safe_slice(X, outliers_mask, num_outliers)
    sw_outliers = outliers_sw * signed_outliers
    grad[:n_features] -= 2.0 * epsilon * (safe_sparse_dot(sw_outliers, X_outliers))

    # Gradient due to the penalty.
    grad[:n_features] += alpha * 2.0 * w

    # Gradient due to sigma.
    grad[-1] = n_samples
    grad[-1] -= n_sw_outliers * epsilon**2
    grad[-1] -= squared_loss / sigma

    # Gradient due to the intercept.
    if fit_intercept:
        grad[-2] = -2.0 * np.sum(weighted_non_outliers) / sigma
        grad[-2] -= 2.0 * epsilon * np.sum(sw_outliers)

    loss = n_samples * sigma + squared_loss + outlier_loss
    loss += alpha * np.dot(w, w)
    return loss, grad
|
||||
|
||||
|
||||
class HuberRegressor(LinearModel, RegressorMixin, BaseEstimator):
    """L2-regularized linear regression model that is robust to outliers.

    The Huber Regressor optimizes the squared loss for the samples where
    ``|(y - Xw - c) / sigma| < epsilon`` and the absolute loss for the samples
    where ``|(y - Xw - c) / sigma| > epsilon``, where the model coefficients
    ``w``, the intercept ``c`` and the scale ``sigma`` are parameters
    to be optimized. The parameter sigma makes sure that if y is scaled up
    or down by a certain factor, one does not need to rescale epsilon to
    achieve the same robustness. Note that this does not take into account
    the fact that the different features of X may be of different scales.

    The Huber loss function has the advantage of not being heavily influenced
    by the outliers while not completely ignoring their effect.

    Read more in the :ref:`User Guide <huber_regression>`

    .. versionadded:: 0.18

    Parameters
    ----------
    epsilon : float, default=1.35
        The parameter epsilon controls the number of samples that should be
        classified as outliers. The smaller the epsilon, the more robust it is
        to outliers. Epsilon must be in the range `[1, inf)`.

    max_iter : int, default=100
        Maximum number of iterations that
        ``scipy.optimize.minimize(method="L-BFGS-B")`` should run for.

    alpha : float, default=0.0001
        Strength of the squared L2 regularization. Note that the penalty is
        equal to ``alpha * ||w||^2``.
        Must be in the range `[0, inf)`.

    warm_start : bool, default=False
        This is useful if the stored attributes of a previously used model
        has to be reused. If set to False, then the coefficients will
        be rewritten for every call to fit.
        See :term:`the Glossary <warm_start>`.

    fit_intercept : bool, default=True
        Whether or not to fit the intercept. This can be set to False
        if the data is already centered around the origin.

    tol : float, default=1e-05
        The iteration will stop when
        ``max{|proj g_i | i = 1, ..., n}`` <= ``tol``
        where pg_i is the i-th component of the projected gradient.

    Attributes
    ----------
    coef_ : array, shape (n_features,)
        Features got by optimizing the L2-regularized Huber loss.

    intercept_ : float
        Bias.

    scale_ : float
        The value by which ``|y - Xw - c|`` is scaled down.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_iter_ : int
        Number of iterations that
        ``scipy.optimize.minimize(method="L-BFGS-B")`` has run for.

        .. versionchanged:: 0.20

            In SciPy <= 1.0.0 the number of lbfgs iterations may exceed
            ``max_iter``. ``n_iter_`` will now report at most ``max_iter``.

    outliers_ : array, shape (n_samples,)
        A boolean mask which is set to True where the samples are identified
        as outliers.

    See Also
    --------
    RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm.
    TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model.
    SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD.

    References
    ----------
    .. [1] Peter J. Huber, Elvezio M. Ronchetti, Robust Statistics
           Concomitant scale estimates, pg 172
    .. [2] Art B. Owen (2006), A robust hybrid of lasso and ridge regression.
           https://statweb.stanford.edu/~owen/reports/hhu.pdf

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.linear_model import HuberRegressor, LinearRegression
    >>> from sklearn.datasets import make_regression
    >>> rng = np.random.RandomState(0)
    >>> X, y, coef = make_regression(
    ...     n_samples=200, n_features=2, noise=4.0, coef=True, random_state=0)
    >>> X[:4] = rng.uniform(10, 20, (4, 2))
    >>> y[:4] = rng.uniform(10, 20, 4)
    >>> huber = HuberRegressor().fit(X, y)
    >>> huber.score(X, y)
    -7.284...
    >>> huber.predict(X[:1,])
    array([806.7200...])
    >>> linear = LinearRegression().fit(X, y)
    >>> print("True coefficients:", coef)
    True coefficients: [20.4923...  34.1698...]
    >>> print("Huber coefficients:", huber.coef_)
    Huber coefficients: [17.7906... 31.0106...]
    >>> print("Linear Regression coefficients:", linear.coef_)
    Linear Regression coefficients: [-1.9221...  7.0226...]
    """

    _parameter_constraints: dict = {
        "epsilon": [Interval(Real, 1.0, None, closed="left")],
        "max_iter": [Interval(Integral, 0, None, closed="left")],
        "alpha": [Interval(Real, 0, None, closed="left")],
        "warm_start": ["boolean"],
        "fit_intercept": ["boolean"],
        "tol": [Interval(Real, 0.0, None, closed="left")],
    }

    def __init__(
        self,
        *,
        epsilon=1.35,
        max_iter=100,
        alpha=0.0001,
        warm_start=False,
        fit_intercept=True,
        tol=1e-05,
    ):
        self.epsilon = epsilon
        self.max_iter = max_iter
        self.alpha = alpha
        self.warm_start = warm_start
        self.fit_intercept = fit_intercept
        self.tol = tol

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, sample_weight=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : array-like, shape (n_samples,)
            Target vector relative to X.

        sample_weight : array-like, shape (n_samples,), default=None
            Weight given to each sample.

        Returns
        -------
        self : object
            Fitted `HuberRegressor` estimator.

        Raises
        ------
        ValueError
            If the L-BFGS-B solver terminates with status 2.
        """
        X, y = self._validate_data(
            X,
            y,
            copy=False,
            accept_sparse=["csr"],
            y_numeric=True,
            dtype=[np.float64, np.float32],
        )

        sample_weight = _check_sample_weight(sample_weight, X)

        if self.warm_start and hasattr(self, "coef_"):
            # _huber_loss_and_gradient infers whether an intercept is fitted
            # from the length of the parameter vector, so the previous
            # intercept may only be included when fit_intercept is True.
            # Otherwise an intercept would be optimized and then discarded.
            if self.fit_intercept:
                trailing = [self.intercept_, self.scale_]
            else:
                trailing = [self.scale_]
            parameters = np.concatenate((self.coef_, trailing))
        else:
            if self.fit_intercept:
                parameters = np.zeros(X.shape[1] + 2)
            else:
                parameters = np.zeros(X.shape[1] + 1)
            # Make sure to initialize the scale parameter to a strictly
            # positive value:
            parameters[-1] = 1

        # Sigma or the scale factor should be non-negative.
        # Setting it to be zero might cause undefined bounds hence we set it
        # to a value close to zero.
        bounds = np.tile([-np.inf, np.inf], (parameters.shape[0], 1))
        bounds[-1][0] = np.finfo(np.float64).eps * 10

        opt_res = optimize.minimize(
            _huber_loss_and_gradient,
            parameters,
            method="L-BFGS-B",
            jac=True,
            args=(X, y, self.epsilon, self.alpha, sample_weight),
            options={"maxiter": self.max_iter, "gtol": self.tol, "iprint": -1},
            bounds=bounds,
        )

        parameters = opt_res.x

        if opt_res.status == 2:
            raise ValueError(
                "HuberRegressor convergence failed: l-BFGS-b solver terminated with %s"
                % opt_res.message
            )
        self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
        self.scale_ = parameters[-1]
        if self.fit_intercept:
            self.intercept_ = parameters[-2]
        else:
            self.intercept_ = 0.0
        self.coef_ = parameters[: X.shape[1]]

        # Samples whose absolute residual exceeds scale_ * epsilon are flagged.
        residual = np.abs(y - safe_sparse_dot(X, self.coef_) - self.intercept_)
        self.outliers_ = residual > self.scale_ * self.epsilon
        return self
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,672 @@
|
||||
"""
|
||||
Loss functions for linear models with raw_prediction = X @ coef
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from scipy import sparse
|
||||
|
||||
from ..utils.extmath import squared_norm
|
||||
|
||||
|
||||
class LinearModelLoss:
|
||||
"""General class for loss functions with raw_prediction = X @ coef + intercept.
|
||||
|
||||
Note that raw_prediction is also known as linear predictor.
|
||||
|
||||
The loss is the average of per sample losses and includes a term for L2
|
||||
regularization::
|
||||
|
||||
loss = 1 / s_sum * sum_i s_i loss(y_i, X_i @ coef + intercept)
|
||||
+ 1/2 * l2_reg_strength * ||coef||_2^2
|
||||
|
||||
with sample weights s_i=1 if sample_weight=None and s_sum=sum_i s_i.
|
||||
|
||||
Gradient and hessian, for simplicity without intercept, are::
|
||||
|
||||
gradient = 1 / s_sum * X.T @ loss.gradient + l2_reg_strength * coef
|
||||
hessian = 1 / s_sum * X.T @ diag(loss.hessian) @ X
|
||||
+ l2_reg_strength * identity
|
||||
|
||||
Conventions:
|
||||
if fit_intercept:
|
||||
n_dof = n_features + 1
|
||||
else:
|
||||
n_dof = n_features
|
||||
|
||||
if base_loss.is_multiclass:
|
||||
coef.shape = (n_classes, n_dof) or ravelled (n_classes * n_dof,)
|
||||
else:
|
||||
coef.shape = (n_dof,)
|
||||
|
||||
The intercept term is at the end of the coef array:
|
||||
if base_loss.is_multiclass:
|
||||
if coef.shape (n_classes, n_dof):
|
||||
intercept = coef[:, -1]
|
||||
if coef.shape (n_classes * n_dof,)
|
||||
intercept = coef[n_features::n_dof] = coef[(n_dof-1)::n_dof]
|
||||
intercept.shape = (n_classes,)
|
||||
else:
|
||||
intercept = coef[-1]
|
||||
|
||||
Note: If coef has shape (n_classes * n_dof,), the 2d-array can be reconstructed as
|
||||
|
||||
coef.reshape((n_classes, -1), order="F")
|
||||
|
||||
The option order="F" makes coef[:, i] contiguous. This, in turn, makes the
|
||||
coefficients without intercept, coef[:, :-1], contiguous and speeds up
|
||||
matrix-vector computations.
|
||||
|
||||
Note: If the average loss per sample is wanted instead of the sum of the loss per
|
||||
sample, one can simply use a rescaled sample_weight such that
|
||||
sum(sample_weight) = 1.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
base_loss : instance of class BaseLoss from sklearn._loss.
|
||||
fit_intercept : bool
|
||||
"""
|
||||
|
||||
def __init__(self, base_loss, fit_intercept):
|
||||
self.base_loss = base_loss
|
||||
self.fit_intercept = fit_intercept
|
||||
|
||||
def init_zero_coef(self, X, dtype=None):
|
||||
"""Allocate coef of correct shape with zeros.
|
||||
|
||||
Parameters:
|
||||
-----------
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||||
Training data.
|
||||
dtype : data-type, default=None
|
||||
Overrides the data type of coef. With dtype=None, coef will have the same
|
||||
dtype as X.
|
||||
|
||||
Returns
|
||||
-------
|
||||
coef : ndarray of shape (n_dof,) or (n_classes, n_dof)
|
||||
Coefficients of a linear model.
|
||||
"""
|
||||
n_features = X.shape[1]
|
||||
n_classes = self.base_loss.n_classes
|
||||
if self.fit_intercept:
|
||||
n_dof = n_features + 1
|
||||
else:
|
||||
n_dof = n_features
|
||||
if self.base_loss.is_multiclass:
|
||||
coef = np.zeros_like(X, shape=(n_classes, n_dof), dtype=dtype, order="F")
|
||||
else:
|
||||
coef = np.zeros_like(X, shape=n_dof, dtype=dtype)
|
||||
return coef
|
||||
|
||||
def weight_intercept(self, coef):
    """Split ``coef`` into feature weights and intercept.

    Parameters
    ----------
    coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
        Coefficients of a linear model.
        If shape (n_classes * n_dof,), the classes of one feature are contiguous,
        i.e. one reconstructs the 2d-array via
        coef.reshape((n_classes, -1), order="F").

    Returns
    -------
    weights : ndarray of shape (n_features,) or (n_classes, n_features)
        Coefficients without intercept term.
    intercept : float or ndarray of shape (n_classes,)
        Intercept terms; the float 0.0 when no intercept is fitted.
    """
    if self.base_loss.is_multiclass:
        # Bring a flat coef back to shape (n_classes, n_dof).
        if coef.ndim == 1:
            coef_2d = coef.reshape((self.base_loss.n_classes, -1), order="F")
        else:
            coef_2d = coef
        if not self.fit_intercept:
            return coef_2d, 0.0
        # The intercept is the last column.
        return coef_2d[:, :-1], coef_2d[:, -1]

    if not self.fit_intercept:
        return coef, 0.0
    # The intercept is the last entry.
    return coef[:-1], coef[-1]
|
||||
|
||||
def weight_intercept_raw(self, coef, X):
    """Split ``coef`` and compute the raw (link-space) prediction for ``X``.

    Parameters
    ----------
    coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
        Coefficients of a linear model.
        If shape (n_classes * n_dof,), the classes of one feature are contiguous,
        i.e. one reconstructs the 2d-array via
        coef.reshape((n_classes, -1), order="F").
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training data.

    Returns
    -------
    weights : ndarray of shape (n_features,) or (n_classes, n_features)
        Coefficients without intercept term.
    intercept : float or ndarray of shape (n_classes,)
        Intercept terms.
    raw_prediction : ndarray of shape (n_samples,) or \
        (n_samples, n_classes)
        ``X @ weights + intercept`` (``X @ weights.T + intercept`` for
        multiclass).
    """
    weights, intercept = self.weight_intercept(coef)

    if self.base_loss.is_multiclass:
        # weights has shape (n_classes, n_features), hence the transpose.
        linear_map = weights.T
    else:
        linear_map = weights
    raw_prediction = X @ linear_map + intercept  # ndarray, likely C-contiguous

    return weights, intercept, raw_prediction
|
||||
|
||||
def l2_penalty(self, weights, l2_reg_strength):
    """Compute L2 penalty term l2_reg_strength/2 *||w||_2^2."""
    if weights.ndim == 1:
        # A plain dot product gives the squared norm of a 1d vector.
        sq_norm = weights @ weights
    else:
        sq_norm = squared_norm(weights)
    return 0.5 * l2_reg_strength * sq_norm
|
||||
|
||||
def loss(
    self,
    coef,
    X,
    y,
    sample_weight=None,
    l2_reg_strength=0.0,
    n_threads=1,
    raw_prediction=None,
):
    """Compute the weighted average of the point-wise losses plus L2 penalty.

    Parameters
    ----------
    coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
        Coefficients of a linear model.
        If shape (n_classes * n_dof,), the classes of one feature are contiguous,
        i.e. one reconstructs the 2d-array via
        coef.reshape((n_classes, -1), order="F").
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training data.
    y : contiguous array of shape (n_samples,)
        Observed, true target values.
    sample_weight : None or contiguous array of shape (n_samples,), default=None
        Sample weights.
    l2_reg_strength : float, default=0.0
        L2 regularization strength
    n_threads : int, default=1
        Number of OpenMP threads to use.
    raw_prediction : C-contiguous array of shape (n_samples,) or array of \
        shape (n_samples, n_classes)
        Raw prediction values (in link space). If provided, these are used. If
        None, then raw_prediction = X @ coef + intercept is calculated.

    Returns
    -------
    loss : float
        Weighted average of losses per sample, plus penalty.
    """
    if raw_prediction is not None:
        weights, _ = self.weight_intercept(coef)
    else:
        weights, _, raw_prediction = self.weight_intercept_raw(coef, X)

    # The base loss is evaluated unweighted; the sample weights enter only
    # through the averaging step below.
    per_sample = self.base_loss.loss(
        y_true=y,
        raw_prediction=raw_prediction,
        sample_weight=None,
        n_threads=n_threads,
    )
    data_term = np.average(per_sample, weights=sample_weight)

    return data_term + self.l2_penalty(weights, l2_reg_strength)
|
||||
|
||||
def loss_gradient(
    self,
    coef,
    X,
    y,
    sample_weight=None,
    l2_reg_strength=0.0,
    n_threads=1,
    raw_prediction=None,
):
    """Compute the loss and its gradient w.r.t. coef.

    Parameters
    ----------
    coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
        Coefficients of a linear model.
        If shape (n_classes * n_dof,), the classes of one feature are contiguous,
        i.e. one reconstructs the 2d-array via
        coef.reshape((n_classes, -1), order="F").
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training data.
    y : contiguous array of shape (n_samples,)
        Observed, true target values.
    sample_weight : None or contiguous array of shape (n_samples,), default=None
        Sample weights.
    l2_reg_strength : float, default=0.0
        L2 regularization strength
    n_threads : int, default=1
        Number of OpenMP threads to use.
    raw_prediction : C-contiguous array of shape (n_samples,) or array of \
        shape (n_samples, n_classes)
        Raw prediction values (in link space). If provided, these are used. If
        None, then raw_prediction = X @ coef + intercept is calculated.

    Returns
    -------
    loss : float
        Weighted average of losses per sample, plus penalty.

    gradient : ndarray of shape coef.shape
        The gradient of the loss.
    """
    n_samples, n_features = X.shape
    n_classes = self.base_loss.n_classes
    n_dof = n_features + int(self.fit_intercept)

    if raw_prediction is not None:
        weights, _ = self.weight_intercept(coef)
    else:
        weights, _, raw_prediction = self.weight_intercept_raw(coef, X)

    loss, grad_pointwise = self.base_loss.loss_gradient(
        y_true=y,
        raw_prediction=raw_prediction,
        sample_weight=sample_weight,
        n_threads=n_threads,
    )
    # Normalise by the total weight (the sample count when unweighted).
    if sample_weight is None:
        sw_sum = n_samples
    else:
        sw_sum = np.sum(sample_weight)
    loss = loss.sum() / sw_sum
    loss += self.l2_penalty(weights, l2_reg_strength)

    grad_pointwise /= sw_sum

    if self.base_loss.is_multiclass:
        grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F")
        # grad_pointwise.shape = (n_samples, n_classes)
        grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
        if self.fit_intercept:
            grad[:, -1] = grad_pointwise.sum(axis=0)
        # Hand back the same layout the caller passed in.
        if coef.ndim == 1:
            grad = grad.ravel(order="F")
    else:
        grad = np.empty_like(coef, dtype=weights.dtype)
        grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
        if self.fit_intercept:
            grad[-1] = grad_pointwise.sum()

    return loss, grad
|
||||
|
||||
def gradient(
    self,
    coef,
    X,
    y,
    sample_weight=None,
    l2_reg_strength=0.0,
    n_threads=1,
    raw_prediction=None,
):
    """Compute the gradient of the penalized loss w.r.t. coef.

    Parameters
    ----------
    coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
        Coefficients of a linear model.
        If shape (n_classes * n_dof,), the classes of one feature are contiguous,
        i.e. one reconstructs the 2d-array via
        coef.reshape((n_classes, -1), order="F").
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training data.
    y : contiguous array of shape (n_samples,)
        Observed, true target values.
    sample_weight : None or contiguous array of shape (n_samples,), default=None
        Sample weights.
    l2_reg_strength : float, default=0.0
        L2 regularization strength
    n_threads : int, default=1
        Number of OpenMP threads to use.
    raw_prediction : C-contiguous array of shape (n_samples,) or array of \
        shape (n_samples, n_classes)
        Raw prediction values (in link space). If provided, these are used. If
        None, then raw_prediction = X @ coef + intercept is calculated.

    Returns
    -------
    gradient : ndarray of shape coef.shape
        The gradient of the loss.
    """
    n_samples, n_features = X.shape
    n_classes = self.base_loss.n_classes
    n_dof = n_features + int(self.fit_intercept)

    if raw_prediction is not None:
        weights, _ = self.weight_intercept(coef)
    else:
        weights, _, raw_prediction = self.weight_intercept_raw(coef, X)

    grad_pointwise = self.base_loss.gradient(
        y_true=y,
        raw_prediction=raw_prediction,
        sample_weight=sample_weight,
        n_threads=n_threads,
    )
    # Normalise by the total weight (the sample count when unweighted).
    if sample_weight is None:
        sw_sum = n_samples
    else:
        sw_sum = np.sum(sample_weight)
    grad_pointwise /= sw_sum

    if self.base_loss.is_multiclass:
        grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F")
        # grad_pointwise.shape = (n_samples, n_classes)
        grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
        if self.fit_intercept:
            grad[:, -1] = grad_pointwise.sum(axis=0)
        # Hand back the same layout the caller passed in.
        if coef.ndim == 1:
            grad = grad.ravel(order="F")
    else:
        grad = np.empty_like(coef, dtype=weights.dtype)
        grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
        if self.fit_intercept:
            grad[-1] = grad_pointwise.sum()

    return grad
|
||||
|
||||
def gradient_hessian(
    self,
    coef,
    X,
    y,
    sample_weight=None,
    l2_reg_strength=0.0,
    n_threads=1,
    gradient_out=None,
    hessian_out=None,
    raw_prediction=None,
):
    """Compute the gradient and the full hessian matrix w.r.t. coef.

    Parameters
    ----------
    coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
        Coefficients of a linear model.
        If shape (n_classes * n_dof,), the classes of one feature are contiguous,
        i.e. one reconstructs the 2d-array via
        coef.reshape((n_classes, -1), order="F").
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training data.
    y : contiguous array of shape (n_samples,)
        Observed, true target values.
    sample_weight : None or contiguous array of shape (n_samples,), default=None
        Sample weights.
    l2_reg_strength : float, default=0.0
        L2 regularization strength
    n_threads : int, default=1
        Number of OpenMP threads to use.
    gradient_out : None or ndarray of shape coef.shape
        A location into which the gradient is stored. If None, a new array
        might be created.
    hessian_out : None or ndarray
        A location into which the hessian is stored. If None, a new array
        might be created.
    raw_prediction : C-contiguous array of shape (n_samples,) or array of \
        shape (n_samples, n_classes)
        Raw prediction values (in link space). If provided, these are used. If
        None, then raw_prediction = X @ coef + intercept is calculated.

    Returns
    -------
    gradient : ndarray of shape coef.shape
        The gradient of the loss.

    hessian : ndarray
        Hessian matrix. It is NOT filled in when ``hessian_warning`` is True.

    hessian_warning : bool
        True if more than 25% of the pointwise hessian entries are
        non-positive.

    Raises
    ------
    NotImplementedError
        If the base loss is multiclass.
    """
    n_samples, n_features = X.shape
    n_dof = n_features + int(self.fit_intercept)

    if raw_prediction is not None:
        weights, _ = self.weight_intercept(coef)
    else:
        weights, _, raw_prediction = self.weight_intercept_raw(coef, X)

    grad_pointwise, hess_pointwise = self.base_loss.gradient_hessian(
        y_true=y,
        raw_prediction=raw_prediction,
        sample_weight=sample_weight,
        n_threads=n_threads,
    )
    if sample_weight is None:
        sw_sum = n_samples
    else:
        sw_sum = np.sum(sample_weight)
    grad_pointwise /= sw_sum
    hess_pointwise /= sw_sum

    # For non-canonical link functions and far away from the optimum, the pointwise
    # hessian can be negative. We take care that 75% of the hessian entries are
    # positive.
    hessian_warning = np.mean(hess_pointwise <= 0) > 0.25
    hess_pointwise = np.abs(hess_pointwise)

    if self.base_loss.is_multiclass:
        # Here we may safely assume HalfMultinomialLoss aka categorical
        # cross-entropy.
        raise NotImplementedError

    # gradient
    grad = (
        gradient_out
        if gradient_out is not None
        else np.empty_like(coef, dtype=weights.dtype)
    )
    grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
    if self.fit_intercept:
        grad[-1] = grad_pointwise.sum()

    # hessian
    hess = (
        hessian_out
        if hessian_out is not None
        else np.empty(shape=(n_dof, n_dof), dtype=weights.dtype)
    )

    if hessian_warning:
        # Exit early without computing the hessian.
        return grad, hess, hessian_warning

    # TODO: This "sandwich product", X' diag(W) X, is the main computational
    # bottleneck for solvers. A dedicated Cython routine might improve it
    # exploiting the symmetry (as opposed to, e.g., BLAS gemm).
    if sparse.issparse(X):
        diag_h = sparse.dia_matrix(
            (hess_pointwise, 0), shape=(n_samples, n_samples)
        )
        hess[:n_features, :n_features] = (X.T @ diag_h @ X).toarray()
    else:
        # np.einsum may use less memory but the following, using BLAS matrix
        # multiplication (gemm), is by far faster.
        WX = hess_pointwise[:, None] * X
        hess[:n_features, :n_features] = np.dot(X.T, WX)

    if l2_reg_strength > 0:
        # The L2 penalty enters the Hessian on the diagonal only. To add those
        # terms, we use a flattened view on the array.
        diag_stop = n_features * n_dof
        diag_step = n_dof + 1
        hess.reshape(-1)[:diag_stop:diag_step] += l2_reg_strength

    if self.fit_intercept:
        # With intercept included as added column to X, the hessian becomes
        # hess = (X, 1)' @ diag(h) @ (X, 1)
        #      = (X' @ diag(h) @ X, X' @ h)
        #        (           h @ X, sum(h))
        # The left upper part has already been filled, it remains to compute
        # the last row and the last column.
        Xh = X.T @ hess_pointwise
        hess[:-1, -1] = Xh
        hess[-1, :-1] = Xh
        hess[-1, -1] = hess_pointwise.sum()

    return grad, hess, hessian_warning
|
||||
|
||||
def gradient_hessian_product(
    self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1
):
    """Compute the gradient and a hessian-vector product function w.r.t. coef.

    Parameters
    ----------
    coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
        Coefficients of a linear model.
        If shape (n_classes * n_dof,), the classes of one feature are contiguous,
        i.e. one reconstructs the 2d-array via
        coef.reshape((n_classes, -1), order="F").
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training data.
    y : contiguous array of shape (n_samples,)
        Observed, true target values.
    sample_weight : None or contiguous array of shape (n_samples,), default=None
        Sample weights.
    l2_reg_strength : float, default=0.0
        L2 regularization strength
    n_threads : int, default=1
        Number of OpenMP threads to use.

    Returns
    -------
    gradient : ndarray of shape coef.shape
        The gradient of the loss.

    hessp : callable
        Function that takes in a vector input of shape of gradient and
        returns matrix-vector product with hessian.
    """
    n_samples, n_features = X.shape
    n_classes = self.base_loss.n_classes
    n_dof = n_features + int(self.fit_intercept)
    weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X)
    if sample_weight is None:
        sw_sum = n_samples
    else:
        sw_sum = np.sum(sample_weight)

    if self.base_loss.is_multiclass:
        # Here we may safely assume HalfMultinomialLoss aka categorical
        # cross-entropy.
        # HalfMultinomialLoss computes only the diagonal part of the hessian, i.e.
        # diagonal in the classes. Here, we want the matrix-vector product of the
        # full hessian. Therefore, we call gradient_proba.
        grad_pointwise, proba = self.base_loss.gradient_proba(
            y_true=y,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            n_threads=n_threads,
        )
        grad_pointwise /= sw_sum
        grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F")
        grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
        if self.fit_intercept:
            grad[:, -1] = grad_pointwise.sum(axis=0)

        # Full hessian-vector product, i.e. not only the diagonal part of the
        # hessian. Derivation with some index battle for input vector s:
        #   - sample index i
        #   - feature indices j, m
        #   - class indices k, l
        #   - 1_{k=l} is one if k=l else 0
        #   - p_i_k is the (predicted) probability that sample i belongs to class k
        #     for all i: sum_k p_i_k = 1
        #   - s_l_m is input vector for class l and feature m
        #   - X' = X transposed
        #
        # Note: Hessian with dropping most indices is just:
        #       X' @ p_k (1(k=l) - p_l) @ X
        #
        # result_{k j} = sum_{i, l, m} Hessian_{i, k j, m l} * s_l_m
        #   = sum_{i, l, m} (X')_{ji} * p_i_k * (1_{k=l} - p_i_l)
        #                   * X_{im} s_l_m
        #   = sum_{i, m} (X')_{ji} * p_i_k
        #                * (X_{im} * s_k_m - sum_l p_i_l * X_{im} * s_l_m)
        #
        # See also https://github.com/scikit-learn/scikit-learn/pull/3646#discussion_r17461411  # noqa
        def hessp(s):
            s = s.reshape((n_classes, -1), order="F")  # shape = (n_classes, n_dof)
            if self.fit_intercept:
                s_intercept = s[:, -1]
                s = s[:, :-1]  # shape = (n_classes, n_features)
            else:
                s_intercept = 0
            proj = X @ s.T + s_intercept  # X_{im} * s_k_m
            proj += (-proba * proj).sum(axis=1)[:, np.newaxis]  # - sum_l ..
            proj *= proba  # * p_i_k
            if sample_weight is not None:
                proj *= sample_weight[:, np.newaxis]
            # hess_prod = empty_like(grad), but we ravel grad below and this
            # function is run after that.
            hess_prod = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F")
            hess_prod[:, :n_features] = (proj.T @ X) / sw_sum + l2_reg_strength * s
            if self.fit_intercept:
                hess_prod[:, -1] = proj.sum(axis=0) / sw_sum
            if coef.ndim == 1:
                return hess_prod.ravel(order="F")
            return hess_prod

    else:
        grad_pointwise, hess_pointwise = self.base_loss.gradient_hessian(
            y_true=y,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            n_threads=n_threads,
        )
        grad_pointwise /= sw_sum
        hess_pointwise /= sw_sum
        grad = np.empty_like(coef, dtype=weights.dtype)
        grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
        if self.fit_intercept:
            grad[-1] = grad_pointwise.sum()

        # Precompute as much as possible: hX, hX_sum and hessian_sum
        hessian_sum = hess_pointwise.sum()
        X_is_sparse = sparse.issparse(X)
        if X_is_sparse:
            hX = (
                sparse.dia_matrix((hess_pointwise, 0), shape=(n_samples, n_samples))
                @ X
            )
        else:
            hX = hess_pointwise[:, np.newaxis] * X

        if self.fit_intercept:
            # Calculate the double derivative with respect to intercept.
            # Note: In case hX is sparse, hX.sum is a matrix object.
            hX_sum = np.squeeze(np.asarray(hX.sum(axis=0)))
            # prevent squeezing to zero-dim array if n_features == 1
            hX_sum = np.atleast_1d(hX_sum)

        # With intercept included and l2_reg_strength = 0, hessp returns
        # res = (X, 1)' @ diag(h) @ (X, 1) @ s
        #     = (X, 1)' @ (hX @ s[:n_features] + h * s[-1])
        # res[:n_features] = X' @ hX @ s[:n_features] + X' @ h * s[-1]
        # res[-1] = 1' @ hX @ s[:n_features] + sum(h) * s[-1]
        # where X' @ h equals hX_sum, the column sums of hX.
        def hessp(s):
            s_feat = s[:n_features]
            ret = np.empty_like(s)
            if X_is_sparse:
                ret[:n_features] = X.T @ (hX @ s_feat)
            else:
                ret[:n_features] = np.linalg.multi_dot([X.T, hX, s_feat])
            ret[:n_features] += l2_reg_strength * s_feat

            if self.fit_intercept:
                ret[:n_features] += s[-1] * hX_sum
                ret[-1] = hX_sum @ s_feat + hessian_sum * s[-1]
            return ret

    if coef.ndim == 1:
        return grad.ravel(order="F"), hessp

    return grad, hessp
|
||||
2287
.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py
Normal file
2287
.venv/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py
Normal file
File diff suppressed because it is too large
Load Diff
1121
.venv/lib/python3.12/site-packages/sklearn/linear_model/_omp.py
Normal file
1121
.venv/lib/python3.12/site-packages/sklearn/linear_model/_omp.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,579 @@
|
||||
# Authors: Rob Zinkov, Mathieu Blondel
|
||||
# License: BSD 3 clause
|
||||
from numbers import Real
|
||||
|
||||
from ..base import _fit_context
|
||||
from ..utils._param_validation import Interval, StrOptions
|
||||
from ._stochastic_gradient import DEFAULT_EPSILON, BaseSGDClassifier, BaseSGDRegressor
|
||||
|
||||
|
||||
class PassiveAggressiveClassifier(BaseSGDClassifier):
|
||||
"""Passive Aggressive Classifier.
|
||||
|
||||
Read more in the :ref:`User Guide <passive_aggressive>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
C : float, default=1.0
|
||||
Maximum step size (regularization). Defaults to 1.0.
|
||||
|
||||
fit_intercept : bool, default=True
|
||||
Whether the intercept should be estimated or not. If False, the
|
||||
data is assumed to be already centered.
|
||||
|
||||
max_iter : int, default=1000
|
||||
The maximum number of passes over the training data (aka epochs).
|
||||
It only impacts the behavior in the ``fit`` method, and not the
|
||||
:meth:`~sklearn.linear_model.PassiveAggressiveClassifier.partial_fit` method.
|
||||
|
||||
.. versionadded:: 0.19
|
||||
|
||||
tol : float or None, default=1e-3
|
||||
The stopping criterion. If it is not None, the iterations will stop
|
||||
when (loss > previous_loss - tol).
|
||||
|
||||
.. versionadded:: 0.19
|
||||
|
||||
early_stopping : bool, default=False
|
||||
Whether to use early stopping to terminate training when validation
|
||||
score is not improving. If set to True, it will automatically set aside
|
||||
a stratified fraction of training data as validation and terminate
|
||||
training when validation score is not improving by at least `tol` for
|
||||
`n_iter_no_change` consecutive epochs.
|
||||
|
||||
.. versionadded:: 0.20
|
||||
|
||||
validation_fraction : float, default=0.1
|
||||
The proportion of training data to set aside as validation set for
|
||||
early stopping. Must be between 0 and 1.
|
||||
Only used if early_stopping is True.
|
||||
|
||||
.. versionadded:: 0.20
|
||||
|
||||
n_iter_no_change : int, default=5
|
||||
Number of iterations with no improvement to wait before early stopping.
|
||||
|
||||
.. versionadded:: 0.20
|
||||
|
||||
shuffle : bool, default=True
|
||||
Whether or not the training data should be shuffled after each epoch.
|
||||
|
||||
verbose : int, default=0
|
||||
The verbosity level.
|
||||
|
||||
loss : str, default="hinge"
|
||||
The loss function to be used:
|
||||
hinge: equivalent to PA-I in the reference paper.
|
||||
squared_hinge: equivalent to PA-II in the reference paper.
|
||||
|
||||
n_jobs : int or None, default=None
|
||||
The number of CPUs to use to do the OVA (One Versus All, for
|
||||
multi-class problems) computation.
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
|
||||
random_state : int, RandomState instance, default=None
|
||||
Used to shuffle the training data, when ``shuffle`` is set to
|
||||
``True``. Pass an int for reproducible output across multiple
|
||||
function calls.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
warm_start : bool, default=False
|
||||
When set to True, reuse the solution of the previous call to fit as
|
||||
initialization, otherwise, just erase the previous solution.
|
||||
See :term:`the Glossary <warm_start>`.
|
||||
|
||||
Repeatedly calling fit or partial_fit when warm_start is True can
|
||||
result in a different solution than when calling fit a single time
|
||||
because of the way the data is shuffled.
|
||||
|
||||
class_weight : dict, {class_label: weight} or "balanced" or None, \
|
||||
default=None
|
||||
Preset for the class_weight fit parameter.
|
||||
|
||||
Weights associated with classes. If not given, all classes
|
||||
are supposed to have weight one.
|
||||
|
||||
The "balanced" mode uses the values of y to automatically adjust
|
||||
weights inversely proportional to class frequencies in the input data
|
||||
as ``n_samples / (n_classes * np.bincount(y))``.
|
||||
|
||||
.. versionadded:: 0.17
|
||||
parameter *class_weight* to automatically weight samples.
|
||||
|
||||
average : bool or int, default=False
|
||||
When set to True, computes the averaged SGD weights and stores the
|
||||
result in the ``coef_`` attribute. If set to an int greater than 1,
|
||||
averaging will begin once the total number of samples seen reaches
|
||||
average. So average=10 will begin averaging after seeing 10 samples.
|
||||
|
||||
.. versionadded:: 0.19
|
||||
parameter *average* to use weights averaging in SGD.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
coef_ : ndarray of shape (1, n_features) if n_classes == 2 else \
|
||||
(n_classes, n_features)
|
||||
Weights assigned to the features.
|
||||
|
||||
intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,)
|
||||
Constants in decision function.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
n_iter_ : int
|
||||
The actual number of iterations to reach the stopping criterion.
|
||||
For multiclass fits, it is the maximum over every binary fit.
|
||||
|
||||
classes_ : ndarray of shape (n_classes,)
|
||||
The unique classes labels.
|
||||
|
||||
t_ : int
|
||||
Number of weight updates performed during training.
|
||||
Same as ``(n_iter_ * n_samples + 1)``.
|
||||
|
||||
loss_function_ : callable
|
||||
Loss function used by the algorithm.
|
||||
|
||||
.. deprecated:: 1.4
|
||||
Attribute `loss_function_` was deprecated in version 1.4 and will be
|
||||
removed in 1.6.
|
||||
|
||||
See Also
|
||||
--------
|
||||
SGDClassifier : Incrementally trained logistic regression.
|
||||
Perceptron : Linear perceptron classifier.
|
||||
|
||||
References
|
||||
----------
|
||||
Online Passive-Aggressive Algorithms
|
||||
<http://jmlr.csail.mit.edu/papers/volume7/crammer06a/crammer06a.pdf>
|
||||
K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006)
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.linear_model import PassiveAggressiveClassifier
|
||||
>>> from sklearn.datasets import make_classification
|
||||
>>> X, y = make_classification(n_features=4, random_state=0)
|
||||
>>> clf = PassiveAggressiveClassifier(max_iter=1000, random_state=0,
|
||||
... tol=1e-3)
|
||||
>>> clf.fit(X, y)
|
||||
PassiveAggressiveClassifier(random_state=0)
|
||||
>>> print(clf.coef_)
|
||||
[[0.26642044 0.45070924 0.67251877 0.64185414]]
|
||||
>>> print(clf.intercept_)
|
||||
[1.84127814]
|
||||
>>> print(clf.predict([[0, 0, 0, 0]]))
|
||||
[1]
|
||||
"""
|
||||
|
||||
_parameter_constraints: dict = {
|
||||
**BaseSGDClassifier._parameter_constraints,
|
||||
"loss": [StrOptions({"hinge", "squared_hinge"})],
|
||||
"C": [Interval(Real, 0, None, closed="right")],
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
C=1.0,
|
||||
fit_intercept=True,
|
||||
max_iter=1000,
|
||||
tol=1e-3,
|
||||
early_stopping=False,
|
||||
validation_fraction=0.1,
|
||||
n_iter_no_change=5,
|
||||
shuffle=True,
|
||||
verbose=0,
|
||||
loss="hinge",
|
||||
n_jobs=None,
|
||||
random_state=None,
|
||||
warm_start=False,
|
||||
class_weight=None,
|
||||
average=False,
|
||||
):
|
||||
super().__init__(
|
||||
penalty=None,
|
||||
fit_intercept=fit_intercept,
|
||||
max_iter=max_iter,
|
||||
tol=tol,
|
||||
early_stopping=early_stopping,
|
||||
validation_fraction=validation_fraction,
|
||||
n_iter_no_change=n_iter_no_change,
|
||||
shuffle=shuffle,
|
||||
verbose=verbose,
|
||||
random_state=random_state,
|
||||
eta0=1.0,
|
||||
warm_start=warm_start,
|
||||
class_weight=class_weight,
|
||||
average=average,
|
||||
n_jobs=n_jobs,
|
||||
)
|
||||
|
||||
self.C = C
|
||||
self.loss = loss
|
||||
|
||||
@_fit_context(prefer_skip_nested_validation=True)
|
||||
def partial_fit(self, X, y, classes=None):
|
||||
"""Fit linear model with Passive Aggressive algorithm.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||||
Subset of the training data.
|
||||
|
||||
y : array-like of shape (n_samples,)
|
||||
Subset of the target values.
|
||||
|
||||
classes : ndarray of shape (n_classes,)
|
||||
Classes across all calls to partial_fit.
|
||||
Can be obtained by via `np.unique(y_all)`, where y_all is the
|
||||
target vector of the entire dataset.
|
||||
This argument is required for the first call to partial_fit
|
||||
and can be omitted in the subsequent calls.
|
||||
Note that y doesn't need to contain all labels in `classes`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : object
|
||||
Fitted estimator.
|
||||
"""
|
||||
if not hasattr(self, "classes_"):
|
||||
self._more_validate_params(for_partial_fit=True)
|
||||
|
||||
if self.class_weight == "balanced":
|
||||
raise ValueError(
|
||||
"class_weight 'balanced' is not supported for "
|
||||
"partial_fit. For 'balanced' weights, use "
|
||||
"`sklearn.utils.compute_class_weight` with "
|
||||
"`class_weight='balanced'`. In place of y you "
|
||||
"can use a large enough subset of the full "
|
||||
"training set target to properly estimate the "
|
||||
"class frequency distributions. Pass the "
|
||||
"resulting weights as the class_weight "
|
||||
"parameter."
|
||||
)
|
||||
|
||||
lr = "pa1" if self.loss == "hinge" else "pa2"
|
||||
return self._partial_fit(
|
||||
X,
|
||||
y,
|
||||
alpha=1.0,
|
||||
C=self.C,
|
||||
loss="hinge",
|
||||
learning_rate=lr,
|
||||
max_iter=1,
|
||||
classes=classes,
|
||||
sample_weight=None,
|
||||
coef_init=None,
|
||||
intercept_init=None,
|
||||
)
|
||||
|
||||
@_fit_context(prefer_skip_nested_validation=True)
|
||||
def fit(self, X, y, coef_init=None, intercept_init=None):
    """Fit linear model with Passive Aggressive algorithm.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training data.

    y : array-like of shape (n_samples,)
        Target values.

    coef_init : ndarray of shape (n_classes, n_features), default=None
        The initial coefficients to warm-start the optimization.

    intercept_init : ndarray of shape (n_classes,), default=None
        The initial intercept to warm-start the optimization.

    Returns
    -------
    self : object
        Fitted estimator.
    """
    self._more_validate_params()

    # `self.loss` only selects the PA-I / PA-II step rule; the loss handed
    # to the shared SGD routine is always "hinge".
    lr = "pa1" if self.loss == "hinge" else "pa2"
    return self._fit(
        X,
        y,
        alpha=1.0,
        C=self.C,
        loss="hinge",
        learning_rate=lr,
        coef_init=coef_init,
        intercept_init=intercept_init,
    )
|
||||
|
||||
|
||||
class PassiveAggressiveRegressor(BaseSGDRegressor):
    """Passive Aggressive Regressor.

    Read more in the :ref:`User Guide <passive_aggressive>`.

    Parameters
    ----------

    C : float, default=1.0
        Maximum step size (regularization).

    fit_intercept : bool, default=True
        Whether the intercept should be estimated or not. If False, the
        data is assumed to be already centered.

    max_iter : int, default=1000
        The maximum number of passes over the training data (aka epochs).
        It only impacts the behavior in the ``fit`` method, and not the
        :meth:`~sklearn.linear_model.PassiveAggressiveRegressor.partial_fit` method.

        .. versionadded:: 0.19

    tol : float or None, default=1e-3
        The stopping criterion. If it is not None, the iterations will stop
        when (loss > previous_loss - tol).

        .. versionadded:: 0.19

    early_stopping : bool, default=False
        Whether to use early stopping to terminate training when validation
        score is not improving. If set to True, it will automatically set aside
        a fraction of training data as validation and terminate
        training when validation score is not improving by at least tol for
        n_iter_no_change consecutive epochs.

        .. versionadded:: 0.20

    validation_fraction : float, default=0.1
        The proportion of training data to set aside as validation set for
        early stopping. Must be between 0 and 1.
        Only used if early_stopping is True.

        .. versionadded:: 0.20

    n_iter_no_change : int, default=5
        Number of iterations with no improvement to wait before early stopping.

        .. versionadded:: 0.20

    shuffle : bool, default=True
        Whether or not the training data should be shuffled after each epoch.

    verbose : int, default=0
        The verbosity level.

    loss : str, default="epsilon_insensitive"
        The loss function to be used:
        epsilon_insensitive: equivalent to PA-I in the reference paper.
        squared_epsilon_insensitive: equivalent to PA-II in the reference
        paper.

    epsilon : float, default=0.1
        If the difference between the current prediction and the correct label
        is below this threshold, the model is not updated.

    random_state : int, RandomState instance or None, default=None
        Used to shuffle the training data, when ``shuffle`` is set to
        ``True``. Pass an int for reproducible output across multiple
        function calls.
        See :term:`Glossary <random_state>`.

    warm_start : bool, default=False
        When set to True, reuse the solution of the previous call to fit as
        initialization, otherwise, just erase the previous solution.
        See :term:`the Glossary <warm_start>`.

        Repeatedly calling fit or partial_fit when warm_start is True can
        result in a different solution than when calling fit a single time
        because of the way the data is shuffled.

    average : bool or int, default=False
        When set to True, computes the averaged SGD weights and stores the
        result in the ``coef_`` attribute. If set to an int greater than 1,
        averaging will begin once the total number of samples seen reaches
        average. So average=10 will begin averaging after seeing 10 samples.

        .. versionadded:: 0.19
           parameter *average* to use weights averaging in SGD.

    Attributes
    ----------
    coef_ : ndarray of shape (n_features,)
        Weights assigned to the features.

    intercept_ : ndarray of shape (1,)
        Constants in decision function.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_iter_ : int
        The actual number of iterations to reach the stopping criterion.

    t_ : int
        Number of weight updates performed during training.
        Same as ``(n_iter_ * n_samples + 1)``.

    See Also
    --------
    SGDRegressor : Linear model fitted by minimizing a regularized
        empirical loss with SGD.

    References
    ----------
    Online Passive-Aggressive Algorithms
    <http://jmlr.csail.mit.edu/papers/volume7/crammer06a/crammer06a.pdf>
    K. Crammer, O. Dekel, J. Keshat, S. Shalev-Shwartz, Y. Singer - JMLR (2006).

    Examples
    --------
    >>> from sklearn.linear_model import PassiveAggressiveRegressor
    >>> from sklearn.datasets import make_regression

    >>> X, y = make_regression(n_features=4, random_state=0)
    >>> regr = PassiveAggressiveRegressor(max_iter=100, random_state=0,
    ... tol=1e-3)
    >>> regr.fit(X, y)
    PassiveAggressiveRegressor(max_iter=100, random_state=0)
    >>> print(regr.coef_)
    [20.48736655 34.18818427 67.59122734 87.94731329]
    >>> print(regr.intercept_)
    [-0.02306214]
    >>> print(regr.predict([[0, 0, 0, 0]]))
    [-0.02306214]
    """

    # Inherit the base SGD regressor constraints and override the three
    # parameters whose admissible values are specific to this estimator.
    _parameter_constraints: dict = {
        **BaseSGDRegressor._parameter_constraints,
        "loss": [StrOptions({"epsilon_insensitive", "squared_epsilon_insensitive"})],
        "C": [Interval(Real, 0, None, closed="right")],
        "epsilon": [Interval(Real, 0, None, closed="left")],
    }

    def __init__(
        self,
        *,
        C=1.0,
        fit_intercept=True,
        max_iter=1000,
        tol=1e-3,
        early_stopping=False,
        validation_fraction=0.1,
        n_iter_no_change=5,
        shuffle=True,
        verbose=0,
        loss="epsilon_insensitive",
        epsilon=DEFAULT_EPSILON,
        random_state=None,
        warm_start=False,
        average=False,
    ):
        # penalty, l1_ratio and eta0 are not user-facing here; they are
        # pinned to fixed values when delegating to the base class.
        super().__init__(
            penalty=None,
            l1_ratio=0,
            epsilon=epsilon,
            eta0=1.0,
            fit_intercept=fit_intercept,
            max_iter=max_iter,
            tol=tol,
            early_stopping=early_stopping,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change,
            shuffle=shuffle,
            verbose=verbose,
            random_state=random_state,
            warm_start=warm_start,
            average=average,
        )
        self.C = C
        self.loss = loss

    @_fit_context(prefer_skip_nested_validation=True)
    def partial_fit(self, X, y):
        """Fit linear model with Passive Aggressive algorithm.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Subset of training data.

        y : numpy array of shape (n_samples,)
            Subset of target values.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # Parameter checks only run while `coef_` is still missing, i.e. on
        # the first incremental call.
        if not hasattr(self, "coef_"):
            self._more_validate_params(for_partial_fit=True)

        # `self.loss` only selects the PA-I / PA-II step rule; the loss
        # handed to the shared SGD routine is always "epsilon_insensitive".
        lr = "pa1" if self.loss == "epsilon_insensitive" else "pa2"
        return self._partial_fit(
            X,
            y,
            alpha=1.0,
            C=self.C,
            loss="epsilon_insensitive",
            learning_rate=lr,
            max_iter=1,
            sample_weight=None,
            coef_init=None,
            intercept_init=None,
        )

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, coef_init=None, intercept_init=None):
        """Fit linear model with Passive Aggressive algorithm.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.

        y : numpy array of shape (n_samples,)
            Target values.

        coef_init : array of shape (n_features,), default=None
            The initial coefficients to warm-start the optimization.

        intercept_init : array of shape (1,), default=None
            The initial intercept to warm-start the optimization.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        self._more_validate_params()

        # `self.loss` only selects the PA-I / PA-II step rule; the loss
        # handed to the shared SGD routine is always "epsilon_insensitive".
        lr = "pa1" if self.loss == "epsilon_insensitive" else "pa2"
        return self._fit(
            X,
            y,
            alpha=1.0,
            C=self.C,
            loss="epsilon_insensitive",
            learning_rate=lr,
            coef_init=coef_init,
            intercept_init=intercept_init,
        )
|
||||
@@ -0,0 +1,229 @@
|
||||
# Author: Mathieu Blondel
|
||||
# License: BSD 3 clause
|
||||
from numbers import Real
|
||||
|
||||
from ..utils._param_validation import Interval, StrOptions
|
||||
from ._stochastic_gradient import BaseSGDClassifier
|
||||
|
||||
|
||||
class Perceptron(BaseSGDClassifier):
    """Linear perceptron classifier.

    The implementation is a wrapper around :class:`~sklearn.linear_model.SGDClassifier`
    by fixing the `loss` and `learning_rate` parameters as::

        SGDClassifier(loss="perceptron", learning_rate="constant")

    Other available parameters are described below and are forwarded to
    :class:`~sklearn.linear_model.SGDClassifier`.

    Read more in the :ref:`User Guide <perceptron>`.

    Parameters
    ----------

    penalty : {'l2', 'l1', 'elasticnet'} or None, default=None
        The penalty (aka regularization term) to be used.

    alpha : float, default=0.0001
        Constant that multiplies the regularization term if regularization is
        used.

    l1_ratio : float, default=0.15
        The Elastic Net mixing parameter, with `0 <= l1_ratio <= 1`.
        `l1_ratio=0` corresponds to L2 penalty, `l1_ratio=1` to L1.
        Only used if `penalty='elasticnet'`.

        .. versionadded:: 0.24

    fit_intercept : bool, default=True
        Whether the intercept should be estimated or not. If False, the
        data is assumed to be already centered.

    max_iter : int, default=1000
        The maximum number of passes over the training data (aka epochs).
        It only impacts the behavior in the ``fit`` method, and not the
        :meth:`partial_fit` method.

        .. versionadded:: 0.19

    tol : float or None, default=1e-3
        The stopping criterion. If it is not None, the iterations will stop
        when (loss > previous_loss - tol).

        .. versionadded:: 0.19

    shuffle : bool, default=True
        Whether or not the training data should be shuffled after each epoch.

    verbose : int, default=0
        The verbosity level.

    eta0 : float, default=1.0
        Constant by which the updates are multiplied.

    n_jobs : int, default=None
        The number of CPUs to use to do the OVA (One Versus All, for
        multi-class problems) computation.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    random_state : int, RandomState instance or None, default=0
        Used to shuffle the training data, when ``shuffle`` is set to
        ``True``. Pass an int for reproducible output across multiple
        function calls.
        See :term:`Glossary <random_state>`.

    early_stopping : bool, default=False
        Whether to use early stopping to terminate training when validation
        score is not improving. If set to True, it will automatically set aside
        a stratified fraction of training data as validation and terminate
        training when validation score is not improving by at least `tol` for
        `n_iter_no_change` consecutive epochs.

        .. versionadded:: 0.20

    validation_fraction : float, default=0.1
        The proportion of training data to set aside as validation set for
        early stopping. Must be between 0 and 1.
        Only used if early_stopping is True.

        .. versionadded:: 0.20

    n_iter_no_change : int, default=5
        Number of iterations with no improvement to wait before early stopping.

        .. versionadded:: 0.20

    class_weight : dict, {class_label: weight} or "balanced", default=None
        Preset for the class_weight fit parameter.

        Weights associated with classes. If not given, all classes
        are supposed to have weight one.

        The "balanced" mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input data
        as ``n_samples / (n_classes * np.bincount(y))``.

    warm_start : bool, default=False
        When set to True, reuse the solution of the previous call to fit as
        initialization, otherwise, just erase the previous solution. See
        :term:`the Glossary <warm_start>`.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        The unique classes labels.

    coef_ : ndarray of shape (1, n_features) if n_classes == 2 else \
            (n_classes, n_features)
        Weights assigned to the features.

    intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,)
        Constants in decision function.

    loss_function_ : concrete LossFunction
        The function that determines the loss, or difference between the
        output of the algorithm and the target values.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_iter_ : int
        The actual number of iterations to reach the stopping criterion.
        For multiclass fits, it is the maximum over every binary fit.

    t_ : int
        Number of weight updates performed during training.
        Same as ``(n_iter_ * n_samples + 1)``.

    See Also
    --------
    sklearn.linear_model.SGDClassifier : Linear classifiers
        (SVM, logistic regression, etc.) with SGD training.

    Notes
    -----
    ``Perceptron`` is a classification algorithm which shares the same
    underlying implementation with ``SGDClassifier``. In fact,
    ``Perceptron()`` is equivalent to `SGDClassifier(loss="perceptron",
    eta0=1, learning_rate="constant", penalty=None)`.

    References
    ----------
    https://en.wikipedia.org/wiki/Perceptron and references therein.

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.linear_model import Perceptron
    >>> X, y = load_digits(return_X_y=True)
    >>> clf = Perceptron(tol=1e-3, random_state=0)
    >>> clf.fit(X, y)
    Perceptron()
    >>> clf.score(X, y)
    0.939...
    """

    # Start from a copy of the base classifier constraints, drop the two
    # parameters this class does not expose (`loss`, `average`), then
    # override the constraints of the parameters listed below.
    _parameter_constraints: dict = {**BaseSGDClassifier._parameter_constraints}
    _parameter_constraints.pop("loss")
    _parameter_constraints.pop("average")
    _parameter_constraints.update(
        {
            "penalty": [StrOptions({"l2", "l1", "elasticnet"}), None],
            "alpha": [Interval(Real, 0, None, closed="left")],
            "l1_ratio": [Interval(Real, 0, 1, closed="both")],
            "eta0": [Interval(Real, 0, None, closed="left")],
        }
    )

    def __init__(
        self,
        *,
        penalty=None,
        alpha=0.0001,
        l1_ratio=0.15,
        fit_intercept=True,
        max_iter=1000,
        tol=1e-3,
        shuffle=True,
        verbose=0,
        eta0=1.0,
        n_jobs=None,
        random_state=0,
        early_stopping=False,
        validation_fraction=0.1,
        n_iter_no_change=5,
        class_weight=None,
        warm_start=False,
    ):
        # loss, learning_rate and power_t are pinned here; every other
        # argument is forwarded unchanged to the base class.
        super().__init__(
            loss="perceptron",
            penalty=penalty,
            alpha=alpha,
            l1_ratio=l1_ratio,
            fit_intercept=fit_intercept,
            max_iter=max_iter,
            tol=tol,
            shuffle=shuffle,
            verbose=verbose,
            random_state=random_state,
            learning_rate="constant",
            eta0=eta0,
            early_stopping=early_stopping,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change,
            power_t=0.5,
            warm_start=warm_start,
            class_weight=class_weight,
            n_jobs=n_jobs,
        )
|
||||
@@ -0,0 +1,308 @@
|
||||
# Authors: David Dale <dale.david@mail.ru>
|
||||
# Christian Lorentzen <lorentzen.ch@gmail.com>
|
||||
# License: BSD 3 clause
|
||||
import warnings
|
||||
from numbers import Real
|
||||
|
||||
import numpy as np
|
||||
from scipy import sparse
|
||||
from scipy.optimize import linprog
|
||||
|
||||
from ..base import BaseEstimator, RegressorMixin, _fit_context
|
||||
from ..exceptions import ConvergenceWarning
|
||||
from ..utils import _safe_indexing
|
||||
from ..utils._param_validation import Interval, StrOptions
|
||||
from ..utils.fixes import parse_version, sp_version
|
||||
from ..utils.validation import _check_sample_weight
|
||||
from ._base import LinearModel
|
||||
|
||||
|
||||
class QuantileRegressor(LinearModel, RegressorMixin, BaseEstimator):
    """Linear regression model that predicts conditional quantiles.

    The linear :class:`QuantileRegressor` optimizes the pinball loss for a
    desired `quantile` and is robust to outliers.

    This model uses an L1 regularization like
    :class:`~sklearn.linear_model.Lasso`.

    Read more in the :ref:`User Guide <quantile_regression>`.

    .. versionadded:: 1.0

    Parameters
    ----------
    quantile : float, default=0.5
        The quantile that the model tries to predict. It must be strictly
        between 0 and 1. If 0.5 (default), the model predicts the 50%
        quantile, i.e. the median.

    alpha : float, default=1.0
        Regularization constant that multiplies the L1 penalty term.

    fit_intercept : bool, default=True
        Whether or not to fit the intercept.

    solver : {'highs-ds', 'highs-ipm', 'highs', 'interior-point', \
            'revised simplex'}, default='highs'
        Method used by :func:`scipy.optimize.linprog` to solve the linear
        programming formulation.

        From `scipy>=1.6.0`, it is recommended to use the highs methods because
        they are the fastest ones. Solvers "highs-ds", "highs-ipm" and "highs"
        support sparse input data and, in fact, always convert to sparse csc.

        From `scipy>=1.11.0`, "interior-point" is not available anymore.

        .. versionchanged:: 1.4
           The default of `solver` changed to `"highs"` in version 1.4.

    solver_options : dict, default=None
        Additional parameters passed to :func:`scipy.optimize.linprog` as
        options. If `None` and if `solver='interior-point'`, then
        `{"lstsq": True}` is passed to :func:`scipy.optimize.linprog` for the
        sake of stability.

    Attributes
    ----------
    coef_ : array of shape (n_features,)
        Estimated coefficients for the features.

    intercept_ : float
        The intercept of the model, aka bias term.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_iter_ : int
        The actual number of iterations performed by the solver.

    See Also
    --------
    Lasso : The Lasso is a linear model that estimates sparse coefficients
        with l1 regularization.
    HuberRegressor : Linear regression model that is robust to outliers.

    Examples
    --------
    >>> from sklearn.linear_model import QuantileRegressor
    >>> import numpy as np
    >>> n_samples, n_features = 10, 2
    >>> rng = np.random.RandomState(0)
    >>> y = rng.randn(n_samples)
    >>> X = rng.randn(n_samples, n_features)
    >>> # the two following lines are optional in practice
    >>> from sklearn.utils.fixes import sp_version, parse_version
    >>> solver = "highs" if sp_version >= parse_version("1.6.0") else "interior-point"
    >>> reg = QuantileRegressor(quantile=0.8, solver=solver).fit(X, y)
    >>> np.mean(y <= reg.predict(X))
    np.float64(0.8)
    """

    _parameter_constraints: dict = {
        "quantile": [Interval(Real, 0, 1, closed="neither")],
        "alpha": [Interval(Real, 0, None, closed="left")],
        "fit_intercept": ["boolean"],
        "solver": [
            StrOptions(
                {
                    "highs-ds",
                    "highs-ipm",
                    "highs",
                    "interior-point",
                    "revised simplex",
                }
            ),
        ],
        "solver_options": [dict, None],
    }

    def __init__(
        self,
        *,
        quantile=0.5,
        alpha=1.0,
        fit_intercept=True,
        solver="highs",
        solver_options=None,
    ):
        self.quantile = quantile
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.solver = solver
        self.solver_options = solver_options

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, sample_weight=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.

        y : array-like of shape (n_samples,)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If the requested solver is not available for the installed SciPy
            version, if `X` is sparse and the solver is not one of the highs
            methods, or if the linear program fails without returning any
            solution vector.
        """
        X, y = self._validate_data(
            X,
            y,
            accept_sparse=["csc", "csr", "coo"],
            y_numeric=True,
            multi_output=False,
        )
        sample_weight = _check_sample_weight(sample_weight, X)

        n_features = X.shape[1]
        n_params = n_features

        if self.fit_intercept:
            n_params += 1
            # Note that centering y and X with _preprocess_data does not work
            # for quantile regression.

        # The objective is defined as 1/n * sum(pinball loss) + alpha * L1.
        # So we rescale the penalty term, which is equivalent.
        alpha = np.sum(sample_weight) * self.alpha

        if self.solver in (
            "highs-ds",
            "highs-ipm",
            "highs",
        ) and sp_version < parse_version("1.6.0"):
            raise ValueError(
                f"Solver {self.solver} is only available "
                f"with scipy>=1.6.0, got {sp_version}"
            )
        solver = self.solver

        if solver == "interior-point" and sp_version >= parse_version("1.11.0"):
            raise ValueError(
                f"Solver {solver} is not anymore available in SciPy >= 1.11.0."
            )

        if sparse.issparse(X) and solver not in ["highs", "highs-ds", "highs-ipm"]:
            raise ValueError(
                f"Solver {self.solver} does not support sparse X. "
                "Use solver 'highs' for example."
            )
        # make default solver more stable
        if self.solver_options is None and solver == "interior-point":
            solver_options = {"lstsq": True}
        else:
            solver_options = self.solver_options

        # After rescaling alpha, the minimization problem is
        #     min sum(pinball loss) + alpha * L1
        # Use linear programming formulation of quantile regression
        #     min_x c x
        #           A_eq x = b_eq
        #                0 <= x
        # x = (s0, s, t0, t, u, v) = slack variables >= 0
        # intercept = s0 - t0
        # coef = s - t
        # c = (0, alpha * 1_p, 0, alpha * 1_p, quantile * 1_n, (1-quantile) * 1_n)
        # residual = y - X@coef - intercept = u - v
        # A_eq = (1_n, X, -1_n, -X, diag(1_n), -diag(1_n))
        # b_eq = y
        # p = n_features
        # n = n_samples
        # 1_n = vector of length n with entries equal one
        # see https://stats.stackexchange.com/questions/384909/
        #
        # Filtering out zero sample weights from the beginning makes life
        # easier for the linprog solver.
        indices = np.nonzero(sample_weight)[0]
        # Number of samples with a non-zero weight; used below in place of
        # n_samples to size the slack-variable blocks.
        n_indices = len(indices)
        if n_indices < len(sample_weight):
            sample_weight = sample_weight[indices]
            X = _safe_indexing(X, indices)
            y = _safe_indexing(y, indices)
        c = np.concatenate(
            [
                np.full(2 * n_params, fill_value=alpha),
                sample_weight * self.quantile,
                sample_weight * (1 - self.quantile),
            ]
        )
        if self.fit_intercept:
            # do not penalize the intercept
            c[0] = 0
            c[n_params] = 0

        if solver in ["highs", "highs-ds", "highs-ipm"]:
            # Note that highs methods always use a sparse CSC memory layout internally,
            # even for optimization problems parametrized using dense numpy arrays.
            # Therefore, we work with CSC matrices as early as possible to limit
            # unnecessary repeated memory copies.
            eye = sparse.eye(n_indices, dtype=X.dtype, format="csc")
            if self.fit_intercept:
                ones = sparse.csc_matrix(np.ones(shape=(n_indices, 1), dtype=X.dtype))
                A_eq = sparse.hstack([ones, X, -ones, -X, eye, -eye], format="csc")
            else:
                A_eq = sparse.hstack([X, -X, eye, -eye], format="csc")
        else:
            eye = np.eye(n_indices)
            if self.fit_intercept:
                ones = np.ones((n_indices, 1))
                A_eq = np.concatenate([ones, X, -ones, -X, eye, -eye], axis=1)
            else:
                A_eq = np.concatenate([X, -X, eye, -eye], axis=1)

        b_eq = y

        result = linprog(
            c=c,
            A_eq=A_eq,
            b_eq=b_eq,
            method=solver,
            options=solver_options,
        )
        solution = result.x
        if not result.success:
            failure = {
                1: "Iteration limit reached.",
                2: "Problem appears to be infeasible.",
                3: "Problem appears to be unbounded.",
                4: "Numerical difficulties encountered.",
            }
            # `.get` keeps the lookup table untouched for unknown statuses.
            failure_message = (
                "Linear programming for QuantileRegressor did not succeed.\n"
                f"Status is {result.status}: "
                + failure.get(result.status, "unknown reason")
                + "\n"
                + "Result message of linprog:\n"
                + result.message
            )
            if solution is None:
                # Without a solution vector no coefficients can be extracted;
                # fail with the solver diagnostics instead of an opaque
                # TypeError when slicing None below.
                raise ValueError(failure_message)
            warnings.warn(failure_message, ConvergenceWarning)

        # positive slack - negative slack
        # solution is an array with (params_pos, params_neg, u, v)
        params = solution[:n_params] - solution[n_params : 2 * n_params]

        self.n_iter_ = result.nit

        if self.fit_intercept:
            self.coef_ = params[1:]
            self.intercept_ = params[0]
        else:
            self.coef_ = params
            self.intercept_ = 0.0
        return self
|
||||
@@ -0,0 +1,729 @@
|
||||
# Author: Johannes Schönberger
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
import warnings
|
||||
from numbers import Integral, Real
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..base import (
|
||||
BaseEstimator,
|
||||
MetaEstimatorMixin,
|
||||
MultiOutputMixin,
|
||||
RegressorMixin,
|
||||
_fit_context,
|
||||
clone,
|
||||
)
|
||||
from ..exceptions import ConvergenceWarning
|
||||
from ..utils import check_consistent_length, check_random_state
|
||||
from ..utils._bunch import Bunch
|
||||
from ..utils._param_validation import (
|
||||
HasMethods,
|
||||
Interval,
|
||||
Options,
|
||||
RealNotInt,
|
||||
StrOptions,
|
||||
)
|
||||
from ..utils.metadata_routing import (
|
||||
MetadataRouter,
|
||||
MethodMapping,
|
||||
_raise_for_params,
|
||||
_routing_enabled,
|
||||
process_routing,
|
||||
)
|
||||
from ..utils.random import sample_without_replacement
|
||||
from ..utils.validation import (
|
||||
_check_method_params,
|
||||
_check_sample_weight,
|
||||
_deprecate_positional_args,
|
||||
check_is_fitted,
|
||||
has_fit_parameter,
|
||||
)
|
||||
from ._base import LinearRegression
|
||||
|
||||
# Smallest positive spacing at 1.0; used below as a floor to keep the
# logarithm arguments strictly positive.
_EPSILON = np.spacing(1)


def _dynamic_max_trials(n_inliers, n_samples, min_samples, probability):
    """Determine number trials such that at least one outlier-free subset is
    sampled for the given inlier/outlier ratio.

    Parameters
    ----------
    n_inliers : int
        Number of inliers in the data.

    n_samples : int
        Total number of samples in the data.

    min_samples : int
        Minimum number of samples chosen randomly from original data.

    probability : float
        Probability (confidence) that one outlier-free sample is generated.

    Returns
    -------
    trials : float
        Number of trials. This is ``0`` when ``1 - probability`` equals 1,
        ``float("inf")`` when ``1 - inlier_ratio**min_samples`` equals 1, and
        otherwise the ceiling of ``log(1 - probability) /
        log(1 - inlier_ratio**min_samples)`` as a non-negative float.
    """
    inlier_ratio = n_inliers / float(n_samples)
    # Both terms are floored at _EPSILON so np.log never receives 0.
    nom = max(_EPSILON, 1 - probability)
    denom = max(_EPSILON, 1 - inlier_ratio**min_samples)
    if nom == 1:
        # log(1) == 0 in the numerator: no trial is required.
        return 0
    if denom == 1:
        # log(1) == 0 in the denominator: the ratio would be unbounded.
        return float("inf")
    return abs(float(np.ceil(np.log(nom) / np.log(denom))))
|
||||
|
||||
|
||||
class RANSACRegressor(
    MetaEstimatorMixin,
    RegressorMixin,
    MultiOutputMixin,
    BaseEstimator,
):
    """RANSAC (RANdom SAmple Consensus) algorithm.

    RANSAC is an iterative algorithm for the robust estimation of parameters
    from a subset of inliers from the complete data set.

    Read more in the :ref:`User Guide <ransac_regression>`.

    Parameters
    ----------
    estimator : object, default=None
        Base estimator object which implements the following methods:

        * `fit(X, y)`: Fit model to given training data and target values.
        * `score(X, y)`: Returns the score of the model on the given data,
          which is used for the stop criterion defined by `stop_score`.
          Additionally, the score is used to decide which of two equally
          large consensus sets is chosen as the better one.
        * `predict(X)`: Returns predicted values using the linear model,
          which is used to compute residual error using loss function.

        If `estimator` is None, then
        :class:`~sklearn.linear_model.LinearRegression` is used for
        target values of dtype float.

        Note that the current implementation only supports regression
        estimators.

    min_samples : int (>= 1) or float ([0, 1]), default=None
        Minimum number of samples chosen randomly from original data. Treated
        as an absolute number of samples for `min_samples >= 1`, treated as a
        relative number `ceil(min_samples * X.shape[0])` for
        `min_samples < 1`. This is typically chosen as the minimal number of
        samples necessary to estimate the given `estimator`. By default a
        :class:`~sklearn.linear_model.LinearRegression` estimator is assumed and
        `min_samples` is chosen as ``X.shape[1] + 1``. This parameter is highly
        dependent upon the model, so if a `estimator` other than
        :class:`~sklearn.linear_model.LinearRegression` is used, the user must
        provide a value. A value of exactly 0 passes parameter validation but
        is rejected by `fit` with a `ValueError`.

    residual_threshold : float, default=None
        Maximum residual for a data sample to be classified as an inlier.
        By default the threshold is chosen as the MAD (median absolute
        deviation) of the target values `y`. Points whose residuals are
        strictly equal to the threshold are considered as inliers.

    is_data_valid : callable, default=None
        This function is called with the randomly selected data before the
        model is fitted to it: `is_data_valid(X, y)`. If its return value is
        False the current randomly chosen sub-sample is skipped.

    is_model_valid : callable, default=None
        This function is called with the estimated model and the randomly
        selected data: `is_model_valid(model, X, y)`. If its return value is
        False the current randomly chosen sub-sample is skipped.
        Rejecting samples with this function is computationally costlier than
        with `is_data_valid`. `is_model_valid` should therefore only be used if
        the estimated model is needed for making the rejection decision.

    max_trials : int, default=100
        Maximum number of iterations for random sample selection.

    max_skips : int, default=np.inf
        Maximum number of iterations that can be skipped due to finding zero
        inliers or invalid data defined by ``is_data_valid`` or invalid models
        defined by ``is_model_valid``.

        .. versionadded:: 0.19

    stop_n_inliers : int, default=np.inf
        Stop iteration if at least this number of inliers are found.

    stop_score : float, default=np.inf
        Stop iteration if score is greater equal than this threshold.

    stop_probability : float in range [0, 1], default=0.99
        RANSAC iteration stops if at least one outlier-free set of the training
        data is sampled in RANSAC. This requires to generate at least N
        samples (iterations)::

            N >= log(1 - probability) / log(1 - e**m)

        where the probability (confidence) is typically set to high value such
        as 0.99 (the default), e is the current fraction of inliers w.r.t.
        the total number of samples and m is the number of samples drawn per
        trial (`min_samples`).

    loss : str, callable, default='absolute_error'
        String inputs, 'absolute_error' and 'squared_error' are supported which
        find the absolute error and squared error per sample respectively.

        If ``loss`` is a callable, then it should be a function that takes
        two arrays as inputs, the true and predicted value and returns a 1-D
        array with the i-th value of the array corresponding to the loss
        on ``X[i]``.

        If the loss on a sample is greater than the ``residual_threshold``,
        then this sample is classified as an outlier.

        .. versionadded:: 0.18

    random_state : int, RandomState instance, default=None
        The generator used to draw the random sub-samples. It is also
        forwarded to the estimator when the estimator accepts a
        `random_state` parameter.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    estimator_ : object
        Best fitted model (copy of the `estimator` object).

    n_trials_ : int
        Number of random selection trials until one of the stop criteria is
        met. It is always ``<= max_trials``.

    inlier_mask_ : bool array of shape [n_samples]
        Boolean mask of inliers classified as ``True``.

    n_skips_no_inliers_ : int
        Number of iterations skipped due to finding zero inliers.

        .. versionadded:: 0.19

    n_skips_invalid_data_ : int
        Number of iterations skipped due to invalid data defined by
        ``is_data_valid``.

        .. versionadded:: 0.19

    n_skips_invalid_model_ : int
        Number of iterations skipped due to an invalid model defined by
        ``is_model_valid``.

        .. versionadded:: 0.19

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    HuberRegressor : Linear regression model that is robust to outliers.
    TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model.
    SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD.

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/RANSAC
    .. [2] https://www.sri.com/wp-content/uploads/2021/12/ransac-publication.pdf
    .. [3] http://www.bmva.org/bmvc/2009/Papers/Paper355/Paper355.pdf

    Examples
    --------
    >>> from sklearn.linear_model import RANSACRegressor
    >>> from sklearn.datasets import make_regression
    >>> X, y = make_regression(
    ...     n_samples=200, n_features=2, noise=4.0, random_state=0)
    >>> reg = RANSACRegressor(random_state=0).fit(X, y)
    >>> reg.score(X, y)
    0.9885...
    >>> reg.predict(X[:1,])
    array([-31.9417...])

    For a more detailed example, see
    :ref:`sphx_glr_auto_examples_linear_model_plot_ransac.py`
    """  # noqa: E501

    _parameter_constraints: dict = {
        "estimator": [HasMethods(["fit", "score", "predict"]), None],
        "min_samples": [
            Interval(Integral, 1, None, closed="left"),
            Interval(RealNotInt, 0, 1, closed="both"),
            None,
        ],
        "residual_threshold": [Interval(Real, 0, None, closed="left"), None],
        "is_data_valid": [callable, None],
        "is_model_valid": [callable, None],
        "max_trials": [
            Interval(Integral, 0, None, closed="left"),
            Options(Real, {np.inf}),
        ],
        "max_skips": [
            Interval(Integral, 0, None, closed="left"),
            Options(Real, {np.inf}),
        ],
        "stop_n_inliers": [
            Interval(Integral, 0, None, closed="left"),
            Options(Real, {np.inf}),
        ],
        "stop_score": [Interval(Real, None, None, closed="both")],
        "stop_probability": [Interval(Real, 0, 1, closed="both")],
        "loss": [StrOptions({"absolute_error", "squared_error"}), callable],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        estimator=None,
        *,
        min_samples=None,
        residual_threshold=None,
        is_data_valid=None,
        is_model_valid=None,
        max_trials=100,
        max_skips=np.inf,
        stop_n_inliers=np.inf,
        stop_score=np.inf,
        stop_probability=0.99,
        loss="absolute_error",
        random_state=None,
    ):
        # Parameters are stored untouched; all resolution happens in `fit`.
        self.estimator = estimator
        self.min_samples = min_samples
        self.residual_threshold = residual_threshold
        self.is_data_valid = is_data_valid
        self.is_model_valid = is_model_valid
        self.max_trials = max_trials
        self.max_skips = max_skips
        self.stop_n_inliers = stop_n_inliers
        self.stop_score = stop_score
        self.stop_probability = stop_probability
        self.random_state = random_state
        self.loss = loss

    @_fit_context(
        # RansacRegressor.estimator is not validated yet
        prefer_skip_nested_validation=False
    )
    # TODO(1.7): remove `sample_weight` from the signature after deprecation
    # cycle; for backwards compatibility: pop it from `fit_params` before the
    # `_raise_for_params` check and reinsert it after the check
    @_deprecate_positional_args(version="1.7")
    def fit(self, X, y, *, sample_weight=None, **fit_params):
        """Fit estimator using RANSAC algorithm.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.

        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Individual weights for each sample
            raises error if sample_weight is passed and estimator
            fit method does not support it.

            .. versionadded:: 0.18

        **fit_params : dict
            Parameters routed to the `fit` method of the sub-estimator via the
            metadata routing API.

            .. versionadded:: 1.5

                Only available if
                `sklearn.set_config(enable_metadata_routing=True)` is set. See
                :ref:`Metadata Routing User Guide <metadata_routing>` for more
                details.

        Returns
        -------
        self : object
            Fitted `RANSACRegressor` estimator.

        Raises
        ------
        ValueError
            If no valid consensus set could be found. This occurs if
            `is_data_valid` and `is_model_valid` return False for all
            `max_trials` randomly chosen sub-samples.

            Also raised if `min_samples` is None while `estimator` is not a
            `LinearRegression`, if `min_samples` is 0 or resolves to more
            than the number of samples, or if `sample_weight` is given but
            the estimator's `fit` does not accept it.
        """
        # Need to validate separately here. We can't pass multi_output=True
        # because that would allow y to be csr. Delay expensive finiteness
        # check to the estimator's own input validation.
        _raise_for_params(fit_params, self, "fit")
        check_X_params = dict(accept_sparse="csr", force_all_finite=False)
        check_y_params = dict(ensure_2d=False)
        X, y = self._validate_data(
            X, y, validate_separately=(check_X_params, check_y_params)
        )
        check_consistent_length(X, y)

        if self.estimator is not None:
            estimator = clone(self.estimator)
        else:
            estimator = LinearRegression()

        if self.min_samples is None:
            if not isinstance(estimator, LinearRegression):
                raise ValueError(
                    "`min_samples` needs to be explicitly set when estimator "
                    "is not a LinearRegression."
                )
            min_samples = X.shape[1] + 1
        elif 0 < self.min_samples < 1:
            min_samples = np.ceil(self.min_samples * X.shape[0])
        elif self.min_samples >= 1:
            min_samples = self.min_samples
        else:
            # `min_samples == 0.0` is accepted by `_parameter_constraints` but
            # matches neither branch above; fail explicitly instead of hitting
            # an unbound local variable on the next line.
            raise ValueError(
                "`min_samples` must be strictly positive, got %r."
                % (self.min_samples,)
            )
        if min_samples > X.shape[0]:
            raise ValueError(
                "`min_samples` may not be larger than number "
                "of samples: n_samples = %d." % (X.shape[0])
            )

        if self.residual_threshold is None:
            # MAD (median absolute deviation)
            residual_threshold = np.median(np.abs(y - np.median(y)))
        else:
            residual_threshold = self.residual_threshold

        # For 2-D targets the per-output losses are summed so that the loss
        # function always yields one value per sample.
        if self.loss == "absolute_error":
            if y.ndim == 1:
                loss_function = lambda y_true, y_pred: np.abs(y_true - y_pred)
            else:
                loss_function = lambda y_true, y_pred: np.sum(
                    np.abs(y_true - y_pred), axis=1
                )
        elif self.loss == "squared_error":
            if y.ndim == 1:
                loss_function = lambda y_true, y_pred: (y_true - y_pred) ** 2
            else:
                loss_function = lambda y_true, y_pred: np.sum(
                    (y_true - y_pred) ** 2, axis=1
                )

        elif callable(self.loss):
            loss_function = self.loss

        random_state = check_random_state(self.random_state)

        try:  # Not all estimator accept a random_state
            estimator.set_params(random_state=random_state)
        except ValueError:
            pass

        estimator_fit_has_sample_weight = has_fit_parameter(estimator, "sample_weight")
        estimator_name = type(estimator).__name__
        if sample_weight is not None and not estimator_fit_has_sample_weight:
            raise ValueError(
                "%s does not support sample_weight. Sample"
                " weights are only used for the calibration"
                " itself." % estimator_name
            )

        if sample_weight is not None:
            fit_params["sample_weight"] = sample_weight

        if _routing_enabled():
            routed_params = process_routing(self, "fit", **fit_params)
        else:
            # Without metadata routing only `sample_weight` is forwarded, and
            # only to the sub-estimator's `fit`.
            routed_params = Bunch()
            routed_params.estimator = Bunch(fit={}, predict={}, score={})
            if sample_weight is not None:
                sample_weight = _check_sample_weight(sample_weight, X)
                routed_params.estimator.fit = {"sample_weight": sample_weight}

        # Starting at 1 means a trial that finds zero inliers is always
        # skipped by the `n_inliers_subset < n_inliers_best` test below.
        n_inliers_best = 1
        score_best = -np.inf
        inlier_mask_best = None
        X_inlier_best = None
        y_inlier_best = None
        inlier_best_idxs_subset = None
        self.n_skips_no_inliers_ = 0
        self.n_skips_invalid_data_ = 0
        self.n_skips_invalid_model_ = 0

        # number of data samples
        n_samples = X.shape[0]
        sample_idxs = np.arange(n_samples)

        self.n_trials_ = 0
        max_trials = self.max_trials
        while self.n_trials_ < max_trials:
            self.n_trials_ += 1

            if (
                self.n_skips_no_inliers_
                + self.n_skips_invalid_data_
                + self.n_skips_invalid_model_
            ) > self.max_skips:
                break

            # choose random sample set
            subset_idxs = sample_without_replacement(
                n_samples, min_samples, random_state=random_state
            )
            X_subset = X[subset_idxs]
            y_subset = y[subset_idxs]

            # check if random sample set is valid
            if self.is_data_valid is not None and not self.is_data_valid(
                X_subset, y_subset
            ):
                self.n_skips_invalid_data_ += 1
                continue

            # cut `fit_params` down to `subset_idxs`
            fit_params_subset = _check_method_params(
                X, params=routed_params.estimator.fit, indices=subset_idxs
            )

            # fit model for current random sample set
            estimator.fit(X_subset, y_subset, **fit_params_subset)

            # check if estimated model is valid
            if self.is_model_valid is not None and not self.is_model_valid(
                estimator, X_subset, y_subset
            ):
                self.n_skips_invalid_model_ += 1
                continue

            # residuals of all data for current random sample model
            y_pred = estimator.predict(X)
            residuals_subset = loss_function(y, y_pred)

            # classify data into inliers and outliers
            inlier_mask_subset = residuals_subset <= residual_threshold
            n_inliers_subset = np.sum(inlier_mask_subset)

            # less inliers -> skip current random sample
            if n_inliers_subset < n_inliers_best:
                self.n_skips_no_inliers_ += 1
                continue

            # extract inlier data set
            inlier_idxs_subset = sample_idxs[inlier_mask_subset]
            X_inlier_subset = X[inlier_idxs_subset]
            y_inlier_subset = y[inlier_idxs_subset]

            # cut the routed `score` params down to `inlier_idxs_subset`
            score_params_inlier_subset = _check_method_params(
                X, params=routed_params.estimator.score, indices=inlier_idxs_subset
            )

            # score of inlier data set
            score_subset = estimator.score(
                X_inlier_subset,
                y_inlier_subset,
                **score_params_inlier_subset,
            )

            # same number of inliers but worse score -> skip current random
            # sample
            if n_inliers_subset == n_inliers_best and score_subset < score_best:
                continue

            # save current random sample as best sample
            n_inliers_best = n_inliers_subset
            score_best = score_subset
            inlier_mask_best = inlier_mask_subset
            X_inlier_best = X_inlier_subset
            y_inlier_best = y_inlier_subset
            inlier_best_idxs_subset = inlier_idxs_subset

            # A better consensus set can only lower the number of trials
            # needed for the requested `stop_probability`.
            max_trials = min(
                max_trials,
                _dynamic_max_trials(
                    n_inliers_best, n_samples, min_samples, self.stop_probability
                ),
            )

            # break if sufficient number of inliers or score is reached
            if n_inliers_best >= self.stop_n_inliers or score_best >= self.stop_score:
                break

        # if none of the iterations met the required criteria
        if inlier_mask_best is None:
            if (
                self.n_skips_no_inliers_
                + self.n_skips_invalid_data_
                + self.n_skips_invalid_model_
            ) > self.max_skips:
                raise ValueError(
                    "RANSAC skipped more iterations than `max_skips` without"
                    " finding a valid consensus set. Iterations were skipped"
                    " because each randomly chosen sub-sample failed the"
                    " passing criteria. See estimator attributes for"
                    " diagnostics (n_skips*)."
                )
            else:
                raise ValueError(
                    "RANSAC could not find a valid consensus set. All"
                    " `max_trials` iterations were skipped because each"
                    " randomly chosen sub-sample failed the passing criteria."
                    " See estimator attributes for diagnostics (n_skips*)."
                )
        else:
            if (
                self.n_skips_no_inliers_
                + self.n_skips_invalid_data_
                + self.n_skips_invalid_model_
            ) > self.max_skips:
                warnings.warn(
                    (
                        "RANSAC found a valid consensus set but exited"
                        " early due to skipping more iterations than"
                        " `max_skips`. See estimator attributes for"
                        " diagnostics (n_skips*)."
                    ),
                    ConvergenceWarning,
                )

        # estimate final model using all inliers
        fit_params_best_idxs_subset = _check_method_params(
            X, params=routed_params.estimator.fit, indices=inlier_best_idxs_subset
        )

        estimator.fit(X_inlier_best, y_inlier_best, **fit_params_best_idxs_subset)

        self.estimator_ = estimator
        self.inlier_mask_ = inlier_mask_best
        return self

    def predict(self, X, **params):
        """Predict using the estimated model.

        This is a wrapper for `estimator_.predict(X)`.

        Parameters
        ----------
        X : {array-like or sparse matrix} of shape (n_samples, n_features)
            Input data.

        **params : dict
            Parameters routed to the `predict` method of the sub-estimator via
            the metadata routing API.

            .. versionadded:: 1.5

                Only available if
                `sklearn.set_config(enable_metadata_routing=True)` is set. See
                :ref:`Metadata Routing User Guide <metadata_routing>` for more
                details.

        Returns
        -------
        y : array, shape = [n_samples] or [n_samples, n_targets]
            Returns predicted values.
        """
        check_is_fitted(self)
        X = self._validate_data(
            X,
            force_all_finite=False,
            accept_sparse=True,
            reset=False,
        )

        _raise_for_params(params, self, "predict")

        if _routing_enabled():
            predict_params = process_routing(self, "predict", **params).estimator[
                "predict"
            ]
        else:
            predict_params = {}

        return self.estimator_.predict(X, **predict_params)

    def score(self, X, y, **params):
        """Return the score of the prediction.

        This is a wrapper for `estimator_.score(X, y)`.

        Parameters
        ----------
        X : {array-like or sparse matrix} of shape (n_samples, n_features)
            Training data.

        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            Target values.

        **params : dict
            Parameters routed to the `score` method of the sub-estimator via
            the metadata routing API.

            .. versionadded:: 1.5

                Only available if
                `sklearn.set_config(enable_metadata_routing=True)` is set. See
                :ref:`Metadata Routing User Guide <metadata_routing>` for more
                details.

        Returns
        -------
        z : float
            Score of the prediction.
        """
        check_is_fitted(self)
        X = self._validate_data(
            X,
            force_all_finite=False,
            accept_sparse=True,
            reset=False,
        )

        _raise_for_params(params, self, "score")
        if _routing_enabled():
            score_params = process_routing(self, "score", **params).estimator["score"]
        else:
            score_params = {}

        return self.estimator_.score(X, y, **score_params)

    def get_metadata_routing(self):
        """Get metadata routing of this object.

        Please check :ref:`User Guide <metadata_routing>` on how the routing
        mechanism works.

        .. versionadded:: 1.5

        Returns
        -------
        routing : MetadataRouter
            A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating
            routing information.
        """
        # `fit` calls both the sub-estimator's `fit` and `score`, hence the
        # two mappings with `caller="fit"`.
        router = MetadataRouter(owner=self.__class__.__name__).add(
            estimator=self.estimator,
            method_mapping=MethodMapping()
            .add(caller="fit", callee="fit")
            .add(caller="fit", callee="score")
            .add(caller="score", callee="score")
            .add(caller="predict", callee="predict"),
        )
        return router

    def _more_tags(self):
        return {
            "_xfail_checks": {
                "check_sample_weights_invariance": (
                    "zero sample_weight is not equivalent to removing samples"
                ),
            }
        }
if self.estimator is not None:
|
||||
estimator = clone(self.estimator)
|
||||
else:
|
||||
estimator = LinearRegression()
|
||||
|
||||
if self.min_samples is None:
|
||||
if not isinstance(estimator, LinearRegression):
|
||||
raise ValueError(
|
||||
"`min_samples` needs to be explicitly set when estimator "
|
||||
"is not a LinearRegression."
|
||||
)
|
||||
min_samples = X.shape[1] + 1
|
||||
elif 0 < self.min_samples < 1:
|
||||
min_samples = np.ceil(self.min_samples * X.shape[0])
|
||||
elif self.min_samples >= 1:
|
||||
min_samples = self.min_samples
|
||||
if min_samples > X.shape[0]:
|
||||
raise ValueError(
|
||||
"`min_samples` may not be larger than number "
|
||||
"of samples: n_samples = %d." % (X.shape[0])
|
||||
)
|
||||
|
||||
if self.residual_threshold is None:
|
||||
# MAD (median absolute deviation)
|
||||
residual_threshold = np.median(np.abs(y - np.median(y)))
|
||||
else:
|
||||
residual_threshold = self.residual_threshold
|
||||
|
||||
if self.loss == "absolute_error":
|
||||
if y.ndim == 1:
|
||||
loss_function = lambda y_true, y_pred: np.abs(y_true - y_pred)
|
||||
else:
|
||||
loss_function = lambda y_true, y_pred: np.sum(
|
||||
np.abs(y_true - y_pred), axis=1
|
||||
)
|
||||
elif self.loss == "squared_error":
|
||||
if y.ndim == 1:
|
||||
loss_function = lambda y_true, y_pred: (y_true - y_pred) ** 2
|
||||
else:
|
||||
loss_function = lambda y_true, y_pred: np.sum(
|
||||
(y_true - y_pred) ** 2, axis=1
|
||||
)
|
||||
|
||||
elif callable(self.loss):
|
||||
loss_function = self.loss
|
||||
|
||||
random_state = check_random_state(self.random_state)
|
||||
|
||||
try: # Not all estimator accept a random_state
|
||||
estimator.set_params(random_state=random_state)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
estimator_fit_has_sample_weight = has_fit_parameter(estimator, "sample_weight")
|
||||
estimator_name = type(estimator).__name__
|
||||
if sample_weight is not None and not estimator_fit_has_sample_weight:
|
||||
raise ValueError(
|
||||
"%s does not support sample_weight. Sample"
|
||||
" weights are only used for the calibration"
|
||||
" itself." % estimator_name
|
||||
)
|
||||
|
||||
if sample_weight is not None:
|
||||
fit_params["sample_weight"] = sample_weight
|
||||
|
||||
if _routing_enabled():
|
||||
routed_params = process_routing(self, "fit", **fit_params)
|
||||
else:
|
||||
routed_params = Bunch()
|
||||
routed_params.estimator = Bunch(fit={}, predict={}, score={})
|
||||
if sample_weight is not None:
|
||||
sample_weight = _check_sample_weight(sample_weight, X)
|
||||
routed_params.estimator.fit = {"sample_weight": sample_weight}
|
||||
|
||||
n_inliers_best = 1
|
||||
score_best = -np.inf
|
||||
inlier_mask_best = None
|
||||
X_inlier_best = None
|
||||
y_inlier_best = None
|
||||
inlier_best_idxs_subset = None
|
||||
self.n_skips_no_inliers_ = 0
|
||||
self.n_skips_invalid_data_ = 0
|
||||
self.n_skips_invalid_model_ = 0
|
||||
|
||||
# number of data samples
|
||||
n_samples = X.shape[0]
|
||||
sample_idxs = np.arange(n_samples)
|
||||
|
||||
self.n_trials_ = 0
|
||||
max_trials = self.max_trials
|
||||
while self.n_trials_ < max_trials:
|
||||
self.n_trials_ += 1
|
||||
|
||||
if (
|
||||
self.n_skips_no_inliers_
|
||||
+ self.n_skips_invalid_data_
|
||||
+ self.n_skips_invalid_model_
|
||||
) > self.max_skips:
|
||||
break
|
||||
|
||||
# choose random sample set
|
||||
subset_idxs = sample_without_replacement(
|
||||
n_samples, min_samples, random_state=random_state
|
||||
)
|
||||
X_subset = X[subset_idxs]
|
||||
y_subset = y[subset_idxs]
|
||||
|
||||
# check if random sample set is valid
|
||||
if self.is_data_valid is not None and not self.is_data_valid(
|
||||
X_subset, y_subset
|
||||
):
|
||||
self.n_skips_invalid_data_ += 1
|
||||
continue
|
||||
|
||||
# cut `fit_params` down to `subset_idxs`
|
||||
fit_params_subset = _check_method_params(
|
||||
X, params=routed_params.estimator.fit, indices=subset_idxs
|
||||
)
|
||||
|
||||
# fit model for current random sample set
|
||||
estimator.fit(X_subset, y_subset, **fit_params_subset)
|
||||
|
||||
# check if estimated model is valid
|
||||
if self.is_model_valid is not None and not self.is_model_valid(
|
||||
estimator, X_subset, y_subset
|
||||
):
|
||||
self.n_skips_invalid_model_ += 1
|
||||
continue
|
||||
|
||||
# residuals of all data for current random sample model
|
||||
y_pred = estimator.predict(X)
|
||||
residuals_subset = loss_function(y, y_pred)
|
||||
|
||||
# classify data into inliers and outliers
|
||||
inlier_mask_subset = residuals_subset <= residual_threshold
|
||||
n_inliers_subset = np.sum(inlier_mask_subset)
|
||||
|
||||
# less inliers -> skip current random sample
|
||||
if n_inliers_subset < n_inliers_best:
|
||||
self.n_skips_no_inliers_ += 1
|
||||
continue
|
||||
|
||||
# extract inlier data set
|
||||
inlier_idxs_subset = sample_idxs[inlier_mask_subset]
|
||||
X_inlier_subset = X[inlier_idxs_subset]
|
||||
y_inlier_subset = y[inlier_idxs_subset]
|
||||
|
||||
# cut `fit_params` down to `inlier_idxs_subset`
|
||||
score_params_inlier_subset = _check_method_params(
|
||||
X, params=routed_params.estimator.score, indices=inlier_idxs_subset
|
||||
)
|
||||
|
||||
# score of inlier data set
|
||||
score_subset = estimator.score(
|
||||
X_inlier_subset,
|
||||
y_inlier_subset,
|
||||
**score_params_inlier_subset,
|
||||
)
|
||||
|
||||
# same number of inliers but worse score -> skip current random
|
||||
# sample
|
||||
if n_inliers_subset == n_inliers_best and score_subset < score_best:
|
||||
continue
|
||||
|
||||
# save current random sample as best sample
|
||||
n_inliers_best = n_inliers_subset
|
||||
score_best = score_subset
|
||||
inlier_mask_best = inlier_mask_subset
|
||||
X_inlier_best = X_inlier_subset
|
||||
y_inlier_best = y_inlier_subset
|
||||
inlier_best_idxs_subset = inlier_idxs_subset
|
||||
|
||||
max_trials = min(
|
||||
max_trials,
|
||||
_dynamic_max_trials(
|
||||
n_inliers_best, n_samples, min_samples, self.stop_probability
|
||||
),
|
||||
)
|
||||
|
||||
# break if sufficient number of inliers or score is reached
|
||||
if n_inliers_best >= self.stop_n_inliers or score_best >= self.stop_score:
|
||||
break
|
||||
|
||||
# if none of the iterations met the required criteria
|
||||
if inlier_mask_best is None:
|
||||
if (
|
||||
self.n_skips_no_inliers_
|
||||
+ self.n_skips_invalid_data_
|
||||
+ self.n_skips_invalid_model_
|
||||
) > self.max_skips:
|
||||
raise ValueError(
|
||||
"RANSAC skipped more iterations than `max_skips` without"
|
||||
" finding a valid consensus set. Iterations were skipped"
|
||||
" because each randomly chosen sub-sample failed the"
|
||||
" passing criteria. See estimator attributes for"
|
||||
" diagnostics (n_skips*)."
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
"RANSAC could not find a valid consensus set. All"
|
||||
" `max_trials` iterations were skipped because each"
|
||||
" randomly chosen sub-sample failed the passing criteria."
|
||||
" See estimator attributes for diagnostics (n_skips*)."
|
||||
)
|
||||
else:
|
||||
if (
|
||||
self.n_skips_no_inliers_
|
||||
+ self.n_skips_invalid_data_
|
||||
+ self.n_skips_invalid_model_
|
||||
) > self.max_skips:
|
||||
warnings.warn(
|
||||
(
|
||||
"RANSAC found a valid consensus set but exited"
|
||||
" early due to skipping more iterations than"
|
||||
" `max_skips`. See estimator attributes for"
|
||||
" diagnostics (n_skips*)."
|
||||
),
|
||||
ConvergenceWarning,
|
||||
)
|
||||
|
||||
# estimate final model using all inliers
|
||||
fit_params_best_idxs_subset = _check_method_params(
|
||||
X, params=routed_params.estimator.fit, indices=inlier_best_idxs_subset
|
||||
)
|
||||
|
||||
estimator.fit(X_inlier_best, y_inlier_best, **fit_params_best_idxs_subset)
|
||||
|
||||
self.estimator_ = estimator
|
||||
self.inlier_mask_ = inlier_mask_best
|
||||
return self
|
||||
|
||||
def predict(self, X, **params):
|
||||
"""Predict using the estimated model.
|
||||
|
||||
This is a wrapper for `estimator_.predict(X)`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : {array-like or sparse matrix} of shape (n_samples, n_features)
|
||||
Input data.
|
||||
|
||||
**params : dict
|
||||
Parameters routed to the `predict` method of the sub-estimator via
|
||||
the metadata routing API.
|
||||
|
||||
.. versionadded:: 1.5
|
||||
|
||||
Only available if
|
||||
`sklearn.set_config(enable_metadata_routing=True)` is set. See
|
||||
:ref:`Metadata Routing User Guide <metadata_routing>` for more
|
||||
details.
|
||||
|
||||
Returns
|
||||
-------
|
||||
y : array, shape = [n_samples] or [n_samples, n_targets]
|
||||
Returns predicted values.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
X = self._validate_data(
|
||||
X,
|
||||
force_all_finite=False,
|
||||
accept_sparse=True,
|
||||
reset=False,
|
||||
)
|
||||
|
||||
_raise_for_params(params, self, "predict")
|
||||
|
||||
if _routing_enabled():
|
||||
predict_params = process_routing(self, "predict", **params).estimator[
|
||||
"predict"
|
||||
]
|
||||
else:
|
||||
predict_params = {}
|
||||
|
||||
return self.estimator_.predict(X, **predict_params)
|
||||
|
||||
def score(self, X, y, **params):
|
||||
"""Return the score of the prediction.
|
||||
|
||||
This is a wrapper for `estimator_.score(X, y)`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : (array-like or sparse matrix} of shape (n_samples, n_features)
|
||||
Training data.
|
||||
|
||||
y : array-like of shape (n_samples,) or (n_samples, n_targets)
|
||||
Target values.
|
||||
|
||||
**params : dict
|
||||
Parameters routed to the `score` method of the sub-estimator via
|
||||
the metadata routing API.
|
||||
|
||||
.. versionadded:: 1.5
|
||||
|
||||
Only available if
|
||||
`sklearn.set_config(enable_metadata_routing=True)` is set. See
|
||||
:ref:`Metadata Routing User Guide <metadata_routing>` for more
|
||||
details.
|
||||
|
||||
Returns
|
||||
-------
|
||||
z : float
|
||||
Score of the prediction.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
X = self._validate_data(
|
||||
X,
|
||||
force_all_finite=False,
|
||||
accept_sparse=True,
|
||||
reset=False,
|
||||
)
|
||||
|
||||
_raise_for_params(params, self, "score")
|
||||
if _routing_enabled():
|
||||
score_params = process_routing(self, "score", **params).estimator["score"]
|
||||
else:
|
||||
score_params = {}
|
||||
|
||||
return self.estimator_.score(X, y, **score_params)
|
||||
|
||||
def get_metadata_routing(self):
|
||||
"""Get metadata routing of this object.
|
||||
|
||||
Please check :ref:`User Guide <metadata_routing>` on how the routing
|
||||
mechanism works.
|
||||
|
||||
.. versionadded:: 1.5
|
||||
|
||||
Returns
|
||||
-------
|
||||
routing : MetadataRouter
|
||||
A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating
|
||||
routing information.
|
||||
"""
|
||||
router = MetadataRouter(owner=self.__class__.__name__).add(
|
||||
estimator=self.estimator,
|
||||
method_mapping=MethodMapping()
|
||||
.add(caller="fit", callee="fit")
|
||||
.add(caller="fit", callee="score")
|
||||
.add(caller="score", callee="score")
|
||||
.add(caller="predict", callee="predict"),
|
||||
)
|
||||
return router
|
||||
|
||||
def _more_tags(self):
|
||||
return {
|
||||
"_xfail_checks": {
|
||||
"check_sample_weights_invariance": (
|
||||
"zero sample_weight is not equivalent to removing samples"
|
||||
),
|
||||
}
|
||||
}
|
||||
2893
.venv/lib/python3.12/site-packages/sklearn/linear_model/_ridge.py
Normal file
2893
.venv/lib/python3.12/site-packages/sklearn/linear_model/_ridge.py
Normal file
File diff suppressed because it is too large
Load Diff
371
.venv/lib/python3.12/site-packages/sklearn/linear_model/_sag.py
Normal file
371
.venv/lib/python3.12/site-packages/sklearn/linear_model/_sag.py
Normal file
@@ -0,0 +1,371 @@
|
||||
"""Solvers for Ridge and LogisticRegression using SAG algorithm"""
|
||||
|
||||
# Authors: Tom Dupre la Tour <tom.dupre-la-tour@m4x.org>
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..exceptions import ConvergenceWarning
|
||||
from ..utils import check_array
|
||||
from ..utils.extmath import row_norms
|
||||
from ..utils.validation import _check_sample_weight
|
||||
from ._base import make_dataset
|
||||
from ._sag_fast import sag32, sag64
|
||||
|
||||
|
||||
def get_auto_step_size(
    max_squared_sum, alpha_scaled, loss, fit_intercept, n_samples=None, is_saga=False
):
    """Compute automatic step size for SAG solver.

    For SAG the step size is set to ``1 / L``, where ``L`` is
    ``max_squared_sum + fit_intercept + alpha_scaled`` for the squared loss
    and ``0.25 * (max_squared_sum + fit_intercept) + alpha_scaled`` for the
    log and multinomial losses. For SAGA the step size is
    ``1 / (2 * L + min(2 * n_samples * alpha_scaled, L))``.

    Parameters
    ----------
    max_squared_sum : float
        Maximum squared sum of X over samples.

    alpha_scaled : float
        Constant that multiplies the regularization term, scaled by
        1. / n_samples, the number of samples.

    loss : {'log', 'squared', 'multinomial'}
        The loss function used in SAG solver.

    fit_intercept : bool
        Specifies if a constant (a.k.a. bias or intercept) will be
        added to the decision function.

    n_samples : int, default=None
        Number of rows in X. Required if is_saga=True, ignored otherwise.

    is_saga : bool, default=False
        Whether to return step size for the SAGA algorithm or the SAG
        algorithm.

    Returns
    -------
    step_size : float
        Step size used in SAG solver.

    Raises
    ------
    ValueError
        If `loss` is not one of the supported losses, or if `is_saga` is
        True and `n_samples` is None.

    References
    ----------
    Schmidt, M., Roux, N. L., & Bach, F. (2013).
    Minimizing finite sums with the stochastic average gradient
    https://hal.inria.fr/hal-00860051/document

    :arxiv:`Defazio, A., Bach F. & Lacoste-Julien S. (2014).
    "SAGA: A Fast Incremental Gradient Method With Support
    for Non-Strongly Convex Composite Objectives" <1407.0202>`
    """
    if loss in ("log", "multinomial"):
        L = 0.25 * (max_squared_sum + int(fit_intercept)) + alpha_scaled
    elif loss == "squared":
        # Lipschitz constant for squared loss
        L = max_squared_sum + int(fit_intercept) + alpha_scaled
    else:
        raise ValueError(
            "Unknown loss function for SAG solver, got %s instead of 'log', "
            "'squared' or 'multinomial'" % (loss,)
        )
    if is_saga:
        if n_samples is None:
            # Without this guard the product below fails with an opaque
            # TypeError on `int * None`.
            raise ValueError("n_samples must be provided when is_saga=True.")
        # SAGA theoretical step size is 1/3L or 1 / (2 * (L + mu n))
        # See Defazio et al. 2014
        mun = min(2 * n_samples * alpha_scaled, L)
        step = 1.0 / (2 * L + mun)
    else:
        # SAG theoretical step size is 1/16L but it is recommended to use 1 / L
        # see http://www.birs.ca//workshops//2014/14w5003/files/schmidt.pdf,
        # slide 65
        step = 1.0 / L
    return step
|
||||
|
||||
|
||||
def sag_solver(
|
||||
X,
|
||||
y,
|
||||
sample_weight=None,
|
||||
loss="log",
|
||||
alpha=1.0,
|
||||
beta=0.0,
|
||||
max_iter=1000,
|
||||
tol=0.001,
|
||||
verbose=0,
|
||||
random_state=None,
|
||||
check_input=True,
|
||||
max_squared_sum=None,
|
||||
warm_start_mem=None,
|
||||
is_saga=False,
|
||||
):
|
||||
"""SAG solver for Ridge and LogisticRegression.
|
||||
|
||||
SAG stands for Stochastic Average Gradient: the gradient of the loss is
|
||||
estimated each sample at a time and the model is updated along the way with
|
||||
a constant learning rate.
|
||||
|
||||
IMPORTANT NOTE: 'sag' solver converges faster on columns that are on the
|
||||
same scale. You can normalize the data by using
|
||||
sklearn.preprocessing.StandardScaler on your data before passing it to the
|
||||
fit method.
|
||||
|
||||
This implementation works with data represented as dense numpy arrays or
|
||||
sparse scipy arrays of floating point values for the features. It will
|
||||
fit the data according to squared loss or log loss.
|
||||
|
||||
The regularizer is a penalty added to the loss function that shrinks model
|
||||
parameters towards the zero vector using the squared euclidean norm L2.
|
||||
|
||||
.. versionadded:: 0.17
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||||
Training data.
|
||||
|
||||
y : ndarray of shape (n_samples,)
|
||||
Target values. With loss='multinomial', y must be label encoded
|
||||
(see preprocessing.LabelEncoder).
|
||||
|
||||
sample_weight : array-like of shape (n_samples,), default=None
|
||||
Weights applied to individual samples (1. for unweighted).
|
||||
|
||||
loss : {'log', 'squared', 'multinomial'}, default='log'
|
||||
Loss function that will be optimized:
|
||||
-'log' is the binary logistic loss, as used in LogisticRegression.
|
||||
-'squared' is the squared loss, as used in Ridge.
|
||||
-'multinomial' is the multinomial logistic loss, as used in
|
||||
LogisticRegression.
|
||||
|
||||
.. versionadded:: 0.18
|
||||
*loss='multinomial'*
|
||||
|
||||
alpha : float, default=1.
|
||||
L2 regularization term in the objective function
|
||||
``(0.5 * alpha * || W ||_F^2)``.
|
||||
|
||||
beta : float, default=0.
|
||||
L1 regularization term in the objective function
|
||||
``(beta * || W ||_1)``. Only applied if ``is_saga`` is set to True.
|
||||
|
||||
max_iter : int, default=1000
|
||||
The max number of passes over the training data if the stopping
|
||||
criteria is not reached.
|
||||
|
||||
tol : float, default=0.001
|
||||
The stopping criteria for the weights. The iterations will stop when
|
||||
max(change in weights) / max(weights) < tol.
|
||||
|
||||
verbose : int, default=0
|
||||
The verbosity level.
|
||||
|
||||
random_state : int, RandomState instance or None, default=None
|
||||
Used when shuffling the data. Pass an int for reproducible output
|
||||
across multiple function calls.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
check_input : bool, default=True
|
||||
If False, the input arrays X and y will not be checked.
|
||||
|
||||
max_squared_sum : float, default=None
|
||||
Maximum squared sum of X over samples. If None, it will be computed,
|
||||
going through all the samples. The value should be precomputed
|
||||
to speed up cross validation.
|
||||
|
||||
warm_start_mem : dict, default=None
|
||||
The initialization parameters used for warm starting. Warm starting is
|
||||
currently used in LogisticRegression but not in Ridge.
|
||||
It contains:
|
||||
- 'coef': the weight vector, with the intercept in last line
|
||||
if the intercept is fitted.
|
||||
- 'gradient_memory': the scalar gradient for all seen samples.
|
||||
- 'sum_gradient': the sum of gradient over all seen samples,
|
||||
for each feature.
|
||||
- 'intercept_sum_gradient': the sum of gradient over all seen
|
||||
samples, for the intercept.
|
||||
- 'seen': array of boolean describing the seen samples.
|
||||
- 'num_seen': the number of seen samples.
|
||||
|
||||
is_saga : bool, default=False
|
||||
Whether to use the SAGA algorithm or the SAG algorithm. SAGA behaves
|
||||
better in the first epochs, and allow for l1 regularisation.
|
||||
|
||||
Returns
|
||||
-------
|
||||
coef_ : ndarray of shape (n_features,)
|
||||
Weight vector.
|
||||
|
||||
n_iter_ : int
|
||||
The number of full pass on all samples.
|
||||
|
||||
warm_start_mem : dict
|
||||
Contains a 'coef' key with the fitted result, and possibly the
|
||||
fitted intercept at the end of the array. Contains also other keys
|
||||
used for warm starting.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from sklearn import linear_model
|
||||
>>> n_samples, n_features = 10, 5
|
||||
>>> rng = np.random.RandomState(0)
|
||||
>>> X = rng.randn(n_samples, n_features)
|
||||
>>> y = rng.randn(n_samples)
|
||||
>>> clf = linear_model.Ridge(solver='sag')
|
||||
>>> clf.fit(X, y)
|
||||
Ridge(solver='sag')
|
||||
|
||||
>>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
|
||||
>>> y = np.array([1, 1, 2, 2])
|
||||
>>> clf = linear_model.LogisticRegression(solver='sag')
|
||||
>>> clf.fit(X, y)
|
||||
LogisticRegression(solver='sag')
|
||||
|
||||
References
|
||||
----------
|
||||
Schmidt, M., Roux, N. L., & Bach, F. (2013).
|
||||
Minimizing finite sums with the stochastic average gradient
|
||||
https://hal.inria.fr/hal-00860051/document
|
||||
|
||||
:arxiv:`Defazio, A., Bach F. & Lacoste-Julien S. (2014).
|
||||
"SAGA: A Fast Incremental Gradient Method With Support
|
||||
for Non-Strongly Convex Composite Objectives" <1407.0202>`
|
||||
|
||||
See Also
|
||||
--------
|
||||
Ridge, SGDRegressor, ElasticNet, Lasso, SVR,
|
||||
LogisticRegression, SGDClassifier, LinearSVC, Perceptron
|
||||
"""
|
||||
if warm_start_mem is None:
|
||||
warm_start_mem = {}
|
||||
# Ridge default max_iter is None
|
||||
if max_iter is None:
|
||||
max_iter = 1000
|
||||
|
||||
if check_input:
|
||||
_dtype = [np.float64, np.float32]
|
||||
X = check_array(X, dtype=_dtype, accept_sparse="csr", order="C")
|
||||
y = check_array(y, dtype=_dtype, ensure_2d=False, order="C")
|
||||
|
||||
n_samples, n_features = X.shape[0], X.shape[1]
|
||||
# As in SGD, the alpha is scaled by n_samples.
|
||||
alpha_scaled = float(alpha) / n_samples
|
||||
beta_scaled = float(beta) / n_samples
|
||||
|
||||
# if loss == 'multinomial', y should be label encoded.
|
||||
n_classes = int(y.max()) + 1 if loss == "multinomial" else 1
|
||||
|
||||
# initialization
|
||||
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
|
||||
|
||||
if "coef" in warm_start_mem.keys():
|
||||
coef_init = warm_start_mem["coef"]
|
||||
else:
|
||||
# assume fit_intercept is False
|
||||
coef_init = np.zeros((n_features, n_classes), dtype=X.dtype, order="C")
|
||||
|
||||
# coef_init contains possibly the intercept_init at the end.
|
||||
# Note that Ridge centers the data before fitting, so fit_intercept=False.
|
||||
fit_intercept = coef_init.shape[0] == (n_features + 1)
|
||||
if fit_intercept:
|
||||
intercept_init = coef_init[-1, :]
|
||||
coef_init = coef_init[:-1, :]
|
||||
else:
|
||||
intercept_init = np.zeros(n_classes, dtype=X.dtype)
|
||||
|
||||
if "intercept_sum_gradient" in warm_start_mem.keys():
|
||||
intercept_sum_gradient = warm_start_mem["intercept_sum_gradient"]
|
||||
else:
|
||||
intercept_sum_gradient = np.zeros(n_classes, dtype=X.dtype)
|
||||
|
||||
if "gradient_memory" in warm_start_mem.keys():
|
||||
gradient_memory_init = warm_start_mem["gradient_memory"]
|
||||
else:
|
||||
gradient_memory_init = np.zeros(
|
||||
(n_samples, n_classes), dtype=X.dtype, order="C"
|
||||
)
|
||||
if "sum_gradient" in warm_start_mem.keys():
|
||||
sum_gradient_init = warm_start_mem["sum_gradient"]
|
||||
else:
|
||||
sum_gradient_init = np.zeros((n_features, n_classes), dtype=X.dtype, order="C")
|
||||
|
||||
if "seen" in warm_start_mem.keys():
|
||||
seen_init = warm_start_mem["seen"]
|
||||
else:
|
||||
seen_init = np.zeros(n_samples, dtype=np.int32, order="C")
|
||||
|
||||
if "num_seen" in warm_start_mem.keys():
|
||||
num_seen_init = warm_start_mem["num_seen"]
|
||||
else:
|
||||
num_seen_init = 0
|
||||
|
||||
dataset, intercept_decay = make_dataset(X, y, sample_weight, random_state)
|
||||
|
||||
if max_squared_sum is None:
|
||||
max_squared_sum = row_norms(X, squared=True).max()
|
||||
step_size = get_auto_step_size(
|
||||
max_squared_sum,
|
||||
alpha_scaled,
|
||||
loss,
|
||||
fit_intercept,
|
||||
n_samples=n_samples,
|
||||
is_saga=is_saga,
|
||||
)
|
||||
if step_size * alpha_scaled == 1:
|
||||
raise ZeroDivisionError(
|
||||
"Current sag implementation does not handle "
|
||||
"the case step_size * alpha_scaled == 1"
|
||||
)
|
||||
|
||||
sag = sag64 if X.dtype == np.float64 else sag32
|
||||
num_seen, n_iter_ = sag(
|
||||
dataset,
|
||||
coef_init,
|
||||
intercept_init,
|
||||
n_samples,
|
||||
n_features,
|
||||
n_classes,
|
||||
tol,
|
||||
max_iter,
|
||||
loss,
|
||||
step_size,
|
||||
alpha_scaled,
|
||||
beta_scaled,
|
||||
sum_gradient_init,
|
||||
gradient_memory_init,
|
||||
seen_init,
|
||||
num_seen_init,
|
||||
fit_intercept,
|
||||
intercept_sum_gradient,
|
||||
intercept_decay,
|
||||
is_saga,
|
||||
verbose,
|
||||
)
|
||||
|
||||
if n_iter_ == max_iter:
|
||||
warnings.warn(
|
||||
"The max_iter was reached which means the coef_ did not converge",
|
||||
ConvergenceWarning,
|
||||
)
|
||||
|
||||
if fit_intercept:
|
||||
coef_init = np.vstack((coef_init, intercept_init))
|
||||
|
||||
warm_start_mem = {
|
||||
"coef": coef_init,
|
||||
"sum_gradient": sum_gradient_init,
|
||||
"intercept_sum_gradient": intercept_sum_gradient,
|
||||
"gradient_memory": gradient_memory_init,
|
||||
"seen": seen_init,
|
||||
"num_seen": num_seen,
|
||||
}
|
||||
|
||||
if loss == "multinomial":
|
||||
coef_ = coef_init.T
|
||||
else:
|
||||
coef_ = coef_init[:, 0]
|
||||
|
||||
return coef_, n_iter_, warm_start_mem
|
||||
Binary file not shown.
@@ -0,0 +1,842 @@
|
||||
{{py:
|
||||
|
||||
"""
|
||||
|
||||
Template file to easily generate fused-types-consistent code using Tempita
|
||||
(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py).
|
||||
|
||||
Generated file: sag_fast.pyx
|
||||
|
||||
Each class is duplicated for all dtypes (float and double). The keywords
|
||||
between double braces are substituted in setup.py.
|
||||
|
||||
Authors: Danny Sullivan <dbsullivan23@gmail.com>
|
||||
Tom Dupre la Tour <tom.dupre-la-tour@m4x.org>
|
||||
Arthur Mensch <arthur.mensch@m4x.org>
|
||||
Arthur Imbert <arthurimbert05@gmail.com>
|
||||
Joan Massich <mailsik@gmail.com>
|
||||
|
||||
License: BSD 3 clause
|
||||
"""
|
||||
|
||||
# name_suffix, c_type, np_type
|
||||
dtypes = [('64', 'double', 'np.float64'),
|
||||
('32', 'float', 'np.float32')]
|
||||
|
||||
}}
|
||||
"""SAG and SAGA implementation"""
|
||||
|
||||
import numpy as np
|
||||
from libc.math cimport exp, fabs, isfinite, log
|
||||
from libc.time cimport time, time_t
|
||||
|
||||
from ._sgd_fast cimport LossFunction
|
||||
from ._sgd_fast cimport Log, SquaredLoss
|
||||
|
||||
from ..utils._seq_dataset cimport SequentialDataset32, SequentialDataset64
|
||||
|
||||
from libc.stdio cimport printf
|
||||
|
||||
|
||||
{{for name_suffix, c_type, np_type in dtypes}}
|
||||
|
||||
cdef inline {{c_type}} fmax{{name_suffix}}({{c_type}} x, {{c_type}} y) noexcept nogil:
    """Return `x` when `x > y`, otherwise return `y`."""
    return x if x > y else y
|
||||
|
||||
{{endfor}}
|
||||
|
||||
|
||||
{{for name_suffix, c_type, np_type in dtypes}}
|
||||
|
||||
cdef {{c_type}} _logsumexp{{name_suffix}}({{c_type}}* arr, int n_classes) noexcept nogil:
    """Return log(sum(exp(arr))) over the first `n_classes` entries of `arr`.

    The largest entry is subtracted before exponentiating and added back
    after taking the log, which limits the risk of over/underflow.
    """
    cdef {{c_type}} largest = arr[0]
    cdef {{c_type}} total = 0.0
    cdef int k

    # First pass: locate the largest entry, used as the normalization shift.
    for k in range(1, n_classes):
        if arr[k] > largest:
            largest = arr[k]

    # Second pass: accumulate the shifted exponentials.
    for k in range(n_classes):
        total += exp(arr[k] - largest)

    return log(total) + largest
|
||||
|
||||
{{endfor}}
|
||||
|
||||
|
||||
{{for name_suffix, c_type, np_type in dtypes}}
|
||||
|
||||
cdef class MultinomialLogLoss{{name_suffix}}:
    cdef {{c_type}} _loss(self, {{c_type}} y, {{c_type}}* prediction, int n_classes,
                          {{c_type}} sample_weight) noexcept nogil:
        r"""Multinomial logistic regression loss for a single sample.

        The value computed is:
            loss = sw * (logsumexp(prediction) - prediction[y])

        which is the same as
            loss = - sw \sum_c \delta_{y,c} (prediction[c] - logsumexp(prediction))

        where:
            \delta_{y,c} = 1 if (y == c) else 0
            sw = sample_weight

        Parameters
        ----------
        y : {{c_type}}, between 0 and n_classes - 1
            Index of the correct class for the current sample (i.e. label
            encoded). It is cast to an integer to index `prediction`.

        prediction : pointer to {{c_type}} values, n_classes entries are read
            Prediction of the multinomial classifier for the current sample.

        n_classes : integer
            Total number of classes.

        sample_weight : {{c_type}}
            Weight of the current sample.

        Returns
        -------
        loss : {{c_type}}
            Multinomial loss for the current sample.

        Reference
        ---------
        Bishop, C. M. (2006). Pattern recognition and machine learning.
        Springer. (Chapter 4.3.4)
        """
        cdef {{c_type}} log_normalizer = _logsumexp{{name_suffix}}(prediction, n_classes)

        # y carries the class index as a floating point value.
        return (log_normalizer - prediction[int(y)]) * sample_weight

    cdef void dloss(self, {{c_type}} y, {{c_type}}* prediction, int n_classes,
                    {{c_type}} sample_weight, {{c_type}}* gradient_ptr) noexcept nogil:
        r"""Gradient of the multinomial logistic loss for a single sample.

        For each class c, the value written to `gradient_ptr[c]` is:
            grad_c = sw * (p[c] - \delta_{y,c})

        where:
            p[c] = exp(prediction[c] - logsumexp(prediction))
            \delta_{y,c} = 1 if (y == c) else 0
            sw = sample_weight

        Note that to obtain the gradient with respect to the weights, this
        value has to be multiplied by the sample vector x.

        Parameters
        ----------
        y : {{c_type}}, between 0 and n_classes - 1
            Index of the correct class for the current sample (i.e. label
            encoded).

        prediction : pointer to {{c_type}} values, n_classes entries are read
            Prediction of the multinomial classifier for the current sample.

        n_classes : integer
            Total number of classes.

        sample_weight : {{c_type}}
            Weight of the current sample.

        gradient_ptr : pointer to {{c_type}} values, n_classes entries are written
            Gradient vector to be filled.

        Reference
        ---------
        Bishop, C. M. (2006). Pattern recognition and machine learning.
        Springer. (Chapter 4.3.4)
        """
        cdef {{c_type}} log_normalizer = _logsumexp{{name_suffix}}(prediction, n_classes)
        cdef int k

        for k in range(n_classes):
            # softmax probability of class k
            gradient_ptr[k] = exp(prediction[k] - log_normalizer)

            # subtract the one-hot target: y is the index of the correct class
            if k == y:
                gradient_ptr[k] -= 1.0

            gradient_ptr[k] *= sample_weight

    def __reduce__(self):
        # The class holds no state: pickling only needs the constructor.
        return MultinomialLogLoss{{name_suffix}}, ()
|
||||
|
||||
{{endfor}}
|
||||
|
||||
{{for name_suffix, c_type, np_type in dtypes}}
|
||||
|
||||
cdef inline {{c_type}} _soft_thresholding{{name_suffix}}({{c_type}} x, {{c_type}} shrinkage) noexcept nogil:
    """Return max(x - shrinkage, 0) - max(-x - shrinkage, 0)."""
    cdef {{c_type}} positive_part = fmax{{name_suffix}}(x - shrinkage, 0)
    cdef {{c_type}} negative_part = fmax{{name_suffix}}(- x - shrinkage, 0)
    return positive_part - negative_part
|
||||
|
||||
{{endfor}}
|
||||
|
||||
|
||||
{{for name_suffix, c_type, np_type in dtypes}}
|
||||
|
||||
def sag{{name_suffix}}(
    SequentialDataset{{name_suffix}} dataset,
    {{c_type}}[:, ::1] weights_array,
    {{c_type}}[::1] intercept_array,
    int n_samples,
    int n_features,
    int n_classes,
    double tol,
    int max_iter,
    str loss_function,
    double step_size,
    double alpha,
    double beta,
    {{c_type}}[:, ::1] sum_gradient_init,
    {{c_type}}[:, ::1] gradient_memory_init,
    bint[::1] seen_init,
    int num_seen,
    bint fit_intercept,
    {{c_type}}[::1] intercept_sum_gradient_init,
    double intercept_decay,
    bint saga,
    bint verbose
):
    """Stochastic Average Gradient (SAG) and SAGA solvers.

    Used in Ridge and LogisticRegression.

    Some implementation details:

    - Just-in-time (JIT) update: In SAG(A), the average-gradient update is
    collinear with the drawn sample X_i. Therefore, if the data is sparse, the
    random sample X_i will change the average gradient only on features j where
    X_ij != 0. In some cases, the average gradient on feature j might change
    only after k random samples with no change. In these cases, instead of
    applying k times the same gradient step on feature j, we apply the gradient
    step only once, scaled by k. This is called the "just-in-time update", and
    it is performed in `lagged_update{{name_suffix}}`. This function also
    applies the proximal operator after the gradient step (if L1 regularization
    is used in SAGA).

    - Weight scale: In SAG(A), the weights are scaled down at each iteration
    due to the L2 regularization. To avoid updating all the weights at each
    iteration, the weight scale is factored out in a separate variable `wscale`
    which is only used in the JIT update. When this variable is too small, it
    is reset for numerical stability using the function
    `scale_weights{{name_suffix}}`. This reset requires applying all remaining
    JIT updates. This reset is also performed every `n_samples` iterations
    before each convergence check, so when the algorithm stops, we are sure
    that there is no remaining JIT updates.

    Parameters
    ----------
    dataset : SequentialDataset{{name_suffix}}
        Dataset from which one random sample is drawn per inner iteration.

    weights_array : memoryview of shape (n_features, n_classes)
        Weights, updated in place.

    intercept_array : memoryview of shape (n_classes,)
        Intercepts, updated in place when `fit_intercept` is true.

    n_samples : int
        Number of samples drawn per epoch.

    n_features : int
        Number of features.

    n_classes : int
        Number of columns of the weights (1 unless the loss is multinomial).

    tol : double
        An epoch ends the fit when max_change / max_weight <= tol.

    max_iter : int
        Maximum number of epochs.

    loss_function : str
        One of 'log', 'squared', 'multinomial'.

    step_size : double
        Constant step size of the gradient steps.

    alpha : double
        L2 regularization strength; the weight scale is multiplied by
        (1 - step_size * alpha) at each inner iteration.

    beta : double
        L1 regularization strength; the proximal step is only applied when
        `beta > 0` and `saga` is true.

    sum_gradient_init : memoryview of shape (n_features, n_classes)
        Sum of the memorized gradients, updated in place.

    gradient_memory_init : memoryview of shape (n_samples, n_classes)
        Last gradient computed for each sample, updated in place.

    seen_init : memoryview of shape (n_samples,)
        Entries equal to 0 mark samples not drawn yet; an entry is set to 1
        the first time its sample is drawn.

    num_seen : int
        Number of samples already seen when the function is called.

    fit_intercept : bint
        Whether the intercept is updated.

    intercept_sum_gradient_init : memoryview of shape (n_classes,)
        Sum of the memorized gradients for the intercept, updated in place.

    intercept_decay : double
        Multiplicative factor applied to the intercept step.

    saga : bint
        Whether to use the SAGA variant instead of SAG.

    verbose : bint
        Whether to print progress messages.

    Returns
    -------
    num_seen : int
        Number of distinct samples seen so far.

    n_iter : int
        Number of epochs performed.

    Raises
    ------
    ValueError
        If `loss_function` is not supported, or if a weight or an intercept
        became non-finite during the fit.

    Reference
    ---------
    Schmidt, M., Roux, N. L., & Bach, F. (2013).
    Minimizing finite sums with the stochastic average gradient
    https://hal.inria.fr/hal-00860051/document
    (section 4.3)

    :arxiv:`Defazio, A., Bach F. & Lacoste-Julien S. (2014).
    "SAGA: A Fast Incremental Gradient Method With Support
    for Non-Strongly Convex Composite Objectives" <1407.0202>`
    """
    # the data pointer for x, the current sample
    cdef {{c_type}} *x_data_ptr = NULL
    # the index pointer for the column of the data
    cdef int *x_ind_ptr = NULL
    # the number of non-zero features for current sample
    cdef int xnnz = -1
    # the label value for current sample
    cdef {{c_type}} y
    # the sample weight
    cdef {{c_type}} sample_weight

    # helper variable for indexes
    cdef int f_idx, s_idx, feature_ind, class_ind, j
    # the number of pass through all samples
    cdef int n_iter = 0
    # helper to track iterations through samples
    cdef int sample_itr
    # the index (row number) of the current sample
    cdef int sample_ind

    # the maximum change in weights, used to compute stopping criteria
    cdef {{c_type}} max_change
    # a holder variable for the max weight, used to compute stopping criteria
    cdef {{c_type}} max_weight

    # the start time of the fit
    cdef time_t start_time
    # the end time of the fit
    cdef time_t end_time

    # precomputation since the step size does not change in this implementation
    cdef {{c_type}} wscale_update = 1.0 - step_size * alpha

    # the pointer to the coef_ or weights
    cdef {{c_type}}* weights = &weights_array[0, 0]

    # the sum of gradients for each feature
    cdef {{c_type}}* sum_gradient = &sum_gradient_init[0, 0]

    # the previously seen gradient for each sample
    cdef {{c_type}}* gradient_memory = &gradient_memory_init[0, 0]

    # the cumulative sums needed for JIT params
    cdef {{c_type}}[::1] cumulative_sums = np.empty(n_samples, dtype={{np_type}}, order="c")

    # the index for the last time this feature was updated
    cdef int[::1] feature_hist = np.zeros(n_features, dtype=np.int32, order="c")

    # the previous weights to use to compute stopping criteria
    cdef {{c_type}}[:, ::1] previous_weights_array = np.zeros((n_features, n_classes), dtype={{np_type}}, order="c")
    cdef {{c_type}}* previous_weights = &previous_weights_array[0, 0]

    cdef {{c_type}}[::1] prediction = np.zeros(n_classes, dtype={{np_type}}, order="c")

    cdef {{c_type}}[::1] gradient = np.zeros(n_classes, dtype={{np_type}}, order="c")

    # Intermediate variable that needs declaration since cython cannot infer when templating
    cdef {{c_type}} val

    # Bias correction term in saga
    cdef {{c_type}} gradient_correction

    # the scalar used for multiplying z
    cdef {{c_type}} wscale = 1.0

    # return value (-1 if an error occurred, 0 otherwise)
    cdef int status = 0

    # the cumulative sums for each iteration for the sparse implementation
    cumulative_sums[0] = 0.0

    # the multiplicative scale needed for JIT params
    cdef {{c_type}}[::1] cumulative_sums_prox
    cdef {{c_type}}* cumulative_sums_prox_ptr

    cdef bint prox = beta > 0 and saga

    # Loss function to optimize
    cdef LossFunction loss
    # Whether the loss function is multinomial
    cdef bint multinomial = False
    # Multinomial loss function
    cdef MultinomialLogLoss{{name_suffix}} multiloss

    if loss_function == "multinomial":
        multinomial = True
        multiloss = MultinomialLogLoss{{name_suffix}}()
    elif loss_function == "log":
        loss = Log()
    elif loss_function == "squared":
        loss = SquaredLoss()
    else:
        raise ValueError("Invalid loss parameter: got %s instead of "
                         "one of ('log', 'squared', 'multinomial')"
                         % loss_function)

    if prox:
        cumulative_sums_prox = np.empty(n_samples, dtype={{np_type}}, order="c")
        cumulative_sums_prox_ptr = &cumulative_sums_prox[0]
    else:
        cumulative_sums_prox = None
        cumulative_sums_prox_ptr = NULL

    with nogil:
        start_time = time(NULL)
        for n_iter in range(max_iter):
            for sample_itr in range(n_samples):
                # extract a random sample
                sample_ind = dataset.random(&x_data_ptr, &x_ind_ptr, &xnnz, &y, &sample_weight)

                # cached index for gradient_memory
                s_idx = sample_ind * n_classes

                # update the number of samples seen and the seen array
                if seen_init[sample_ind] == 0:
                    num_seen += 1
                    seen_init[sample_ind] = 1

                # make the weight updates (just-in-time gradient step, and prox operator)
                if sample_itr > 0:
                    status = lagged_update{{name_suffix}}(
                        weights=weights,
                        wscale=wscale,
                        xnnz=xnnz,
                        n_samples=n_samples,
                        n_classes=n_classes,
                        sample_itr=sample_itr,
                        cumulative_sums=&cumulative_sums[0],
                        cumulative_sums_prox=cumulative_sums_prox_ptr,
                        feature_hist=&feature_hist[0],
                        prox=prox,
                        sum_gradient=sum_gradient,
                        x_ind_ptr=x_ind_ptr,
                        reset=False,
                        n_iter=n_iter
                    )
                    if status == -1:
                        break

                # find the current prediction
                predict_sample{{name_suffix}}(
                    x_data_ptr=x_data_ptr,
                    x_ind_ptr=x_ind_ptr,
                    xnnz=xnnz,
                    w_data_ptr=weights,
                    wscale=wscale,
                    intercept=&intercept_array[0],
                    prediction=&prediction[0],
                    n_classes=n_classes
                )

                # compute the gradient for this sample, given the prediction
                if multinomial:
                    multiloss.dloss(y, &prediction[0], n_classes, sample_weight, &gradient[0])
                else:
                    gradient[0] = loss.dloss(y, prediction[0]) * sample_weight

                # L2 regularization by simply rescaling the weights
                wscale *= wscale_update

                # make the updates to the sum of gradients
                for j in range(xnnz):
                    feature_ind = x_ind_ptr[j]
                    val = x_data_ptr[j]
                    f_idx = feature_ind * n_classes
                    for class_ind in range(n_classes):
                        gradient_correction = \
                            val * (gradient[class_ind] -
                                   gradient_memory[s_idx + class_ind])
                        if saga:
                            # Note that this is not the main gradient step,
                            # which is performed just-in-time in lagged_update.
                            # This part is done outside the JIT update
                            # as it does not depend on the average gradient.
                            # The prox operator is applied after the JIT update
                            weights[f_idx + class_ind] -= \
                                (gradient_correction * step_size
                                 * (1 - 1. / num_seen) / wscale)
                        sum_gradient[f_idx + class_ind] += gradient_correction

                # fit the intercept
                if fit_intercept:
                    for class_ind in range(n_classes):
                        gradient_correction = (gradient[class_ind] -
                                               gradient_memory[s_idx + class_ind])
                        intercept_sum_gradient_init[class_ind] += gradient_correction
                        gradient_correction *= step_size * (1. - 1. / num_seen)
                        if saga:
                            intercept_array[class_ind] -= \
                                (step_size * intercept_sum_gradient_init[class_ind] /
                                 num_seen * intercept_decay) + gradient_correction
                        else:
                            intercept_array[class_ind] -= \
                                (step_size * intercept_sum_gradient_init[class_ind] /
                                 num_seen * intercept_decay)

                        # check to see that the intercept is not inf or NaN
                        if not isfinite(intercept_array[class_ind]):
                            status = -1
                            break
                    # Break from the n_samples outer loop if an error happened
                    # in the fit_intercept n_classes inner loop
                    if status == -1:
                        break

                # update the gradient memory for this sample
                for class_ind in range(n_classes):
                    gradient_memory[s_idx + class_ind] = gradient[class_ind]

                if sample_itr == 0:
                    cumulative_sums[0] = step_size / (wscale * num_seen)
                    if prox:
                        cumulative_sums_prox[0] = step_size * beta / wscale
                else:
                    cumulative_sums[sample_itr] = \
                        (cumulative_sums[sample_itr - 1] +
                         step_size / (wscale * num_seen))
                    if prox:
                        cumulative_sums_prox[sample_itr] = \
                            (cumulative_sums_prox[sample_itr - 1] +
                             step_size * beta / wscale)
                # If wscale gets too small, we need to reset the scale.
                # This also resets the just-in-time update system.
                if wscale < 1e-9:
                    if verbose:
                        with gil:
                            print("rescaling...")
                    status = scale_weights{{name_suffix}}(
                        weights=weights,
                        wscale=&wscale,
                        n_features=n_features,
                        n_samples=n_samples,
                        n_classes=n_classes,
                        sample_itr=sample_itr,
                        cumulative_sums=&cumulative_sums[0],
                        cumulative_sums_prox=cumulative_sums_prox_ptr,
                        feature_hist=&feature_hist[0],
                        prox=prox,
                        sum_gradient=sum_gradient,
                        n_iter=n_iter
                    )
                    if status == -1:
                        break

            # Break from the n_iter outer loop if an error happened in the
            # n_samples inner loop
            if status == -1:
                break

            # We scale the weights every n_samples iterations and reset the
            # just-in-time update system for numerical stability.
            # Because this reset is done before every convergence check, we are
            # sure there is no remaining lagged update when the algorithm stops.
            status = scale_weights{{name_suffix}}(
                weights=weights,
                wscale=&wscale,
                n_features=n_features,
                n_samples=n_samples,
                n_classes=n_classes,
                sample_itr=n_samples - 1,
                cumulative_sums=&cumulative_sums[0],
                cumulative_sums_prox=cumulative_sums_prox_ptr,
                feature_hist=&feature_hist[0],
                prox=prox,
                sum_gradient=sum_gradient,
                n_iter=n_iter
            )
            if status == -1:
                break

            # check if the stopping criteria is reached
            max_change = 0.0
            max_weight = 0.0
            for idx in range(n_features * n_classes):
                max_weight = fmax{{name_suffix}}(max_weight, fabs(weights[idx]))
                max_change = fmax{{name_suffix}}(max_change, fabs(weights[idx] - previous_weights[idx]))
                previous_weights[idx] = weights[idx]
            if ((max_weight != 0 and max_change / max_weight <= tol)
                    or max_weight == 0 and max_change == 0):
                if verbose:
                    end_time = time(NULL)
                    with gil:
                        print("convergence after %d epochs took %d seconds" %
                              (n_iter + 1, end_time - start_time))
                break
            elif verbose:
                # max_weight can be 0 on this path (all weights are currently
                # zero but differ from the previous epoch). The relative change
                # is then undefined, so report the absolute change instead of
                # dividing by zero.
                if max_weight != 0:
                    printf('Epoch %d, change: %.8f\n', n_iter + 1,
                           max_change / max_weight)
                else:
                    printf('Epoch %d, change: %.8f\n', n_iter + 1,
                           max_change)
        n_iter += 1
    # We do the error treatment here based on error code in status to avoid
    # re-acquiring the GIL within the cython code, which slows the computation
    # when the sag/saga solver is used concurrently in multiple Python threads.
    if status == -1:
        raise ValueError(("Floating-point under-/overflow occurred at epoch"
                          " #%d. Scaling input data with StandardScaler or"
                          " MinMaxScaler might help.") % n_iter)

    if verbose and n_iter >= max_iter:
        end_time = time(NULL)
        print(("max_iter reached after %d seconds") %
              (end_time - start_time))

    return num_seen, n_iter
|
||||
|
||||
{{endfor}}
|
||||
|
||||
|
||||
{{for name_suffix, c_type, np_type in dtypes}}
|
||||
|
||||
cdef int scale_weights{{name_suffix}}(
    {{c_type}}* weights,
    {{c_type}}* wscale,
    int n_features,
    int n_samples,
    int n_classes,
    int sample_itr,
    {{c_type}}* cumulative_sums,
    {{c_type}}* cumulative_sums_prox,
    int* feature_hist,
    bint prox,
    {{c_type}}* sum_gradient,
    int n_iter
) noexcept nogil:
    """Apply all pending just-in-time (JIT) updates, fold `wscale` into the
    weights and set `wscale` back to 1.0.

    See `sag{{name_suffix}}`'s docstring about the JIT update system.

    wscale = (1 - step_size * alpha) ** (n_iter * n_samples + sample_itr)
    can become very small, so it is reset to 1.0 every n_samples iterations
    for numerical stability. Before rescaling, every coefficient has to be
    brought up to date, which resets the just-in-time update system.
    This also limits the size of `cumulative_sums`.

    Returns the status of the underlying `lagged_update{{name_suffix}}` call
    (0 on success, -1 on failure); `wscale` is only reset on success.
    """
    # reset=True makes lagged_update visit the features 0..n_features-1
    # directly, so no index pointer is needed.
    cdef int status = lagged_update{{name_suffix}}(
        weights=weights,
        wscale=wscale[0],
        xnnz=n_features,
        n_samples=n_samples,
        n_classes=n_classes,
        sample_itr=sample_itr + 1,
        cumulative_sums=cumulative_sums,
        cumulative_sums_prox=cumulative_sums_prox,
        feature_hist=feature_hist,
        prox=prox,
        sum_gradient=sum_gradient,
        x_ind_ptr=NULL,
        reset=True,
        n_iter=n_iter
    )
    if status != 0:
        # leave wscale untouched when the lagged update failed
        return status

    wscale[0] = 1.0
    return 0
|
||||
|
||||
{{endfor}}
|
||||
|
||||
|
||||
{{for name_suffix, c_type, np_type in dtypes}}
|
||||
|
||||
cdef int lagged_update{{name_suffix}}(
    {{c_type}}* weights,
    {{c_type}} wscale,
    int xnnz,
    int n_samples,
    int n_classes,
    int sample_itr,
    {{c_type}}* cumulative_sums,
    {{c_type}}* cumulative_sums_prox,
    int* feature_hist,
    bint prox,
    {{c_type}}* sum_gradient,
    int* x_ind_ptr,
    bint reset,
    int n_iter
) noexcept nogil:
    """Apply the pending just-in-time (JIT) updates to the weights.

    See `sag{{name_suffix}}`'s docstring about the JIT update system.

    The pending updates are described by cumulative_sums,
    cumulative_sums_prox, wscale and feature_hist. See original SAGA paper
    (Defazio et al. 2014) for details.

    When `reset` is false, the `xnnz` features listed in `x_ind_ptr` are
    updated. When `reset` is true, the features 0..xnnz-1 are updated
    (`x_ind_ptr` is not read), each updated weight is multiplied by `wscale`
    and checked for finiteness, and the last used entries of the cumulative
    sums are set to zero.

    Returns 0 on success and -1 if a weight became inf or NaN. `n_iter` is
    not used by this function.
    """
    cdef int j, feature_ind, class_ind, idx, f_idx, lagged_ind, last_update_ind
    cdef int hist
    cdef {{c_type}} cum_sum, grad_step, prox_step, cum_sum_prox

    for j in range(xnnz):
        if reset:
            feature_ind = j
        else:
            feature_ind = x_ind_ptr[j]
        f_idx = feature_ind * n_classes

        # iteration at which this feature was last brought up to date
        hist = feature_hist[feature_ind]

        # step accumulated since that iteration
        cum_sum = cumulative_sums[sample_itr - 1]
        if prox:
            cum_sum_prox = cumulative_sums_prox[sample_itr - 1]
        if hist != 0:
            cum_sum -= cumulative_sums[hist - 1]
            if prox:
                cum_sum_prox -= cumulative_sums_prox[hist - 1]

        for class_ind in range(n_classes):
            idx = f_idx + class_ind
            if not prox:
                # plain gradient step, scaled by the accumulated step size
                weights[idx] -= cum_sum * sum_gradient[idx]
            elif fabs(sum_gradient[idx] * cum_sum) < cum_sum_prox:
                # In this case, we can perform all the gradient steps and
                # all the proximal steps in this order, which is more
                # efficient than unrolling all the lagged updates.
                # Idea taken from scikit-learn-contrib/lightning.
                weights[idx] -= cum_sum * sum_gradient[idx]
                weights[idx] = _soft_thresholding{{name_suffix}}(weights[idx],
                                                                 cum_sum_prox)
            else:
                # unroll the lagged updates one iteration at a time,
                # alternating a gradient step and a proximal step
                last_update_ind = hist
                if last_update_ind == -1:
                    last_update_ind = sample_itr - 1
                for lagged_ind in range(sample_itr - 1,
                                        last_update_ind - 1, -1):
                    if lagged_ind > 0:
                        grad_step = (cumulative_sums[lagged_ind]
                                     - cumulative_sums[lagged_ind - 1])
                        prox_step = (cumulative_sums_prox[lagged_ind]
                                     - cumulative_sums_prox[lagged_ind - 1])
                    else:
                        grad_step = cumulative_sums[lagged_ind]
                        prox_step = cumulative_sums_prox[lagged_ind]
                    weights[idx] -= sum_gradient[idx] * grad_step
                    weights[idx] = _soft_thresholding{{name_suffix}}(weights[idx],
                                                                     prox_step)

            if reset:
                weights[idx] *= wscale
                # check to see that the weight is not inf or NaN;
                # returning here does not require the gil as the return
                # type is a C integer
                if not isfinite(weights[idx]):
                    return -1

        if reset:
            feature_hist[feature_ind] = sample_itr % n_samples
        else:
            feature_hist[feature_ind] = sample_itr

    if reset:
        cumulative_sums[sample_itr - 1] = 0.0
        if prox:
            cumulative_sums_prox[sample_itr - 1] = 0.0

    return 0
|
||||
|
||||
{{endfor}}
|
||||
|
||||
|
||||
{{for name_suffix, c_type, np_type in dtypes}}
|
||||
|
||||
cdef void predict_sample{{name_suffix}}(
    {{c_type}}* x_data_ptr,
    int* x_ind_ptr,
    int xnnz,
    {{c_type}}* w_data_ptr,
    {{c_type}} wscale,
    {{c_type}}* intercept,
    {{c_type}}* prediction,
    int n_classes
) noexcept nogil:
    """Compute the prediction for a sparse sample x and dense weights w.

    For each class c, writes
        prediction[c] = wscale * dot(x, w[:, c]) + intercept[c]

    Parameters
    ----------
    x_data_ptr : pointer
        Pointer to the non-zero values of the sample x

    x_ind_ptr : pointer
        Pointer to the feature indices of those values

    xnnz : int
        Number of non-zero elements in the sample x

    w_data_ptr : pointer
        Pointer to the weights w, indexed as feature * n_classes + class

    wscale : {{c_type}}
        Scale of the weights w

    intercept : pointer
        Pointer to the intercept (one value per class)

    prediction : pointer
        Pointer to store the resulting prediction (one value per class)

    n_classes : int
        Number of classes in multinomial case. Equals 1 in binary case.

    """
    cdef int class_ind, j
    cdef {{c_type}} dot

    for class_ind in range(n_classes):
        dot = 0.0
        # Only the non-zero elements of x contribute to the dot product
        for j in range(xnnz):
            dot += (w_data_ptr[x_ind_ptr[j] * n_classes + class_ind] *
                    x_data_ptr[j])

        prediction[class_ind] = wscale * dot + intercept[class_ind]
|
||||
|
||||
|
||||
{{endfor}}
|
||||
|
||||
|
||||
def _multinomial_grad_loss_all_samples(
    SequentialDataset64 dataset,
    double[:, ::1] weights_array,
    double[::1] intercept_array,
    int n_samples,
    int n_features,
    int n_classes
):
    """Sum the multinomial loss and its gradient over `n_samples` samples.

    Samples are read one after the other with `dataset.next`.

    Used for testing purpose only.

    Returns
    -------
    total_loss : double
        Sum of the per-sample multinomial losses.

    grad_sum_array : memoryview of shape (n_features, n_classes)
        Sum of the per-sample gradients with respect to the weights.
    """
    # description of the current sample, filled in by dataset.next
    cdef double *x_data_ptr = NULL
    cdef int *x_ind_ptr = NULL
    cdef int xnnz = -1
    cdef double y
    cdef double sample_weight

    # the weights are used as they are, without any rescaling
    cdef double wscale = 1.0
    cdef int sample_idx, j, class_ind, f_idx
    cdef double val
    cdef double total_loss = 0.0

    cdef MultinomialLogLoss64 multiloss = MultinomialLogLoss64()

    cdef double[:, ::1] grad_sum_array = np.zeros((n_features, n_classes), dtype=np.double, order="c")
    cdef double* grad_sum = &grad_sum_array[0, 0]

    cdef double[::1] prediction = np.zeros(n_classes, dtype=np.double, order="c")

    cdef double[::1] gradient = np.zeros(n_classes, dtype=np.double, order="c")

    with nogil:
        for sample_idx in range(n_samples):
            # get next sample on the dataset
            dataset.next(
                &x_data_ptr,
                &x_ind_ptr,
                &xnnz,
                &y,
                &sample_weight
            )

            # prediction of the multinomial classifier for the sample
            predict_sample64(
                x_data_ptr=x_data_ptr,
                x_ind_ptr=x_ind_ptr,
                xnnz=xnnz,
                w_data_ptr=&weights_array[0, 0],
                wscale=wscale,
                intercept=&intercept_array[0],
                prediction=&prediction[0],
                n_classes=n_classes
            )

            # compute the gradient for this sample, given the prediction
            multiloss.dloss(y, &prediction[0], n_classes, sample_weight, &gradient[0])

            # compute the loss for this sample, given the prediction
            total_loss += multiloss._loss(y, &prediction[0], n_classes, sample_weight)

            # update the sum of the gradient
            for j in range(xnnz):
                f_idx = x_ind_ptr[j] * n_classes
                val = x_data_ptr[j]
                for class_ind in range(n_classes):
                    grad_sum[f_idx + class_ind] += gradient[class_ind] * val

    return total_loss, grad_sum_array
|
||||
Binary file not shown.
@@ -0,0 +1,26 @@
|
||||
# License: BSD 3 clause
|
||||
"""Helper to load LossFunction from sgd_fast.pyx to sag_fast.pyx"""
|
||||
|
||||
# Declares the C-level interface of the loss base class: two nogil methods
# taking the target `y` and the prediction `p`, in that order.
cdef class LossFunction:
    cdef double loss(self, double y, double p) noexcept nogil
    cdef double dloss(self, double y, double p) noexcept nogil
|
||||
|
||||
|
||||
# Declaration of the Regression subclass of LossFunction (same two methods).
cdef class Regression(LossFunction):
    cdef double loss(self, double y, double p) noexcept nogil
    cdef double dloss(self, double y, double p) noexcept nogil
|
||||
|
||||
|
||||
# Declaration of the Classification subclass of LossFunction (same two methods).
cdef class Classification(LossFunction):
    cdef double loss(self, double y, double p) noexcept nogil
    cdef double dloss(self, double y, double p) noexcept nogil
|
||||
|
||||
|
||||
# Declaration of the Log loss, a Classification subclass.
cdef class Log(Classification):
    cdef double loss(self, double y, double p) noexcept nogil
    cdef double dloss(self, double y, double p) noexcept nogil
|
||||
|
||||
|
||||
# Declaration of the SquaredLoss loss, a Regression subclass.
cdef class SquaredLoss(Regression):
    cdef double loss(self, double y, double p) noexcept nogil
    cdef double dloss(self, double y, double p) noexcept nogil
|
||||
@@ -0,0 +1,780 @@
|
||||
{{py:
|
||||
|
||||
"""
|
||||
Template file to easily generate fused types consistent code using Tempita
|
||||
(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py).
|
||||
|
||||
Generated file: _sgd_fast.pyx
|
||||
|
||||
Each relevant function is duplicated for the dtypes float and double.
|
||||
The keywords between double braces are substituted in setup.py.
|
||||
|
||||
Authors: Peter Prettenhofer <peter.prettenhofer@gmail.com>
|
||||
Mathieu Blondel (partial_fit support)
|
||||
Rob Zinkov (passive-aggressive)
|
||||
Lars Buitinck
|
||||
|
||||
License: BSD 3 clause
|
||||
"""
|
||||
|
||||
# The dtypes are defined as follows (name_suffix, c_type, np_type)
|
||||
dtypes = [
|
||||
("64", "double", "np.float64"),
|
||||
("32", "float", "np.float32"),
|
||||
]
|
||||
|
||||
}}
|
||||
"""SGD implementation"""
|
||||
|
||||
import numpy as np
|
||||
from time import time
|
||||
|
||||
from cython cimport floating
|
||||
from libc.math cimport exp, fabs, isfinite, log, pow, INFINITY
|
||||
|
||||
from ..utils._typedefs cimport uint32_t
|
||||
from ..utils._weight_vector cimport WeightVector32, WeightVector64
|
||||
from ..utils._seq_dataset cimport SequentialDataset32, SequentialDataset64
|
||||
|
||||
|
||||
# The verbatim C block below defines the penalty and learning-rate codes as
# preprocessor macros; the `int` declarations that follow make the same names
# usable from Cython code in this module.
cdef extern from *:
    """
    /* Penalty constants */
    #define NO_PENALTY 0
    #define L1 1
    #define L2 2
    #define ELASTICNET 3

    /* Learning rate constants */
    #define CONSTANT 1
    #define OPTIMAL 2
    #define INVSCALING 3
    #define ADAPTIVE 4
    #define PA1 5
    #define PA2 6
    """
    # penalty codes
    int NO_PENALTY = 0
    int L1 = 1
    int L2 = 2
    int ELASTICNET = 3

    # learning-rate schedule codes
    int CONSTANT = 1
    int OPTIMAL = 2
    int INVSCALING = 3
    int ADAPTIVE = 4
    int PA1 = 5
    int PA2 = 6
|
||||
|
||||
|
||||
# ----------------------------------------
|
||||
# Extension Types for Loss Functions
|
||||
# ----------------------------------------
|
||||
|
||||
cdef class LossFunction:
    """Base class for convex loss functions"""

    cdef double loss(self, double y, double p) noexcept nogil:
        """Evaluate the loss function.

        Parameters
        ----------
        y : double
            The true value (aka target).
        p : double
            The prediction, `p = w^T x + intercept`.

        Returns
        -------
        double
            The loss evaluated at `p` and `y`. The base class always
            returns 0.
        """
        return 0.0

    cdef double dloss(self, double y, double p) noexcept nogil:
        """Evaluate the derivative of the loss function with respect to
        the prediction `p`.

        Parameters
        ----------
        y : double
            The true value (aka target).
        p : double
            The prediction, `p = w^T x + intercept`.

        Returns
        -------
        double
            The derivative of the loss function with regards to `p`. The
            base class always returns 0.
        """
        return 0.0

    def py_loss(self, double p, double y):
        """Python-callable wrapper around `loss`, for testing.

        Pytest needs a python function and can't use cdef functions.
        Note the argument order: the prediction comes first here, whereas
        `loss` takes the target first.

        Parameters
        ----------
        p : double
            The prediction, `p = w^T x + intercept`.
        y : double
            The true value (aka target).

        Returns
        -------
        double
            The loss evaluated at `p` and `y`.
        """
        return self.loss(y, p)

    def py_dloss(self, double p, double y):
        """Python-callable wrapper around `dloss`, for testing.

        Pytest needs a python function and can't use cdef functions.
        Note the argument order: the prediction comes first here, whereas
        `dloss` takes the target first.

        Parameters
        ----------
        p : double
            The prediction, `p = w^T x + intercept`.
        y : double
            The true value (aka target).

        Returns
        -------
        double
            The derivative of the loss function with regards to `p`.
        """
        return self.dloss(y, p)
|
||||
|
||||
|
||||
cdef class Regression(LossFunction):
    """Base class for loss functions for regression"""

    cdef double loss(self, double y, double p) noexcept nogil:
        # placeholder value; concrete regression losses override this
        return 0.0

    cdef double dloss(self, double y, double p) noexcept nogil:
        # placeholder value; concrete regression losses override this
        return 0.0
|
||||
|
||||
|
||||
cdef class Classification(LossFunction):
    """Base class for loss functions for classification"""

    cdef double loss(self, double y, double p) noexcept nogil:
        # placeholder value; concrete classification losses override this
        return 0.0

    cdef double dloss(self, double y, double p) noexcept nogil:
        # placeholder value; concrete classification losses override this
        return 0.0
|
||||
|
||||
|
||||
cdef class ModifiedHuber(Classification):
    """Modified Huber loss for binary classification with y in {-1, 1}

    This is equivalent to quadratically smoothed SVM with gamma = 2.

    With margin ``z = p * y`` the loss is 0 for ``z >= 1``,
    ``(1 - z)^2`` for ``-1 <= z < 1`` and ``-4 * z`` otherwise.

    See T. Zhang 'Solving Large Scale Linear Prediction Problems Using
    Stochastic Gradient Descent', ICML'04.
    """
    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double margin = p * y
        if margin >= 1.0:
            # correct side with enough margin: no penalty
            return 0.0
        if margin >= -1.0:
            # quadratic zone
            return (1.0 - margin) * (1.0 - margin)
        # linear zone
        return -4.0 * margin

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double margin = p * y
        if margin >= 1.0:
            return 0.0
        if margin >= -1.0:
            return 2.0 * (1.0 - margin) * -y
        return -4.0 * y

    def __reduce__(self):
        # pickle support: the loss carries no state
        return ModifiedHuber, ()
|
||||
|
||||
|
||||
cdef class Hinge(Classification):
    """Hinge loss for binary classification tasks with y in {-1,1}

    With margin ``z = p * y`` the loss is ``threshold - z`` whenever
    ``z <= threshold`` and 0 otherwise.

    Parameters
    ----------

    threshold : float, default=1.0
        Margin threshold. When threshold=1.0, one gets the loss used by SVM.
        When threshold=0.0, one gets the loss used by the Perceptron.
    """

    cdef double threshold

    def __init__(self, double threshold=1.0):
        self.threshold = threshold

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double margin = p * y
        return (self.threshold - margin) if margin <= self.threshold else 0.0

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double margin = p * y
        return -y if margin <= self.threshold else 0.0

    def __reduce__(self):
        # pickle support: rebuild from the stored threshold
        return Hinge, (self.threshold,)
|
||||
|
||||
|
||||
cdef class SquaredHinge(Classification):
    """Squared Hinge loss for binary classification tasks with y in {-1,1}

    With slack ``s = threshold - p * y`` the loss is ``s^2`` when ``s > 0``
    and 0 otherwise.

    Parameters
    ----------

    threshold : float, default=1.0
        Margin threshold. When threshold=1.0, one gets the loss used by
        (quadratically penalized) SVM.
    """

    cdef double threshold

    def __init__(self, double threshold=1.0):
        self.threshold = threshold

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double slack = self.threshold - p * y
        return slack * slack if slack > 0 else 0.0

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double slack = self.threshold - p * y
        return -2 * y * slack if slack > 0 else 0.0

    def __reduce__(self):
        # pickle support: rebuild from the stored threshold
        return SquaredHinge, (self.threshold,)
|
||||
|
||||
|
||||
cdef class Log(Classification):
    """Logistic regression loss for binary classification with y in {-1, 1}"""

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double margin = p * y
        # For |margin| > 18 a cheaper, approximately equal expression is
        # returned instead of evaluating log(1 + exp(-margin)).
        if margin > 18:
            return exp(-margin)
        elif margin < -18:
            return -margin
        else:
            return log(1.0 + exp(-margin))

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double margin = p * y
        # Same cut-offs as in `loss`, applied to the derivative
        if margin > 18.0:
            return exp(-margin) * -y
        elif margin < -18.0:
            return -y
        else:
            return -y / (exp(margin) + 1.0)

    def __reduce__(self):
        # pickle support: the loss carries no state
        return Log, ()
|
||||
|
||||
|
||||
cdef class SquaredLoss(Regression):
    """Squared loss traditionally used in linear regression.

    loss = 0.5 * (p - y)^2
    """
    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double residual = p - y
        return 0.5 * residual * residual

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double residual = p - y
        return residual

    def __reduce__(self):
        # pickle support: the loss carries no state
        return SquaredLoss, ()
|
||||
|
||||
|
||||
cdef class Huber(Regression):
    """Huber regression loss

    Variant of the SquaredLoss that is robust to outliers: quadratic while
    the absolute residual is at most `c`, linear beyond that.

    https://en.wikipedia.org/wiki/Huber_Loss_Function
    """

    cdef double c

    def __init__(self, double c):
        self.c = c

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double residual = p - y
        cdef double magnitude = fabs(residual)
        return (
            0.5 * residual * residual
            if magnitude <= self.c
            else self.c * magnitude - (0.5 * self.c * self.c)
        )

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double residual = p - y
        cdef double magnitude = fabs(residual)
        if magnitude <= self.c:
            return residual
        # outside the quadratic zone the slope is +/- c
        return self.c if residual > 0.0 else -self.c

    def __reduce__(self):
        # pickle support: rebuild from the stored cut-off
        return Huber, (self.c,)
|
||||
|
||||
|
||||
cdef class EpsilonInsensitive(Regression):
    """Epsilon-Insensitive loss (used by SVR).

    loss = max(0, |y - p| - epsilon)
    """

    cdef double epsilon

    def __init__(self, double epsilon):
        self.epsilon = epsilon

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double excess = fabs(y - p) - self.epsilon
        if excess > 0:
            return excess
        return 0

    cdef double dloss(self, double y, double p) noexcept nogil:
        if y - p > self.epsilon:
            # prediction too low by more than epsilon
            return -1
        if p - y > self.epsilon:
            # prediction too high by more than epsilon
            return 1
        return 0

    def __reduce__(self):
        # pickle support: rebuild from the stored epsilon
        return EpsilonInsensitive, (self.epsilon,)
|
||||
|
||||
|
||||
cdef class SquaredEpsilonInsensitive(Regression):
    """Squared Epsilon-Insensitive loss.

    loss = max(0, |y - p| - epsilon)^2
    """

    cdef double epsilon

    def __init__(self, double epsilon):
        self.epsilon = epsilon

    cdef double loss(self, double y, double p) noexcept nogil:
        cdef double excess = fabs(y - p) - self.epsilon
        if excess > 0:
            return excess * excess
        return 0

    cdef double dloss(self, double y, double p) noexcept nogil:
        cdef double gap = y - p
        if gap > self.epsilon:
            return -2 * (gap - self.epsilon)
        if gap < -self.epsilon:
            return 2 * (-gap - self.epsilon)
        return 0

    def __reduce__(self):
        # pickle support: rebuild from the stored epsilon
        return SquaredEpsilonInsensitive, (self.epsilon,)
|
||||
|
||||
{{for name_suffix, c_type, np_type in dtypes}}
|
||||
|
||||
def _plain_sgd{{name_suffix}}(
    const {{c_type}}[::1] weights,
    double intercept,
    const {{c_type}}[::1] average_weights,
    double average_intercept,
    LossFunction loss,
    int penalty_type,
    double alpha,
    double C,
    double l1_ratio,
    SequentialDataset{{name_suffix}} dataset,
    const unsigned char[::1] validation_mask,
    bint early_stopping,
    validation_score_cb,
    int n_iter_no_change,
    unsigned int max_iter,
    double tol,
    int fit_intercept,
    int verbose,
    bint shuffle,
    uint32_t seed,
    double weight_pos,
    double weight_neg,
    int learning_rate,
    double eta0,
    double power_t,
    bint one_class,
    double t=1.0,
    double intercept_decay=1.0,
    int average=0,
):
    """SGD for generic loss functions and penalties with optional averaging

    Parameters
    ----------
    weights : ndarray[{{c_type}}, ndim=1]
        The allocated vector of weights.
    intercept : double
        The initial intercept.
    average_weights : ndarray[{{c_type}}, ndim=1]
        The average weights as computed for ASGD. Should be None if average
        is 0.
    average_intercept : double
        The average intercept for ASGD. Should be 0 if average is 0.
    loss : LossFunction
        A concrete ``LossFunction`` object.
    penalty_type : int
        The penalty 2 for L2, 1 for L1, and 3 for Elastic-Net.
    alpha : float
        The regularization parameter.
    C : float
        Maximum step size for passive aggressive.
    l1_ratio : float
        The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.
        l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.
    dataset : SequentialDataset
        A concrete ``SequentialDataset`` object.
    validation_mask : ndarray[unsigned char, ndim=1]
        Equal to True on the validation set.
    early_stopping : boolean
        Whether to use a stopping criterion based on the validation set.
    validation_score_cb : callable
        A callable to compute a validation score given the current
        coefficients and intercept values.
        Used only if early_stopping is True.
    n_iter_no_change : int
        Number of iteration with no improvement to wait before stopping.
    max_iter : int
        The maximum number of iterations (epochs).
    tol: double
        The tolerance for the stopping criterion.
    fit_intercept : int
        Whether or not to fit the intercept (1 or 0).
    verbose : int
        Print verbose output; 0 for quiet.
    shuffle : boolean
        Whether to shuffle the training data before each epoch.
    weight_pos : float
        The weight of the positive class.
    weight_neg : float
        The weight of the negative class.
    seed : uint32_t
        Seed of the pseudorandom number generator used to shuffle the data.
    learning_rate : int
        The learning rate:
        (1) constant, eta = eta0
        (2) optimal, eta = 1.0 / (alpha * (optimal_init + t - 1)).
        (3) inverse scaling, eta = eta0 / pow(t, power_t)
        (4) adaptive decrease
        (5) Passive Aggressive-I, step = min(C, loss / sqnorm(x))
        (6) Passive Aggressive-II, step = loss / (sqnorm(x) + 0.5 / C)
    eta0 : double
        The initial learning rate.
    power_t : double
        The exponent for inverse scaling learning rate.
    one_class : boolean
        Whether to solve the One-Class SVM optimization problem.
    t : double
        Initial state of the learning rate. This value is equal to the
        iteration count except when the learning rate is set to `optimal`.
        Default: 1.0.
    intercept_decay : double
        Factor by which every intercept update is multiplied before it is
        added to the intercept. Default: 1.0.
    average : int
        The number of iterations before averaging starts. average=1 is
        equivalent to averaging for all iterations.


    Returns
    -------
    weights : array, shape=[n_features]
        The fitted weight vector.
    intercept : float
        The fitted intercept term.
    average_weights : array shape=[n_features]
        The averaged weights across iterations. Values are valid only if
        average > 0.
    average_intercept : float
        The averaged intercept across iterations.
        Values are valid only if average > 0.
    n_iter_ : int
        The actual number of iter (epochs).

    Raises
    ------
    ValueError
        If the intercept or any weight is non-finite at the end of an epoch.
    """

    # get the data information into easy vars
    cdef Py_ssize_t n_samples = dataset.n_samples
    cdef Py_ssize_t n_features = weights.shape[0]

    cdef WeightVector{{name_suffix}} w = WeightVector{{name_suffix}}(weights, average_weights)
    cdef {{c_type}} *x_data_ptr = NULL
    cdef int *x_ind_ptr = NULL

    # helper variables
    cdef int no_improvement_count = 0
    cdef bint infinity = False
    cdef int xnnz
    cdef double eta = 0.0
    cdef double p = 0.0
    cdef double update = 0.0
    cdef double intercept_update = 0.0
    cdef double sumloss = 0.0
    cdef double score = 0.0
    cdef double best_loss = INFINITY
    cdef double best_score = -INFINITY
    cdef {{c_type}} y = 0.0
    cdef {{c_type}} sample_weight
    cdef {{c_type}} class_weight = 1.0
    cdef unsigned int count = 0
    # number of samples that are not flagged in the validation mask
    cdef unsigned int train_count = n_samples - np.sum(validation_mask)
    cdef unsigned int epoch = 0
    cdef unsigned int i = 0
    cdef int is_hinge = isinstance(loss, Hinge)
    cdef double optimal_init = 0.0
    cdef double dloss = 0.0
    cdef double MAX_DLOSS = 1e12

    cdef long long sample_index

    # q vector is only used for L1 regularization
    cdef {{c_type}}[::1] q = None
    cdef {{c_type}} * q_data_ptr = NULL
    if penalty_type == L1 or penalty_type == ELASTICNET:
        q = np.zeros((n_features,), dtype={{np_type}}, order="c")
        q_data_ptr = &q[0]
    # running total of the L1 penalty strength, passed to l1penalty
    cdef double u = 0.0

    # pure L2 / pure L1 are handled as the two extremes of the mixing ratio
    if penalty_type == L2:
        l1_ratio = 0.0
    elif penalty_type == L1:
        l1_ratio = 1.0

    eta = eta0

    if learning_rate == OPTIMAL:
        typw = np.sqrt(1.0 / np.sqrt(alpha))
        # computing eta0, the initial learning rate
        initial_eta0 = typw / max(1.0, loss.dloss(1.0, -typw))
        # initialize t such that eta at first sample equals eta0
        optimal_init = 1.0 / (initial_eta0 * alpha)

    t_start = time()
    with nogil:
        for epoch in range(max_iter):
            sumloss = 0
            if verbose > 0:
                with gil:
                    print("-- Epoch %d" % (epoch + 1))
            if shuffle:
                dataset.shuffle(seed)
            for i in range(n_samples):
                dataset.next(&x_data_ptr, &x_ind_ptr, &xnnz,
                             &y, &sample_weight)

                sample_index = dataset.index_data_ptr[dataset.current_index]
                if validation_mask[sample_index]:
                    # do not learn on the validation set
                    continue

                p = w.dot(x_data_ptr, x_ind_ptr, xnnz) + intercept
                if learning_rate == OPTIMAL:
                    eta = 1.0 / (alpha * (optimal_init + t - 1))
                elif learning_rate == INVSCALING:
                    eta = eta0 / pow(t, power_t)

                # the training loss is only needed for reporting or for the
                # loss-based stopping criterion
                if verbose or not early_stopping:
                    sumloss += loss.loss(y, p)

                if y > 0.0:
                    class_weight = weight_pos
                else:
                    class_weight = weight_neg

                if learning_rate == PA1:
                    update = sqnorm(x_data_ptr, x_ind_ptr, xnnz)
                    # an all-zero sample cannot produce an update
                    if update == 0:
                        continue
                    update = min(C, loss.loss(y, p) / update)
                elif learning_rate == PA2:
                    update = sqnorm(x_data_ptr, x_ind_ptr, xnnz)
                    update = loss.loss(y, p) / (update + 0.5 / C)
                else:
                    dloss = loss.dloss(y, p)
                    # clip dloss with large values to avoid numerical
                    # instabilities
                    if dloss < -MAX_DLOSS:
                        dloss = -MAX_DLOSS
                    elif dloss > MAX_DLOSS:
                        dloss = MAX_DLOSS
                    update = -eta * dloss

                # the passive-aggressive step sizes above are unsigned; give
                # them a direction here
                if learning_rate >= PA1:
                    if is_hinge:
                        # classification
                        update *= y
                    elif y - p < 0:
                        # regression
                        update *= -1

                update *= class_weight * sample_weight

                if penalty_type >= L2:
                    # do not scale to negative values when eta or alpha are too
                    # big: instead set the weights to zero
                    w.scale(max(0, 1.0 - ((1.0 - l1_ratio) * eta * alpha)))

                if update != 0.0:
                    w.add(x_data_ptr, x_ind_ptr, xnnz, update)
                if fit_intercept == 1:
                    intercept_update = update
                    if one_class:  # specific for One-Class SVM
                        intercept_update -= 2. * eta * alpha
                    if intercept_update != 0:
                        intercept += intercept_update * intercept_decay

                if 0 < average <= t:
                    # compute the average for the intercept and update the
                    # average weights, this is done regardless as to whether
                    # the update is 0

                    w.add_average(x_data_ptr, x_ind_ptr, xnnz,
                                  update, (t - average + 1))
                    average_intercept += ((intercept - average_intercept) /
                                          (t - average + 1))

                if penalty_type == L1 or penalty_type == ELASTICNET:
                    u += (l1_ratio * eta * alpha)
                    l1penalty{{name_suffix}}(w, q_data_ptr, x_ind_ptr, xnnz, u)

                t += 1
                count += 1

            # report epoch information
            if verbose > 0:
                with gil:
                    print("Norm: %.2f, NNZs: %d, Bias: %.6f, T: %d, "
                          "Avg. loss: %f"
                          % (w.norm(), np.nonzero(weights)[0].shape[0],
                             intercept, count, sumloss / train_count))
                    print("Total training time: %.2f seconds."
                          % (time() - t_start))

            # floating-point under-/overflow check.
            # (the error itself is raised after leaving the nogil block)
            if (not isfinite(intercept) or any_nonfinite(weights)):
                infinity = True
                break

            # evaluate the score on the validation set
            if early_stopping:
                with gil:
                    score = validation_score_cb(weights.base, intercept)
                if tol > -INFINITY and score < best_score + tol:
                    no_improvement_count += 1
                else:
                    no_improvement_count = 0
                if score > best_score:
                    best_score = score
            # or evaluate the loss on the training set
            else:
                if tol > -INFINITY and sumloss > best_loss - tol * train_count:
                    no_improvement_count += 1
                else:
                    no_improvement_count = 0
                if sumloss < best_loss:
                    best_loss = sumloss

            # if there is no improvement several times in a row
            if no_improvement_count >= n_iter_no_change:
                # the adaptive schedule shrinks eta and keeps going until
                # eta is no larger than 1e-6; every other schedule stops
                if learning_rate == ADAPTIVE and eta > 1e-6:
                    eta = eta / 5
                    no_improvement_count = 0
                else:
                    if verbose:
                        with gil:
                            print("Convergence after %d epochs took %.2f "
                                  "seconds" % (epoch + 1, time() - t_start))
                    break

    if infinity:
        raise ValueError(("Floating-point under-/overflow occurred at epoch"
                          " #%d. Scaling input data with StandardScaler or"
                          " MinMaxScaler might help.") % (epoch + 1))

    w.reset_wscale()

    return (
        weights.base,
        intercept,
        None if average_weights is None else average_weights.base,
        average_intercept,
        epoch + 1
    )
|
||||
|
||||
{{endfor}}
|
||||
|
||||
|
||||
cdef inline bint any_nonfinite(const floating[::1] w) noexcept nogil:
    """Return True as soon as an entry of `w` fails `isfinite`, else False."""
    cdef Py_ssize_t pos
    for pos in range(w.shape[0]):
        if not isfinite(w[pos]):
            return True
    return False
|
||||
|
||||
|
||||
cdef inline double sqnorm(
    floating * x_data_ptr,
    int * x_ind_ptr,
    int xnnz,
) noexcept nogil:
    """Return the sum of squares of the first `xnnz` values of `x_data_ptr`.

    `x_ind_ptr` is accepted for signature symmetry with the other sparse
    helpers but is not read. The accumulation is carried out in double.
    """
    cdef double total = 0.0
    cdef double value
    cdef int k
    for k in range(xnnz):
        value = x_data_ptr[k]
        total += value * value
    return total
|
||||
|
||||
|
||||
{{for name_suffix, c_type, np_type in dtypes}}
|
||||
|
||||
cdef void l1penalty{{name_suffix}}(
    WeightVector{{name_suffix}} w,
    {{c_type}} * q_data_ptr,
    int *x_ind_ptr,
    int xnnz,
    double u,
) noexcept nogil:
    """Apply the L1 penalty to each updated feature

    This implements the truncated gradient approach by
    [Tsuruoka, Y., Tsujii, J., and Ananiadou, S., 2009].

    Only the features listed in `x_ind_ptr` are touched. For each of them
    the stored weight is pulled towards zero (never across it) and the
    change, expressed in unscaled units, is added to `q_data_ptr`.
    """
    cdef double w_before = 0.0
    cdef int nz = 0
    cdef int feature = 0
    cdef double scale = w.wscale
    cdef {{c_type}} *coef = w.w_data_ptr
    for nz in range(xnnz):
        feature = x_ind_ptr[nz]
        w_before = coef[feature]
        if scale * w_before > 0.0:
            # positive effective weight: shrink, clipping at zero
            coef[feature] = max(
                0.0, coef[feature] - ((u + q_data_ptr[feature]) / scale))

        elif scale * w_before < 0.0:
            # negative effective weight: grow, clipping at zero
            coef[feature] = min(
                0.0, coef[feature] + ((u - q_data_ptr[feature]) / scale))

        q_data_ptr[feature] += scale * (coef[feature] - w_before)
|
||||
|
||||
{{endfor}}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,456 @@
|
||||
"""
|
||||
A Theil-Sen Estimator for Multiple Linear Regression Model
|
||||
"""
|
||||
|
||||
# Author: Florian Wilhelm <florian.wilhelm@gmail.com>
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
|
||||
import warnings
|
||||
from itertools import combinations
|
||||
from numbers import Integral, Real
|
||||
|
||||
import numpy as np
|
||||
from joblib import effective_n_jobs
|
||||
from scipy import linalg
|
||||
from scipy.linalg.lapack import get_lapack_funcs
|
||||
from scipy.special import binom
|
||||
|
||||
from ..base import RegressorMixin, _fit_context
|
||||
from ..exceptions import ConvergenceWarning
|
||||
from ..utils import check_random_state
|
||||
from ..utils._param_validation import Interval
|
||||
from ..utils.parallel import Parallel, delayed
|
||||
from ._base import LinearModel
|
||||
|
||||
_EPSILON = np.finfo(np.double).eps
|
||||
|
||||
|
||||
def _modified_weiszfeld_step(X, x_old):
    """Modified Weiszfeld step.

    This function defines one iteration step in order to approximate the
    spatial median (L1 median). It is a form of an iteratively re-weighted
    least squares method.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training vector, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    x_old : ndarray of shape (n_features,)
        Current start vector.

    Returns
    -------
    x_new : ndarray of shape (n_features,)
        New iteration step. When the sum of unit vectors pointing from
        `x_old` to the samples has (numerically) zero norm, `x_old` is
        returned unchanged.

    References
    ----------
    - On Computation of Spatial Median for Robust Data Mining, 2005
      T. Kärkkäinen and S. Äyrämö
      http://users.jyu.fi/~samiayr/pdf/ayramo_eurogen05.pdf
    """
    diff = X - x_old
    diff_norm = np.sqrt(np.sum(diff**2, axis=1))
    # Samples that coincide with x_old are excluded to avoid dividing by zero
    mask = diff_norm >= _EPSILON
    # x_old equals one of our samples
    is_x_old_in_X = int(mask.sum() < X.shape[0])

    diff = diff[mask]
    diff_norm = diff_norm[mask][:, np.newaxis]
    quotient_norm = linalg.norm(np.sum(diff / diff_norm, axis=0))

    if quotient_norm > _EPSILON:  # to avoid division by zero
        new_direction = np.sum(X[mask, :] / diff_norm, axis=0) / np.sum(
            1 / diff_norm, axis=0
        )
    else:
        # The unit vectors cancel out, so x_old is a fixed point of the
        # iteration. Using x_old here (rather than an arbitrary constant)
        # keeps the result at x_old whether or not x_old is one of the
        # samples.
        new_direction = x_old
        quotient_norm = 1.0

    return (
        max(0.0, 1.0 - is_x_old_in_X / quotient_norm) * new_direction
        + min(1.0, is_x_old_in_X / quotient_norm) * x_old
    )
|
||||
|
||||
|
||||
def _spatial_median(X, max_iter=300, tol=1.0e-3):
    """Spatial median (L1 median).

    The spatial median is member of a class of so-called M-estimators which
    are defined by an optimization problem. Given a number of p points in an
    n-dimensional space, the point x minimizing the sum of all distances to the
    p other points is called spatial median.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training vector, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    max_iter : int, default=300
        Maximum number of iterations.

    tol : float, default=1.e-3
        Stop the algorithm if spatial_median has converged.

    Returns
    -------
    n_iter : int
        Zero-based index of the last iteration performed. For
        single-feature input no iteration is run and 1 is returned.

    spatial_median : ndarray of shape (n_features,)
        Spatial median.

    Warns
    -----
    ConvergenceWarning
        If `max_iter` iterations were run without meeting `tol`.

    References
    ----------
    - On Computation of Spatial Median for Robust Data Mining, 2005
      T. Kärkkäinen and S. Äyrämö
      http://users.jyu.fi/~samiayr/pdf/ayramo_eurogen05.pdf
    """
    if X.shape[1] == 1:
        # In one dimension the spatial median is the ordinary median
        return 1, np.median(X.ravel(), keepdims=True)

    tol **= 2  # We are computing the tol on the squared norm
    spatial_median_old = np.mean(X, axis=0)

    # Defined up front so that the return statement below is valid even when
    # the loop body never runs (max_iter == 0).
    n_iter = 0
    spatial_median = spatial_median_old

    for n_iter in range(max_iter):
        spatial_median = _modified_weiszfeld_step(X, spatial_median_old)
        if np.sum((spatial_median_old - spatial_median) ** 2) < tol:
            break
        else:
            spatial_median_old = spatial_median
    else:
        # for/else: reached only when the loop was not left through `break`
        warnings.warn(
            "Maximum number of iterations {max_iter} reached in "
            "spatial median for TheilSen regressor."
            "".format(max_iter=max_iter),
            ConvergenceWarning,
        )
    return n_iter, spatial_median
|
||||
|
||||
|
||||
def _breakdown_point(n_samples, n_subsamples):
    """Approximation of the breakdown point.

    Computes
    ``1 - (0.5**(1/k) * (n - k + 1) + k - 1) / n``
    with ``n = n_samples`` and ``k = n_subsamples``.

    Parameters
    ----------
    n_samples : int
        Number of samples.

    n_subsamples : int
        Number of subsamples to consider.

    Returns
    -------
    breakdown_point : float
        Approximation of breakdown point.
    """
    root = 0.5 ** (1 / n_subsamples)
    numerator = root * (n_samples - n_subsamples + 1) + n_subsamples - 1
    return 1 - numerator / n_samples
|
||||
|
||||
|
||||
def _lstsq(X, y, indices, fit_intercept):
|
||||
"""Least Squares Estimator for TheilSenRegressor class.
|
||||
|
||||
This function calculates the least squares method on a subset of rows of X
|
||||
and y defined by the indices array. Optionally, an intercept column is
|
||||
added if intercept is set to true.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Design matrix, where `n_samples` is the number of samples and
|
||||
`n_features` is the number of features.
|
||||
|
||||
y : ndarray of shape (n_samples,)
|
||||
Target vector, where `n_samples` is the number of samples.
|
||||
|
||||
indices : ndarray of shape (n_subpopulation, n_subsamples)
|
||||
Indices of all subsamples with respect to the chosen subpopulation.
|
||||
|
||||
fit_intercept : bool
|
||||
Fit intercept or not.
|
||||
|
||||
Returns
|
||||
-------
|
||||
weights : ndarray of shape (n_subpopulation, n_features + intercept)
|
||||
Solution matrix of n_subpopulation solved least square problems.
|
||||
"""
|
||||
fit_intercept = int(fit_intercept)
|
||||
n_features = X.shape[1] + fit_intercept
|
||||
n_subsamples = indices.shape[1]
|
||||
weights = np.empty((indices.shape[0], n_features))
|
||||
X_subpopulation = np.ones((n_subsamples, n_features))
|
||||
# gelss need to pad y_subpopulation to be of the max dim of X_subpopulation
|
||||
y_subpopulation = np.zeros((max(n_subsamples, n_features)))
|
||||
(lstsq,) = get_lapack_funcs(("gelss",), (X_subpopulation, y_subpopulation))
|
||||
|
||||
for index, subset in enumerate(indices):
|
||||
X_subpopulation[:, fit_intercept:] = X[subset, :]
|
||||
y_subpopulation[:n_subsamples] = y[subset]
|
||||
weights[index] = lstsq(X_subpopulation, y_subpopulation)[1][:n_features]
|
||||
|
||||
return weights
|
||||
|
||||
|
||||
class TheilSenRegressor(RegressorMixin, LinearModel):
    """Theil-Sen Estimator: robust multivariate regression model.

    The algorithm calculates least square solutions on subsets with size
    n_subsamples of the samples in X. Any value of n_subsamples between the
    number of features and samples leads to an estimator with a compromise
    between robustness and efficiency. Since the number of least square
    solutions is "n_samples choose n_subsamples", it can be extremely large
    and can therefore be limited with max_subpopulation. If this limit is
    reached, the subsets are chosen randomly. In a final step, the spatial
    median (or L1 median) is calculated of all least square solutions.

    Read more in the :ref:`User Guide <theil_sen_regression>`.

    Parameters
    ----------
    fit_intercept : bool, default=True
        Whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations.

    copy_X : bool, default=True
        If True, X will be copied; else, it may be overwritten.
        Note that this value is stored on the estimator but is not read by
        the `fit` method of this class.

    max_subpopulation : int, default=1e4
        Instead of computing with a set of cardinality 'n choose k', where n is
        the number of samples and k is the number of subsamples (at least
        number of features), consider only a stochastic subpopulation of a
        given maximal size if 'n choose k' is larger than max_subpopulation.
        For other than small problem sizes this parameter will determine
        memory usage and runtime if n_subsamples is not changed. Note that the
        data type should be int but floats such as 1e4 can be accepted too.

    n_subsamples : int, default=None
        Number of samples to calculate the parameters. This is at least the
        number of features (plus 1 if fit_intercept=True) and the number of
        samples as a maximum. A lower number leads to a higher breakdown
        point and a low efficiency while a high number leads to a low
        breakdown point and a high efficiency. If None, take the
        minimum number of subsamples leading to maximal robustness.
        If n_subsamples is set to n_samples, Theil-Sen is identical to least
        squares.

    max_iter : int, default=300
        Maximum number of iterations for the calculation of spatial median.

    tol : float, default=1e-3
        Tolerance when calculating spatial median.

    random_state : int, RandomState instance or None, default=None
        A random number generator instance to define the state of the random
        permutations generator. Pass an int for reproducible output across
        multiple function calls.
        See :term:`Glossary <random_state>`.

    n_jobs : int, default=None
        Number of CPUs to use to solve the least square problems of the
        subsamples in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    verbose : bool, default=False
        Verbose mode when fitting the model.

    Attributes
    ----------
    coef_ : ndarray of shape (n_features,)
        Coefficients of the regression model (median of distribution).

    intercept_ : float
        Estimated intercept of regression model.

    breakdown_ : float
        Approximated breakdown point.

    n_iter_ : int
        Number of iterations needed for the spatial median.

    n_subpopulation_ : int
        Number of combinations taken into account from 'n choose k', where n is
        the number of samples and k is the number of subsamples.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    HuberRegressor : Linear regression model that is robust to outliers.
    RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm.
    SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD.

    References
    ----------
    - Theil-Sen Estimators in a Multiple Linear Regression Model, 2009
      Xin Dang, Hanxiang Peng, Xueqin Wang and Heping Zhang
      http://home.olemiss.edu/~xdang/papers/MTSE.pdf

    Examples
    --------
    >>> from sklearn.linear_model import TheilSenRegressor
    >>> from sklearn.datasets import make_regression
    >>> X, y = make_regression(
    ...     n_samples=200, n_features=2, noise=4.0, random_state=0)
    >>> reg = TheilSenRegressor(random_state=0).fit(X, y)
    >>> reg.score(X, y)
    0.9884...
    >>> reg.predict(X[:1,])
    array([-31.5871...])
    """

    # Declarative constraints on the constructor parameters, one entry per
    # parameter name.
    _parameter_constraints: dict = {
        "fit_intercept": ["boolean"],
        "copy_X": ["boolean"],
        # target_type should be Integral but can accept Real for backward compatibility
        "max_subpopulation": [Interval(Real, 1, None, closed="left")],
        "n_subsamples": [None, Integral],
        # NOTE(review): 0 is accepted here, but the iteration loop in
        # `_spatial_median` does not run at all for max_iter=0 -- confirm that
        # this case is handled.
        "max_iter": [Interval(Integral, 0, None, closed="left")],
        "tol": [Interval(Real, 0.0, None, closed="left")],
        "random_state": ["random_state"],
        "n_jobs": [None, Integral],
        "verbose": ["verbose"],
    }

    def __init__(
        self,
        *,
        fit_intercept=True,
        copy_X=True,
        max_subpopulation=1e4,
        n_subsamples=None,
        max_iter=300,
        tol=1.0e-3,
        random_state=None,
        n_jobs=None,
        verbose=False,
    ):
        # Parameters are stored unchanged; nothing is validated or computed here.
        self.fit_intercept = fit_intercept
        self.copy_X = copy_X
        self.max_subpopulation = max_subpopulation
        self.n_subsamples = n_subsamples
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.verbose = verbose

    def _check_subparams(self, n_samples, n_features):
        """Resolve the subsample size and the subpopulation size.

        Parameters
        ----------
        n_samples : int
            Number of samples in the training data.

        n_features : int
            Number of features in the training data.

        Returns
        -------
        n_subsamples : int
            `self.n_subsamples` if it is set and consistent with the data,
            otherwise ``min(n_dim, n_samples)`` where `n_dim` is `n_features`
            plus one when an intercept is fitted.

        n_subpopulation : int
            The smaller of `self.max_subpopulation` and the (rounded) number
            of combinations 'n_samples choose n_subsamples'.

        Raises
        ------
        ValueError
            If `self.n_subsamples` is larger than `n_samples`, smaller than
            `n_dim` while ``n_samples >= n_features``, or different from
            `n_samples` while ``n_samples < n_features``.
        """
        n_subsamples = self.n_subsamples

        # Number of weights to estimate: one per feature, plus the intercept.
        if self.fit_intercept:
            n_dim = n_features + 1
        else:
            n_dim = n_features

        if n_subsamples is not None:
            if n_subsamples > n_samples:
                raise ValueError(
                    "Invalid parameter since n_subsamples > "
                    "n_samples ({0} > {1}).".format(n_subsamples, n_samples)
                )
            if n_samples >= n_features:
                if n_dim > n_subsamples:
                    plus_1 = "+1" if self.fit_intercept else ""
                    raise ValueError(
                        "Invalid parameter since n_features{0} "
                        "> n_subsamples ({1} > {2})."
                        "".format(plus_1, n_dim, n_subsamples)
                    )
            else:  # if n_samples < n_features
                if n_subsamples != n_samples:
                    raise ValueError(
                        "Invalid parameter since n_subsamples != "
                        "n_samples ({0} != {1}) while n_samples "
                        "< n_features.".format(n_subsamples, n_samples)
                    )
        else:
            # Default: the smallest usable subset size.
            n_subsamples = min(n_dim, n_samples)

        # Cap the number of subsets at max_subpopulation (at least one subset).
        all_combinations = max(1, np.rint(binom(n_samples, n_subsamples)))
        n_subpopulation = int(min(self.max_subpopulation, all_combinations))

        return n_subsamples, n_subpopulation

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y):
        """Fit linear model.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training data.
        y : ndarray of shape (n_samples,)
            Target values.

        Returns
        -------
        self : returns an instance of self.
            Fitted `TheilSenRegressor` estimator.
        """
        random_state = check_random_state(self.random_state)
        X, y = self._validate_data(X, y, y_numeric=True)
        n_samples, n_features = X.shape
        n_subsamples, self.n_subpopulation_ = self._check_subparams(
            n_samples, n_features
        )
        self.breakdown_ = _breakdown_point(n_samples, n_subsamples)

        if self.verbose:
            print("Breakdown point: {0}".format(self.breakdown_))
            print("Number of samples: {0}".format(n_samples))
            tol_outliers = int(self.breakdown_ * n_samples)
            print("Tolerable outliers: {0}".format(tol_outliers))
            print("Number of subpopulations: {0}".format(self.n_subpopulation_))

        # Determine indices of subpopulation
        if np.rint(binom(n_samples, n_subsamples)) <= self.max_subpopulation:
            # Few enough subsets: enumerate every combination.
            indices = list(combinations(range(n_samples), n_subsamples))
        else:
            # Too many subsets: draw n_subpopulation_ random subsets instead.
            indices = [
                random_state.choice(n_samples, size=n_subsamples, replace=False)
                for _ in range(self.n_subpopulation_)
            ]

        # Split the subsets into one chunk per job and solve the least square
        # problems of each chunk with `_lstsq`.
        n_jobs = effective_n_jobs(self.n_jobs)
        index_list = np.array_split(indices, n_jobs)
        weights = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(_lstsq)(X, y, index_list[job], self.fit_intercept)
            for job in range(n_jobs)
        )
        weights = np.vstack(weights)
        # The final estimate is the spatial median of all per-subset solutions.
        self.n_iter_, coefs = _spatial_median(
            weights, max_iter=self.max_iter, tol=self.tol
        )

        if self.fit_intercept:
            # `_lstsq` puts the intercept weight first.
            self.intercept_ = coefs[0]
            self.coef_ = coefs[1:]
        else:
            self.intercept_ = 0.0
            self.coef_ = coefs

        return self
|
||||
@@ -0,0 +1,35 @@
|
||||
# .pyx is generated, so this is needed to make Cython compilation work
linear_model_cython_tree = [
  fs.copyfile('__init__.py'),
  fs.copyfile('_sgd_fast.pxd'),
]

# _cd_fast is built directly from a checked-in .pyx file.
py.extension_module(
  '_cd_fast',
  ['_cd_fast.pyx', utils_cython_tree],
  cython_args: cython_args,
  subdir: 'sklearn/linear_model',
  install: true
)

# Extensions whose .pyx source is first generated from a .pyx.tp template.
name_list = ['_sgd_fast', '_sag_fast']

foreach name: name_list
  # Run tempita on <name>.pyx.tp to produce <name>.pyx.
  pyx = custom_target(
    name + '_pyx',
    output: name + '.pyx',
    input: name + '.pyx.tp',
    command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'],
    # TODO in principle this should go in py.extension_module below. This is
    # a temporary work-around for a dependency issue with .pyx.tp files. For
    # more details, see https://github.com/mesonbuild/meson/issues/13212
    depends: [linear_model_cython_tree, utils_cython_tree],
  )
  # Build the extension module from the generated .pyx.
  py.extension_module(
    name,
    pyx,
    cython_args: cython_args,
    subdir: 'sklearn/linear_model',
    install: true
  )
endforeach
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,789 @@
|
||||
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
|
||||
# Fabian Pedregosa <fabian.pedregosa@inria.fr>
|
||||
# Maria Telenczuk <https://github.com/maikia>
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy import linalg, sparse
|
||||
|
||||
from sklearn.datasets import load_iris, make_regression, make_sparse_uncorrelated
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.linear_model._base import (
|
||||
_preprocess_data,
|
||||
_rescale_data,
|
||||
make_dataset,
|
||||
)
|
||||
from sklearn.preprocessing import add_dummy_feature
|
||||
from sklearn.utils._testing import (
|
||||
assert_allclose,
|
||||
assert_array_almost_equal,
|
||||
assert_array_equal,
|
||||
)
|
||||
from sklearn.utils.fixes import (
|
||||
COO_CONTAINERS,
|
||||
CSC_CONTAINERS,
|
||||
CSR_CONTAINERS,
|
||||
LIL_CONTAINERS,
|
||||
)
|
||||
|
||||
rtol = 1e-6
|
||||
|
||||
|
||||
def test_linear_regression():
    """Check LinearRegression on a two-sample and on a one-sample dataset."""
    features, targets = [[1], [2]], [1, 2]
    model = LinearRegression()
    model.fit(features, targets)

    assert_array_almost_equal(model.coef_, [1])
    assert_array_almost_equal(model.intercept_, [0])
    assert_array_almost_equal(model.predict(features), [1, 2])

    # Degenerate input: a single sample.
    features, targets = [[1]], [0]
    model = LinearRegression()
    model.fit(features, targets)

    assert_array_almost_equal(model.coef_, [0])
    assert_array_almost_equal(model.intercept_, [0])
    assert_array_almost_equal(model.predict(features), [0])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS)
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_linear_regression_sample_weights(
    sparse_container, fit_intercept, global_random_seed
):
    """Compare weighted LinearRegression with the closed-form weighted solution."""
    rng = np.random.RandomState(global_random_seed)

    # More samples than features: it would not work with under-determined
    # systems.
    n_samples, n_features = 6, 5

    X = rng.normal(size=(n_samples, n_features))
    X = X if sparse_container is None else sparse_container(X)
    y = rng.normal(size=n_samples)
    sample_weight = 1.0 + rng.uniform(size=n_samples)

    # Estimator fitted with explicit sample weights.
    model = LinearRegression(fit_intercept=fit_intercept)
    model.fit(X, y, sample_weight=sample_weight)
    fitted_coef, fitted_intercept = model.coef_, model.intercept_

    assert model.coef_.shape == (X.shape[1],)  # sanity check

    # Closed form of the weighted least squares:
    # theta = (X^T W X)^(-1) @ X^T W y
    weight_matrix = np.diag(sample_weight)
    design = add_dummy_feature(X) if fit_intercept else X
    design_t_w = design.T @ weight_matrix
    closed_form = linalg.solve(design_t_w @ design, design_t_w @ y)

    if fit_intercept:
        # The dummy feature comes first, so entry 0 is the intercept.
        assert_allclose(fitted_coef, closed_form[1:])
        assert_allclose(fitted_intercept, closed_form[0])
    else:
        assert_allclose(fitted_coef, closed_form)
|
||||
|
||||
|
||||
def test_raises_value_error_if_positive_and_sparse():
    """Fitting with positive=True on sparse X must raise a TypeError."""
    expected_message = "Sparse data was passed for X, but dense data is required."
    # X must not be sparse if positive == True
    sparse_X, targets = sparse.eye(10), np.ones(10)
    estimator = LinearRegression(positive=True)

    with pytest.raises(TypeError, match=expected_message):
        estimator.fit(sparse_X, targets)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_samples, n_features", [(2, 3), (3, 2)])
def test_raises_value_error_if_sample_weights_greater_than_1d(n_samples, n_features):
    """Check that 1D-array and scalar sample weights are accepted by fit."""
    # Sample weights must be either scalar or 1D
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)
    y = rng.randn(n_samples)
    valid_sample_weights = (rng.randn(n_samples) ** 2 + 1, 1.0, 2.0)

    reg = LinearRegression()

    # make sure the "OK" sample weights actually work
    for sample_weight in valid_sample_weights:
        reg.fit(X, y, sample_weight)
|
||||
|
||||
|
||||
def test_fit_intercept():
    """Check that fit_intercept does not change the shape of coef_."""
    X2 = np.array([[0.38349978, 0.61650022], [0.58853682, 0.41146318]])
    X3 = np.array(
        [[0.27677969, 0.70693172, 0.01628859], [0.08385139, 0.20692515, 0.70922346]]
    )
    y = np.array([1, 1])

    def fitted_coef(X, **params):
        # Fit a fresh estimator and return its coefficient array.
        return LinearRegression(**params).fit(X, y).coef_

    coef2_no_intercept = fitted_coef(X2, fit_intercept=False)
    coef2_intercept = fitted_coef(X2)

    coef3_no_intercept = fitted_coef(X3, fit_intercept=False)
    coef3_intercept = fitted_coef(X3)

    assert coef2_intercept.shape == coef2_no_intercept.shape
    assert coef3_intercept.shape == coef3_no_intercept.shape
    assert coef2_no_intercept.ndim == coef3_no_intercept.ndim
|
||||
|
||||
|
||||
def test_linear_regression_sparse(global_random_seed):
    """Check that linear regression also works with sparse data."""
    rng = np.random.RandomState(global_random_seed)
    size = 100
    identity = sparse.eye(size, size)
    true_beta = rng.rand(size)
    targets = identity @ true_beta

    model = LinearRegression()
    model.fit(identity, targets.ravel())
    assert_array_almost_equal(true_beta, model.coef_ + model.intercept_)

    residuals = model.predict(identity) - targets.ravel()
    assert_array_almost_equal(residuals, 0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("fit_intercept", [True, False])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_linear_regression_sparse_equal_dense(fit_intercept, csr_container):
    """Check that linear regression agrees between sparse and dense input."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 200, 2
    X_dense = rng.randn(n_samples, n_features)
    X_dense[X_dense < 0.1] = 0.0
    X_sparse = csr_container(X_dense)
    y = rng.rand(n_samples)

    dense_model = LinearRegression(fit_intercept=fit_intercept)
    sparse_model = LinearRegression(fit_intercept=fit_intercept)
    dense_model.fit(X_dense, y)
    sparse_model.fit(X_sparse, y)

    assert dense_model.intercept_ == pytest.approx(sparse_model.intercept_)
    assert_allclose(dense_model.coef_, sparse_model.coef_)
|
||||
|
||||
|
||||
def test_linear_regression_multiple_outcome():
    """Check multiple-outcome linear regression against the single-outcome fit."""
    rng = np.random.RandomState(0)
    X, y = make_regression(random_state=rng)

    # Two identical target columns.
    Y = np.vstack((y, y)).T
    expected_coef_shape = (2, X.shape[1])

    model = LinearRegression()
    model.fit((X), Y)
    assert model.coef_.shape == expected_coef_shape
    multi_pred = model.predict(X)

    # Refit the same estimator on the single target.
    model.fit(X, y)
    single_pred = model.predict(X)
    assert_array_almost_equal(
        np.column_stack((single_pred, single_pred)), multi_pred, decimal=3
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_linear_regression_sparse_multiple_outcome(global_random_seed, coo_container):
    """Check multiple-outcome linear regression with sparse data."""
    rng = np.random.RandomState(global_random_seed)
    X, y = make_sparse_uncorrelated(random_state=rng)
    X = coo_container(X)
    # Two identical target columns.
    Y = np.vstack((y, y)).T
    expected_coef_shape = (2, X.shape[1])

    model = LinearRegression()
    model.fit(X, Y)
    assert model.coef_.shape == expected_coef_shape
    multi_pred = model.predict(X)

    # Refit the same estimator on the single target.
    model.fit(X, y.ravel())
    single_pred = model.predict(X)
    assert_array_almost_equal(
        np.column_stack((single_pred, single_pred)), multi_pred, decimal=3
    )
|
||||
|
||||
|
||||
def test_linear_regression_positive():
    """Check nonnegative LinearRegression on two tiny datasets."""
    features, targets = [[1], [2]], [1, 2]
    model = LinearRegression(positive=True)
    model.fit(features, targets)

    assert_array_almost_equal(model.coef_, [1])
    assert_array_almost_equal(model.intercept_, [0])
    assert_array_almost_equal(model.predict(features), [1, 2])

    # Degenerate input: a single sample.
    features, targets = [[1]], [0]
    model = LinearRegression(positive=True)
    model.fit(features, targets)

    assert_allclose(model.coef_, [0])
    assert_allclose(model.intercept_, [0])
    assert_allclose(model.predict(features), [0])
|
||||
|
||||
|
||||
def test_linear_regression_positive_multiple_outcome(global_random_seed):
    """Check multiple-outcome nonnegative linear regression."""
    rng = np.random.RandomState(global_random_seed)
    X, y = make_sparse_uncorrelated(random_state=rng)
    # Two identical target columns.
    Y = np.vstack((y, y)).T
    expected_coef_shape = (2, X.shape[1])

    model = LinearRegression(positive=True)
    model.fit(X, Y)
    assert model.coef_.shape == expected_coef_shape
    assert np.all(model.coef_ >= 0.0)
    multi_pred = model.predict(X)

    # Refit the same estimator on the single target.
    model.fit(X, y.ravel())
    single_pred = model.predict(X)
    assert_allclose(np.column_stack((single_pred, single_pred)), multi_pred)
|
||||
|
||||
|
||||
def test_linear_regression_positive_vs_nonpositive(global_random_seed):
    """Check that positive=True and positive=False give different coefficients."""
    rng = np.random.RandomState(global_random_seed)
    X, y = make_sparse_uncorrelated(random_state=rng)

    constrained = LinearRegression(positive=True)
    constrained.fit(X, y)
    unconstrained = LinearRegression(positive=False)
    unconstrained.fit(X, y)

    mean_squared_gap = np.mean((constrained.coef_ - unconstrained.coef_) ** 2)
    assert mean_squared_gap > 1e-3
|
||||
|
||||
|
||||
def test_linear_regression_positive_vs_nonpositive_when_positive(global_random_seed):
    """Check that both fits agree when the true coefficients are positive."""
    rng = np.random.RandomState(global_random_seed)
    n_samples, n_features = 200, 4
    X = rng.rand(n_samples, n_features)
    # Noise-free target built from positive coefficients only.
    y = X[:, 0] + 2 * X[:, 1] + 3 * X[:, 2] + 1.5 * X[:, 3]

    constrained = LinearRegression(positive=True)
    constrained.fit(X, y)
    unconstrained = LinearRegression(positive=False)
    unconstrained.fit(X, y)

    mean_squared_gap = np.mean((constrained.coef_ - unconstrained.coef_) ** 2)
    assert mean_squared_gap < 1e-6
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS)
@pytest.mark.parametrize("use_sw", [True, False])
def test_inplace_data_preprocessing(sparse_container, use_sw, global_random_seed):
    """Check which inputs LinearRegression.fit may modify in place."""
    rng = np.random.RandomState(global_random_seed)
    original_X_data = rng.randn(10, 12)
    original_y_data = rng.randn(10, 2)
    original_sw_data = rng.rand(10)

    is_sparse = sparse_container is not None
    X = sparse_container(original_X_data) if is_sparse else original_X_data.copy()
    y = original_y_data.copy()
    # XXX: Note that y_sparse is not supported (broken?) in the current
    # implementation of LinearRegression.

    sample_weight = original_sw_data.copy() if use_sw else None

    # Default copy_X: neither X nor y may be modified in place.
    model = LinearRegression()
    model.fit(X, y, sample_weight=sample_weight)
    assert_allclose(X.toarray() if is_sparse else X, original_X_data)
    assert_allclose(y, original_y_data)

    if use_sw:
        assert_allclose(sample_weight, original_sw_data)

    # copy_X=False: in-place preprocessing of X is allowed.
    model = LinearRegression(copy_X=False)
    model.fit(X, y, sample_weight=sample_weight)
    if not is_sparse:
        # X has been offset (and optionally rescaled by sample weights)
        # inplace. The 0.42 threshold is arbitrary and has been found to be
        # robust to any random seed in the admissible range.
        assert np.linalg.norm(X - original_X_data) > 0.42
    else:
        # No optimization relying on the inplace modification of sparse input
        # data has been implemented at this time.
        assert_allclose(X.toarray(), original_X_data)

    # y should not have been modified inplace by LinearRegression.fit.
    assert_allclose(y, original_y_data)

    if use_sw:
        # Sample weights have no reason to ever be modified inplace.
        assert_allclose(sample_weight, original_sw_data)
|
||||
|
||||
|
||||
def test_linear_regression_pd_sparse_dataframe_warning():
    """Check the warning raised for DataFrames that mix dense and sparse columns.

    A `UserWarning` is expected when only some columns are sparse, and no
    `UserWarning` may be raised once every column is sparse.
    """
    pd = pytest.importorskip("pandas")

    # Warning is raised only when some of the columns are sparse:
    # column "0" is dense, columns "1" to "3" are sparse.
    df = pd.DataFrame({"0": np.random.randn(10)})
    for col in range(1, 4):
        arr = np.random.randn(10)
        arr[:8] = 0
        df[str(col)] = pd.arrays.SparseArray(arr, fill_value=0)

    msg = "pandas.DataFrame with sparse columns found."

    reg = LinearRegression()
    with pytest.warns(UserWarning, match=msg):
        reg.fit(df.iloc[:, 0:2], df.iloc[:, 3])

    # does not warn when the whole dataframe is sparse
    df["0"] = pd.arrays.SparseArray(df["0"], fill_value=0)
    assert hasattr(df, "sparse")

    with warnings.catch_warnings():
        # Turn any UserWarning into an error so the fit below fails if one
        # is emitted.
        warnings.simplefilter("error", UserWarning)
        reg.fit(df.iloc[:, 0:2], df.iloc[:, 3])
|
||||
|
||||
|
||||
def test_preprocess_data(global_random_seed):
    """Check _preprocess_data on dense data with and without an intercept."""
    rng = np.random.RandomState(global_random_seed)
    n_samples, n_features = 200, 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    expected_X_mean, expected_y_mean = np.mean(X, axis=0), np.mean(y, axis=0)

    # Without an intercept the data is expected back unchanged, with zero
    # offsets and unit scale.
    X_out, y_out, X_offset, y_offset, X_scale = _preprocess_data(
        X, y, fit_intercept=False
    )
    assert_array_almost_equal(X_offset, np.zeros(n_features))
    assert_array_almost_equal(y_offset, 0)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(X_out, X)
    assert_array_almost_equal(y_out, y)

    # With an intercept the data is expected to be centered on its mean.
    X_out, y_out, X_offset, y_offset, X_scale = _preprocess_data(
        X, y, fit_intercept=True
    )
    assert_array_almost_equal(X_offset, expected_X_mean)
    assert_array_almost_equal(y_offset, expected_y_mean)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(X_out, X - expected_X_mean)
    assert_array_almost_equal(y_out, y - expected_y_mean)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS)
def test_preprocess_data_multioutput(global_random_seed, sparse_container):
    """Check the y outputs of _preprocess_data for a two-column target."""
    rng = np.random.RandomState(global_random_seed)
    n_samples, n_features, n_outputs = 200, 3, 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples, n_outputs)
    expected_y_mean = np.mean(y, axis=0)

    X = X if sparse_container is None else sparse_container(X)

    # Without an intercept y is expected back unchanged.
    _, y_out, _, y_offset, _ = _preprocess_data(X, y, fit_intercept=False)
    assert_array_almost_equal(y_offset, np.zeros(n_outputs))
    assert_array_almost_equal(y_out, y)

    # With an intercept y is expected to be centered column-wise.
    _, y_out, _, y_offset, _ = _preprocess_data(X, y, fit_intercept=True)
    assert_array_almost_equal(y_offset, expected_y_mean)
    assert_array_almost_equal(y_out, y - y_offset)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS)
def test_preprocess_data_weighted(sparse_container, global_random_seed):
    """Check _preprocess_data offsets when sample weights are given.

    The expected offsets are the sample-weighted means of X and y, and the
    returned scale is expected to be all ones.
    """
    rng = np.random.RandomState(global_random_seed)
    n_samples = 200
    n_features = 4
    # Generate random data with 50% of zero values to make sure
    # that the sparse variant of this test is actually sparse. This also
    # shifts the mean value for each columns in X further away from
    # zero.
    X = rng.rand(n_samples, n_features)
    X[X < 0.5] = 0.0

    # Scale the first feature of X to be 10 times larger than the others.
    X[:, 0] *= 10

    # Constant non-zero feature.
    X[:, 2] = 1.0

    # Constant zero feature (non-materialized in the sparse case)
    X[:, 3] = 0.0
    y = rng.rand(n_samples)

    sample_weight = rng.rand(n_samples)
    expected_X_mean = np.average(X, axis=0, weights=sample_weight)
    expected_y_mean = np.average(y, axis=0, weights=sample_weight)

    # Sanity check on the generated data: only the last two features have a
    # (near) zero weighted variance.
    X_sample_weight_var = np.average(
        (X - expected_X_mean) ** 2, weights=sample_weight, axis=0
    )
    constant_mask = X_sample_weight_var < 10 * np.finfo(X.dtype).eps
    assert_array_equal(constant_mask, [0, 0, 1, 1])

    if sparse_container is not None:
        X = sparse_container(X)

    Xt, yt, X_mean, y_mean, X_scale = _preprocess_data(
        X,
        y,
        fit_intercept=True,
        sample_weight=sample_weight,
    )
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    if sparse_container is not None:
        # Sparse X is expected back with unchanged values.
        assert_array_almost_equal(Xt.toarray(), X.toarray())
    else:
        assert_array_almost_equal(Xt, X - expected_X_mean)
    assert_array_almost_equal(yt, y - expected_y_mean)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("lil_container", LIL_CONTAINERS)
def test_sparse_preprocess_data_offsets(global_random_seed, lil_container):
    """Check _preprocess_data offsets and outputs for LIL sparse input."""
    rng = np.random.RandomState(global_random_seed)
    n_samples, n_features = 200, 2
    X = lil_container(
        sparse.rand(n_samples, n_features, density=0.5, random_state=rng)
    )
    y = rng.rand(n_samples)
    X_dense = X.toarray()

    # Without an intercept: zero offsets, data unchanged.
    X_out, y_out, X_offset, y_offset, X_scale = _preprocess_data(
        X, y, fit_intercept=False
    )
    assert_array_almost_equal(X_offset, np.zeros(n_features))
    assert_array_almost_equal(y_offset, 0)
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(X_out.toarray(), X_dense)
    assert_array_almost_equal(y_out, y)

    # With an intercept: offsets are the means, y is centered, and the
    # sparse X values are expected back unchanged.
    X_out, y_out, X_offset, y_offset, X_scale = _preprocess_data(
        X, y, fit_intercept=True
    )
    assert_array_almost_equal(X_offset, np.mean(X_dense, axis=0))
    assert_array_almost_equal(y_offset, np.mean(y, axis=0))
    assert_array_almost_equal(X_scale, np.ones(n_features))
    assert_array_almost_equal(X_out.toarray(), X_dense)
    assert_array_almost_equal(y_out, y - np.mean(y, axis=0))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_csr_preprocess_data(csr_container):
    """Check that _preprocess_data returns CSR output for CSR input."""
    X, y = make_regression()
    # Zero out most entries before converting to CSR.
    X[X < 2.5] = 0.0
    X_csr = csr_container(X)
    X_out, y, _, _, _ = _preprocess_data(X_csr, y, fit_intercept=True)
    assert X_out.format == "csr"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS)
@pytest.mark.parametrize("to_copy", (True, False))
def test_preprocess_copy_data_no_checks(sparse_container, to_copy):
    """Check the `copy` flag of `_preprocess_data` when `check_input=False`.

    The returned X must share memory with the input exactly when no copy
    was requested. For sparse containers the `.data` buffers are compared,
    for dense arrays the arrays themselves.
    """
    X, y = make_regression()
    X[X < 2.5] = 0.0

    is_sparse = sparse_container is not None
    if is_sparse:
        X = sparse_container(X)

    X_, y_, _, _, _ = _preprocess_data(
        X, y, fit_intercept=True, copy=to_copy, check_input=False
    )

    if is_sparse:
        returned, original = X_.data, X.data
    else:
        returned, original = X_, X

    assert np.may_share_memory(returned, original) == (not to_copy)
|
||||
|
||||
|
||||
def test_dtype_preprocess_data(global_random_seed):
    """Check that every `_preprocess_data` output has the dtype of X.

    float32 and float64 inputs are tried in all four X/y combinations, with
    and without an intercept. The inputs must keep their own dtype, and the
    pure-float32 results must agree with the pure-float64 ones.
    """
    rng = np.random.RandomState(global_random_seed)
    n_samples, n_features = 200, 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)

    X_32, y_32 = np.asarray(X, dtype=np.float32), np.asarray(y, dtype=np.float32)
    X_64, y_64 = np.asarray(X, dtype=np.float64), np.asarray(y, dtype=np.float64)

    # name -> (X passed in, y passed in, dtype expected for every output)
    cases = {
        "32": (X_32, y_32, np.float32),
        "64": (X_64, y_64, np.float64),
        "3264": (X_32, y_64, np.float32),
        "6432": (X_64, y_32, np.float64),
    }

    for fit_intercept in [True, False]:
        results = {
            name: _preprocess_data(X_in, y_in, fit_intercept=fit_intercept)
            for name, (X_in, y_in, _) in cases.items()
        }

        for name, (_, _, expected_dtype) in cases.items():
            Xt, yt, X_mean, y_mean, X_scale = results[name]
            for output in (Xt, yt, X_mean, y_mean, X_scale):
                assert output.dtype == expected_dtype

        # The input arrays must still have the dtype they were created with.
        assert X_32.dtype == np.float32
        assert y_32.dtype == np.float32
        assert X_64.dtype == np.float64
        assert y_64.dtype == np.float64

        # Single- and double-precision results must agree numerically.
        for out_32, out_64 in zip(results["32"], results["64"]):
            assert_array_almost_equal(out_32, out_64)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_targets", [None, 2])
@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS)
def test_rescale_data(n_targets, sparse_container, global_random_seed):
    """Check `_rescale_data` against a direct sqrt(sample_weight) scaling.

    Covers a 1-D target (`n_targets=None`) and a 2-D target, for dense
    inputs and for X and y both wrapped in a sparse container.
    """
    rng = np.random.RandomState(global_random_seed)
    n_samples, n_features = 200, 2
    single_target = n_targets is None
    use_sparse = sparse_container is not None

    sample_weight = 1.0 + rng.rand(n_samples)
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples) if single_target else rng.rand(n_samples, n_targets)

    # Reference values, computed on the dense data before any conversion.
    expected_sqrt_sw = np.sqrt(sample_weight)
    column_sqrt_sw = expected_sqrt_sw[:, np.newaxis]
    expected_rescaled_X = X * column_sqrt_sw
    expected_rescaled_y = y * (expected_sqrt_sw if single_target else column_sqrt_sw)

    if use_sparse:
        X = sparse_container(X)
        # A 1-D target is turned into a single column before wrapping.
        y = sparse_container(y.reshape(-1, 1) if single_target else y)

    rescaled_X, rescaled_y, sqrt_sw = _rescale_data(X, y, sample_weight)

    assert_allclose(sqrt_sw, expected_sqrt_sw)

    if use_sparse:
        rescaled_X = rescaled_X.toarray()
        rescaled_y = rescaled_y.toarray()
        if single_target:
            # Undo the column reshape applied above.
            rescaled_y = rescaled_y.ravel()

    assert_allclose(rescaled_X, expected_rescaled_X)
    assert_allclose(rescaled_y, expected_rescaled_y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_fused_types_make_dataset(csr_container):
    """Check that `make_dataset` keeps float32 / float64 for dense and CSR X.

    The first sample drawn from each dataset must carry the dtype of its
    input, and dense and CSR datasets built from the same data must yield
    the same first sample.
    """

    def first_sample(dataset):
        # `_next_py()` is unpacked as a 4-tuple (x, y, _, _), with x itself a
        # 3-tuple whose first entry is the sample's data array.
        xi, yi, _, _ = dataset._next_py()
        xi_data, _, _ = xi
        return xi_data, yi

    iris = load_iris()

    X_32 = iris.data.astype(np.float32)
    y_32 = iris.target.astype(np.float32)
    X_csr_32 = csr_container(X_32)
    sample_weight_32 = np.arange(y_32.size, dtype=np.float32)

    X_64 = iris.data.astype(np.float64)
    y_64 = iris.target.astype(np.float64)
    X_csr_64 = csr_container(X_64)
    sample_weight_64 = np.arange(y_64.size, dtype=np.float64)

    # Dense input.
    dataset_32, _ = make_dataset(X_32, y_32, sample_weight_32)
    dataset_64, _ = make_dataset(X_64, y_64, sample_weight_64)
    xi_data_32, yi_32 = first_sample(dataset_32)
    xi_data_64, yi_64 = first_sample(dataset_64)

    assert xi_data_32.dtype == np.float32
    assert xi_data_64.dtype == np.float64
    assert_allclose(yi_64, yi_32, rtol=rtol)

    # CSR input.
    datasetcsr_32, _ = make_dataset(X_csr_32, y_32, sample_weight_32)
    datasetcsr_64, _ = make_dataset(X_csr_64, y_64, sample_weight_64)
    xicsr_data_32, yicsr_32 = first_sample(datasetcsr_32)
    xicsr_data_64, yicsr_64 = first_sample(datasetcsr_64)

    assert xicsr_data_32.dtype == np.float32
    assert xicsr_data_64.dtype == np.float64

    assert_allclose(xicsr_data_64, xicsr_data_32, rtol=rtol)
    assert_allclose(yicsr_64, yicsr_32, rtol=rtol)

    # Dense and CSR datasets must produce identical first samples.
    assert_array_equal(xi_data_32, xicsr_data_32)
    assert_array_equal(xi_data_64, xicsr_data_64)
    assert_array_equal(yi_32, yicsr_32)
    assert_array_equal(yi_64, yicsr_64)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sparse_container", [None] + CSR_CONTAINERS)
@pytest.mark.parametrize("fit_intercept", [False, True])
def test_linear_regression_sample_weight_consistency(
    sparse_container, fit_intercept, global_random_seed
):
    """Check that sample_weight affects LinearRegression consistently.

    Five equivalences are verified: unit weights vs. no weights, a scalar
    weight vs. no weights, invariance to a global rescaling of the weights,
    zero weights vs. dropped samples, and doubled weights vs. duplicated
    samples.

    This is stricter than the common test check_sample_weights_invariance
    alone and additionally exercises sparse X. It closely mirrors
    test_enet_sample_weight_consistency.
    """

    def check_same_fit(model, expected_coef, expected_intercept, coef_rtol=1e-6):
        # Compare a refitted model with reference values; the intercept is
        # only compared when one is actually being fitted.
        assert_allclose(model.coef_, expected_coef, rtol=coef_rtol)
        if fit_intercept:
            assert_allclose(model.intercept_, expected_intercept)

    rng = np.random.RandomState(global_random_seed)
    n_samples, n_features = 10, 5
    is_sparse = sparse_container is not None

    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    if is_sparse:
        X = sparse_container(X)
    params = dict(fit_intercept=fit_intercept)

    # Reference fit without any weights.
    reg = LinearRegression(**params).fit(X, y, sample_weight=None)
    coef = reg.coef_.copy()
    intercept = reg.intercept_ if fit_intercept else None

    # 1) sample_weight=np.ones(..) must be equivalent to sample_weight=None.
    # Same check as check_sample_weights_invariance(name, reg, kind="ones"),
    # but sparse input is covered as well.
    reg.fit(X, y, sample_weight=np.ones_like(y))
    check_same_fit(reg, coef, intercept)

    # 2) sample_weight=None should be equivalent to sample_weight = number
    reg.fit(X, y, sample_weight=123.0)
    check_same_fit(reg, coef, intercept)

    # 3) scaling of sample_weight should have no effect, cf. np.average()
    sample_weight = rng.uniform(low=0.01, high=2, size=X.shape[0])
    reg = reg.fit(X, y, sample_weight=sample_weight)
    coef = reg.coef_.copy()
    intercept = reg.intercept_ if fit_intercept else None

    reg.fit(X, y, sample_weight=np.pi * sample_weight)
    # A looser tolerance is used for sparse input.
    check_same_fit(reg, coef, intercept, coef_rtol=1e-5 if is_sparse else 1e-6)

    # 4) setting elements of sample_weight to 0 is equivalent to removing
    # these samples
    sample_weight_0 = sample_weight.copy()
    sample_weight_0[-5:] = 0
    y[-5:] *= 1000  # to make excluding those samples important
    reg.fit(X, y, sample_weight=sample_weight_0)
    coef_0 = reg.coef_.copy()
    intercept_0 = reg.intercept_ if fit_intercept else None
    reg.fit(X[:-5], y[:-5], sample_weight=sample_weight[:-5])
    # FIXME: https://github.com/scikit-learn/scikit-learn/issues/26164
    # The dense case with an intercept is deliberately not asserted. It often
    # fails, e.g. when calling
    # SKLEARN_TESTS_GLOBAL_RANDOM_SEED="all" pytest \
    #    sklearn/linear_model/tests/test_base.py\
    #    ::test_linear_regression_sample_weight_consistency
    if not fit_intercept or is_sparse:
        check_same_fit(reg, coef_0, intercept_0, coef_rtol=1e-5)

    # 5) check that multiplying sample_weight by 2 is equivalent to repeating
    # corresponding samples twice
    half = n_samples // 2
    if is_sparse:
        X2 = sparse.vstack([X, X[:half]], format="csc")
    else:
        X2 = np.concatenate([X, X[:half]], axis=0)
    y2 = np.concatenate([y, y[:half]])
    sample_weight_1 = sample_weight.copy()
    sample_weight_1[:half] *= 2
    sample_weight_2 = np.concatenate([sample_weight, sample_weight[:half]], axis=0)

    reg1 = LinearRegression(**params).fit(X, y, sample_weight=sample_weight_1)
    reg2 = LinearRegression(**params).fit(X2, y2, sample_weight=sample_weight_2)
    check_same_fit(reg1, reg2.coef_, reg2.intercept_)
|
||||
@@ -0,0 +1,299 @@
|
||||
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
|
||||
# Fabian Pedregosa <fabian.pedregosa@inria.fr>
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
from math import log
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn import datasets
|
||||
from sklearn.linear_model import ARDRegression, BayesianRidge, Ridge
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils._testing import (
|
||||
_convert_container,
|
||||
assert_almost_equal,
|
||||
assert_array_almost_equal,
|
||||
assert_array_less,
|
||||
)
|
||||
from sklearn.utils.extmath import fast_logdet
|
||||
|
||||
diabetes = datasets.load_diabetes()
|
||||
|
||||
|
||||
def test_bayesian_ridge_scores():
    """Check that `scores_` holds one entry more than `n_iter_`."""
    model = BayesianRidge(compute_score=True)
    model.fit(diabetes.data, diabetes.target)

    expected_shape = (model.n_iter_ + 1,)
    assert model.scores_.shape == expected_shape
|
||||
|
||||
|
||||
def test_bayesian_ridge_score_values():
    """Check value of score on toy example.

    Compute log marginal likelihood with equation (36) in Sparse Bayesian
    Learning and the Relevance Vector Machine (Tipping, 2001):

    - 0.5 * (log |Id/alpha + X.X^T/lambda| +
             y^T.(Id/alpha + X.X^T/lambda)^-1.y + n * log(2 * pi))
    + lambda_1 * log(lambda) - lambda_2 * lambda
    + alpha_1 * log(alpha) - alpha_2 * alpha

    and check equality with the score computed during training.
    """
    X, y = diabetes.data, diabetes.target
    n_samples = X.shape[0]

    # Initial values of alpha and lambda (see the estimator code for these).
    eps = np.finfo(np.float64).eps
    alpha_ = 1.0 / (np.var(y) + eps)
    lambda_ = 1.0

    # Parameters of the Gamma hyperpriors, all set to the same value.
    alpha_1 = alpha_2 = lambda_1 = lambda_2 = 0.1

    # Evaluate the docstring formula term by term.
    lambda_prior = lambda_1 * log(lambda_) - lambda_2 * lambda_
    alpha_prior = alpha_1 * log(alpha_) - alpha_2 * alpha_
    M = 1.0 / alpha_ * np.eye(n_samples) + 1.0 / lambda_ * np.dot(X, X.T)
    # M^-1 . y, obtained by solving the linear system instead of inverting M.
    M_inv_dot_y = np.linalg.solve(M, y)
    data_term = -0.5 * (
        fast_logdet(M) + np.dot(y.T, M_inv_dot_y) + n_samples * log(2 * np.pi)
    )
    score = lambda_prior
    score += alpha_prior
    score += data_term

    # Same quantity as reported by BayesianRidge after a single iteration.
    clf = BayesianRidge(
        alpha_1=alpha_1,
        alpha_2=alpha_2,
        lambda_1=lambda_1,
        lambda_2=lambda_2,
        max_iter=1,
        fit_intercept=False,
        compute_score=True,
    )
    clf.fit(X, y)

    assert_almost_equal(clf.scores_[0], score, decimal=9)
|
||||
|
||||
|
||||
def test_bayesian_ridge_parameter():
    """Check the fitted lambda_ and alpha_ parameters (GitHub issue #8224).

    A Ridge model whose alpha equals lambda_ / alpha_ of the fitted
    BayesianRidge model must give the same coefficients and intercept.
    """
    X = np.array([[1, 1], [3, 4], [5, 7], [4, 1], [2, 6], [3, 10], [3, 2]])
    y = np.array([1, 2, 3, 2, 0, 4, 5])

    bayes = BayesianRidge(compute_score=True).fit(X, y)
    equivalent_alpha = bayes.lambda_ / bayes.alpha_
    ridge = Ridge(alpha=equivalent_alpha).fit(X, y)

    assert_array_almost_equal(ridge.coef_, bayes.coef_)
    assert_almost_equal(ridge.intercept_, bayes.intercept_)
|
||||
|
||||
|
||||
def test_bayesian_sample_weights():
    """Check BayesianRidge against Ridge when sample weights are given.

    A Ridge model whose alpha equals lambda_ / alpha_ of the fitted
    BayesianRidge model, trained with the same weights, must give the same
    coefficients and intercept.
    """
    X = np.array([[1, 1], [3, 4], [5, 7], [4, 1], [2, 6], [3, 10], [3, 2]])
    y = np.array([1, 2, 3, 2, 0, 4, 5])
    w = np.array([4, 3, 3, 1, 1, 2, 3])

    bayes = BayesianRidge(compute_score=True).fit(X, y, sample_weight=w)
    equivalent_alpha = bayes.lambda_ / bayes.alpha_
    ridge = Ridge(alpha=equivalent_alpha).fit(X, y, sample_weight=w)

    assert_array_almost_equal(ridge.coef_, bayes.coef_)
    assert_almost_equal(ridge.intercept_, bayes.intercept_)
|
||||
|
||||
|
||||
def test_toy_bayesian_ridge_object():
    """Check that BayesianRidge roughly learns the identity on toy data."""
    X_train = np.array([[1], [2], [6], [8], [10]])
    y_train = np.array([1, 2, 6, 8, 10])
    model = BayesianRidge(compute_score=True)
    model.fit(X_train, y_train)

    # Predictions must match the inputs to two decimals.
    queries = [[1], [3], [4]]
    assert_array_almost_equal(model.predict(queries), [1, 3, 4], 2)
|
||||
|
||||
|
||||
def test_bayesian_initial_params():
    """Check BayesianRidge with explicit `alpha_init` / `lambda_init`."""
    # Cubic design matrix on x = 0..4; the targets follow
    # y = (x^3 - 6x^2 + 8x) / 3.
    X = np.vander(np.linspace(0, 4, 5), 4)
    y = np.array([0.0, 1.0, 0.0, -1.0, 0.0])

    # Starting from the default initial values would increase the bias of the
    # fitted curve here, hence the small lambda_init.
    model = BayesianRidge(alpha_init=1.0, lambda_init=1e-3)
    model.fit(X, y)

    # The R2 score must be nearly one.
    assert_almost_equal(model.score(X, y), 1.0)
|
||||
|
||||
|
||||
def test_prediction_bayesian_ridge_ard_with_constant_input():
    """Check BayesianRidge / ARDRegression predictions on a constant target.

    Edge case: every training target has the same value, so the predictions
    on the training data must reproduce that constant.
    """
    n_samples, n_features = 4, 5
    random_state = check_random_state(42)
    constant_value = random_state.rand()
    X = random_state.random_sample((n_samples, n_features))
    value_dtype = np.array(constant_value).dtype
    y = np.full(n_samples, constant_value, dtype=value_dtype)
    expected = np.full(n_samples, constant_value, dtype=value_dtype)

    for Estimator in (BayesianRidge, ARDRegression):
        predictions = Estimator().fit(X, y).predict(X)
        assert_array_almost_equal(predictions, expected)
|
||||
|
||||
|
||||
def test_std_bayesian_ridge_ard_with_constant_input():
    """Check BayesianRidge / ARDRegression predictive std on a constant target.

    Edge case: every training target has the same value, so the standard
    deviation returned with the predictions should be small (below 0.01).
    """
    n_samples, n_features = 10, 5
    random_state = check_random_state(42)
    constant_value = random_state.rand()
    X = random_state.random_sample((n_samples, n_features))
    y = np.full(n_samples, constant_value, dtype=np.array(constant_value).dtype)
    std_upper_bound = 0.01

    for Estimator in (BayesianRidge, ARDRegression):
        model = Estimator().fit(X, y)
        _, y_std = model.predict(X, return_std=True)
        assert_array_less(y_std, std_upper_bound)
|
||||
|
||||
|
||||
def test_update_of_sigma_in_ard():
    """Check that `sigma_` is updated after the last ARD iteration.

    See issue #10128.
    """
    X = np.array([[1, 0], [0, 0]])
    y = np.array([0, 0])
    model = ARDRegression(max_iter=1).fit(X, y)

    # With these inputs ARDRegression prunes both coefficients in the first
    # iteration, so `sigma_` is expected to be an empty (0, 0) matrix.
    assert model.sigma_.shape == (0, 0)

    # Predicting with the pruned model must not raise.
    model.predict(X, return_std=True)
|
||||
|
||||
|
||||
def test_toy_ard_object():
    """Check that ARDRegression roughly learns the identity on toy data."""
    X_train = np.array([[1], [2], [3]])
    y_train = np.array([1, 2, 3])
    model = ARDRegression(compute_score=True)
    model.fit(X_train, y_train)

    # Predictions must match the inputs to two decimals.
    queries = [[1], [3], [4]]
    assert_array_almost_equal(model.predict(queries), [1, 3, 4], 2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_samples, n_features", ((10, 100), (100, 10)))
def test_ard_accuracy_on_easy_problem(global_random_seed, n_samples, n_features):
    """Check that ARD reaches reasonable accuracy on an easy problem.

    The target is exactly the second feature, so the second coefficient
    must be 1 up to 1e-10 (Github issue #14055).
    """
    # NOTE(review): the parametrized `n_samples` / `n_features` are not used;
    # the data below is always 250 x 3, so both parametrizations run the same
    # problem. Confirm whether the shape should follow the parameters.
    rng = np.random.RandomState(global_random_seed)
    X = rng.normal(size=(250, 3))
    y = X[:, 1]

    model = ARDRegression()
    model.fit(X, y)

    coef_error = np.abs(1 - model.coef_[1])
    assert coef_error < 1e-10
|
||||
|
||||
|
||||
@pytest.mark.parametrize("constructor_name", ["array", "dataframe"])
def test_return_std(constructor_name):
    """Check the `return_std` option of both Bayesian regressors.

    Targets are `X @ w + b` plus Gaussian noise scaled by `noise_mult`. The
    standard deviation returned by `predict(..., return_std=True)` is
    expected to match `noise_mult`, with a tolerance that tightens as the
    noise level shrinks.

    All random draws come from a locally seeded `RandomState` instead of the
    global NumPy generator, so the data is identical on every run and a
    failure can be reproduced.
    """
    rng = np.random.RandomState(0)

    def f(X):
        # Noise-free linear model.
        return np.dot(X, w) + b

    def f_noise(X, noise_mult):
        # Linear model plus Gaussian noise of standard deviation noise_mult.
        return f(X) + rng.randn(X.shape[0]) * noise_mult

    d = 5
    n_train = 50
    n_test = 10

    w = np.array([1.0, 0.0, 1.0, -1.0, 0.0])
    b = 1.0

    X = _convert_container(rng.random_sample((n_train, d)), constructor_name)
    X_test = _convert_container(rng.random_sample((n_test, d)), constructor_name)

    # `decimal` grows (0, 1, 2) as the noise level shrinks (1, 0.1, 0.01).
    for decimal, noise_mult in enumerate([1, 0.1, 0.01]):
        y = f_noise(X, noise_mult)

        for Estimator in (BayesianRidge, ARDRegression):
            model = Estimator()
            model.fit(X, y)
            _, y_std = model.predict(X_test, return_std=True)
            assert_array_almost_equal(y_std, noise_mult, decimal=decimal)
|
||||
|
||||
|
||||
def test_update_sigma(global_random_seed):
    """Check that both ARD `sigma` update helpers give the same result.

    The Woodbury variant is the one used when n_samples < n_features, the
    plain variant otherwise.
    """
    rng = np.random.RandomState(global_random_seed)

    # n_samples == n_features avoids instabilities when inverting the
    # matrices: the Woodbury formula would be unstable when
    # n_samples > n_features.
    n_samples = n_features = 10
    X = rng.randn(n_samples, n_features)
    alpha = 1
    lmbda = np.arange(1, n_features + 1)
    keep_lambda = np.ones(n_features, dtype=bool)

    model = ARDRegression()
    sigma_direct = model._update_sigma(X, alpha, lmbda, keep_lambda)
    sigma_woodbury = model._update_sigma_woodbury(X, alpha, lmbda, keep_lambda)

    np.testing.assert_allclose(sigma_direct, sigma_woodbury)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("Estimator", [BayesianRidge, ARDRegression])
def test_dtype_match(dtype, Estimator):
    """Check that the dtype of X carries over to fitted attributes and predictions.

    In particular np.float32 input must not be cast to np.float64.
    """
    X = np.array([[1, 1], [3, 4], [5, 7], [4, 1], [2, 6], [3, 10], [3, 2]], dtype=dtype)
    y = np.array([1, 2, 3, 2, 0, 4, 5])

    model = Estimator()
    model.fit(X, y)

    # Fitted attributes.
    for attribute in ("coef_", "sigma_"):
        assert getattr(model, attribute).dtype == X.dtype

    # Predicted mean and standard deviation.
    y_mean, y_std = model.predict(X, return_std=True)
    assert y_mean.dtype == X.dtype
    assert y_std.dtype == X.dtype
|
||||
|
||||
|
||||
@pytest.mark.parametrize("Estimator", [BayesianRidge, ARDRegression])
def test_dtype_correctness(Estimator):
    """Check that float32 and float64 fits give close coefficients."""
    X = np.array([[1, 1], [3, 4], [5, 7], [4, 1], [2, 6], [3, 10], [3, 2]])
    y = np.array([1, 2, 3, 2, 0, 4, 5])

    # The same estimator instance is fitted twice, once per precision.
    model = Estimator()
    model.fit(X.astype(np.float32), y)
    coef_32 = model.coef_
    model.fit(X.astype(np.float64), y)
    coef_64 = model.coef_

    np.testing.assert_allclose(coef_32, coef_64, rtol=1e-4)
|
||||
@@ -0,0 +1,147 @@
|
||||
# License: BSD 3 clause
|
||||
|
||||
import inspect
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.base import is_classifier
|
||||
from sklearn.datasets import make_low_rank_matrix
|
||||
from sklearn.linear_model import (
|
||||
ARDRegression,
|
||||
BayesianRidge,
|
||||
ElasticNet,
|
||||
ElasticNetCV,
|
||||
Lars,
|
||||
LarsCV,
|
||||
Lasso,
|
||||
LassoCV,
|
||||
LassoLarsCV,
|
||||
LassoLarsIC,
|
||||
LinearRegression,
|
||||
LogisticRegression,
|
||||
LogisticRegressionCV,
|
||||
MultiTaskElasticNet,
|
||||
MultiTaskElasticNetCV,
|
||||
MultiTaskLasso,
|
||||
MultiTaskLassoCV,
|
||||
OrthogonalMatchingPursuit,
|
||||
OrthogonalMatchingPursuitCV,
|
||||
PoissonRegressor,
|
||||
Ridge,
|
||||
RidgeCV,
|
||||
SGDRegressor,
|
||||
TweedieRegressor,
|
||||
)
|
||||
|
||||
|
||||
# Note: GammaRegressor() and TweedieRegressor(power != 1) have a non-canonical link.
|
||||
@pytest.mark.parametrize(
    "model",
    [
        ARDRegression(),
        BayesianRidge(),
        ElasticNet(),
        ElasticNetCV(),
        Lars(),
        LarsCV(),
        Lasso(),
        LassoCV(),
        LassoLarsCV(),
        LassoLarsIC(),
        LinearRegression(),
        # TODO: Fix SAGA which fails badly with sample_weights.
        # This is a known limitation, see:
        # https://github.com/scikit-learn/scikit-learn/issues/21305
        pytest.param(
            LogisticRegression(
                penalty="elasticnet", solver="saga", l1_ratio=0.5, tol=1e-15
            ),
            marks=pytest.mark.xfail(reason="Missing importance sampling scheme"),
        ),
        LogisticRegressionCV(tol=1e-6),
        MultiTaskElasticNet(),
        MultiTaskElasticNetCV(),
        MultiTaskLasso(),
        MultiTaskLassoCV(),
        OrthogonalMatchingPursuit(),
        OrthogonalMatchingPursuitCV(),
        PoissonRegressor(),
        Ridge(),
        RidgeCV(),
        pytest.param(
            SGDRegressor(tol=1e-15),
            marks=pytest.mark.xfail(reason="Insufficient precision."),
        ),
        SGDRegressor(penalty="elasticnet", max_iter=10_000),
        TweedieRegressor(power=0),  # same as Ridge
    ],
    ids=lambda x: x.__class__.__name__,
)
@pytest.mark.parametrize("with_sample_weight", [False, True])
def test_balance_property(model, with_sample_weight, global_random_seed):
    """Check that sum(y_predicted) == sum(y_observed) on the training set.

    This must hold for all linear models with deviance of an exponential
    dispersion family as loss and the corresponding canonical link if
    fit_intercept=True. Examples:

    - squared error and identity link (most linear models)
    - Poisson deviance with log link
    - log loss with logit link

    This is known as balance property or unconditional
    calibration/unbiasedness. For reference, see Corollary 3.18, 3.20 and
    Chapter 5.1.5 of M.V. Wuthrich and M. Merz, "Statistical Foundations of
    Actuarial Learning and its Applications" (June 3, 2022).
    http://doi.org/10.2139/ssrn.3822407
    """
    fit_parameters = inspect.signature(model.fit).parameters
    if with_sample_weight and "sample_weight" not in fit_parameters:
        pytest.skip("Estimator does not support sample_weight.")

    # Relative test precision, loosened for SGD and for the saga solver.
    if isinstance(model, SGDRegressor):
        rel = 1e-1
    elif getattr(model, "solver", None) == "saga":
        rel = 1e-2
    else:
        rel = 2e-4

    rng = np.random.RandomState(global_random_seed)
    n_train, n_features = 100, 10
    multi_task_models = (
        MultiTaskElasticNet,
        MultiTaskElasticNetCV,
        MultiTaskLasso,
        MultiTaskLassoCV,
    )
    n_targets = 3 if isinstance(model, multi_task_models) else None

    X = make_low_rank_matrix(n_samples=n_train, n_features=n_features, random_state=rng)
    column_max = np.max(X, axis=0)
    if n_targets:
        coef = (
            rng.uniform(low=-2, high=2, size=(n_features, n_targets))
            / column_max[:, None]
        )
    else:
        coef = rng.uniform(low=-2, high=2, size=n_features) / column_max

    expectation = np.exp(X @ coef + 0.5)
    y = rng.poisson(lam=expectation) + 1  # strictly positive, i.e. y > 0
    classifier = is_classifier(model)
    if classifier:
        # Binarize the target for classifiers.
        y = (y > expectation + 1).astype(np.float64)

    sw = rng.uniform(low=1, high=10, size=y.shape[0]) if with_sample_weight else None
    # `sample_weight` is only passed when requested, since some of the
    # estimators do not accept it at all.
    fit_kwargs = {"sample_weight": sw} if with_sample_weight else {}

    model.set_params(fit_intercept=True)  # to be sure
    model.fit(X, y, **fit_kwargs)

    # Assert balance property.
    if classifier:
        predicted_mean = np.average(model.predict_proba(X)[:, 1], weights=sw)
        observed_mean = np.average(y, weights=sw)
    else:
        predicted_mean = np.average(model.predict(X), weights=sw, axis=0)
        observed_mean = np.average(y, weights=sw, axis=0)
    assert predicted_mean == pytest.approx(observed_mean, rel=rel)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,216 @@
|
||||
# Authors: Manoj Kumar mks542@nyu.edu
|
||||
# License: BSD 3 clause
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy import optimize
|
||||
|
||||
from sklearn.datasets import make_regression
|
||||
from sklearn.linear_model import HuberRegressor, LinearRegression, Ridge, SGDRegressor
|
||||
from sklearn.linear_model._huber import _huber_loss_and_gradient
|
||||
from sklearn.utils._testing import (
|
||||
assert_almost_equal,
|
||||
assert_array_almost_equal,
|
||||
assert_array_equal,
|
||||
)
|
||||
from sklearn.utils.fixes import CSR_CONTAINERS
|
||||
|
||||
|
||||
def make_regression_with_outliers(n_samples=50, n_features=20):
    """Return a regression problem `(X, y)` with some rows of X replaced by noise.

    `int(0.1 * n_samples)` row indices are drawn at random and the
    corresponding rows of X are overwritten with scaled Gaussian noise; y is
    left untouched. Indices are drawn independently, so the same row may be
    picked more than once. Both the base data and the corruption use fixed
    seeds.
    """
    rng = np.random.RandomState(0)
    X, y = make_regression(
        n_samples=n_samples, n_features=n_features, random_state=0, noise=0.05
    )

    n_outliers = int(0.1 * n_samples)
    outlier_rows = rng.randint(0, n_samples, n_outliers)
    X[outlier_rows, :] = 2.0 * rng.normal(0, 1, (n_outliers, X.shape[1]))
    return X, y
|
||||
|
||||
|
||||
def test_huber_equals_lr_for_high_epsilon():
    """Check that HuberRegressor matches LinearRegression for large epsilon."""
    X, y = make_regression_with_outliers()

    linear = LinearRegression().fit(X, y)
    huber = HuberRegressor(epsilon=1e3, alpha=0.0).fit(X, y)

    assert_almost_equal(huber.coef_, linear.coef_, 3)
    assert_almost_equal(huber.intercept_, linear.intercept_, 2)
|
||||
|
||||
|
||||
def test_huber_max_iter():
    """Check that `n_iter_` equals `max_iter` when only one iteration is allowed."""
    X, y = make_regression_with_outliers()
    model = HuberRegressor(max_iter=1).fit(X, y)
    assert model.n_iter_ == model.max_iter
|
||||
|
||||
|
||||
def test_huber_gradient():
    """Check the gradient returned by `_huber_loss_and_gradient`.

    The analytical gradient is compared with a finite-difference estimate
    through `scipy.optimize.check_grad` at several random points.
    """
    rng = np.random.RandomState(1)
    X, y = make_regression_with_outliers()
    sample_weight = rng.randint(1, 3, (y.shape[0]))

    def loss_only(w, *args):
        loss, _ = _huber_loss_and_gradient(w, *args)[:2]
        return loss

    def grad_only(w, *args):
        _, grad = _huber_loss_and_gradient(w, *args)[:2]
        return grad

    for _ in range(5):
        # One resp. two parameters beyond the features: checks both without
        # and with fit_intercept.
        for n_extra in (1, 2):
            w = rng.randn(X.shape[1] + n_extra)
            # The last parameter is forced to be non-negative
            # (presumably the scale, TODO confirm).
            w[-1] = np.abs(w[-1])
            grad_error = optimize.check_grad(
                loss_only, grad_only, w, X, y, 0.01, 0.1, sample_weight
            )
            assert_almost_equal(grad_error, 1e-6, 4)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_huber_sample_weights(csr_container):
    """Check the sample_weight implementation of HuberRegressor.

    Unit weights must reproduce the unweighted fit, integer weights must be
    equivalent to repeating the corresponding samples, and the sparse code
    path must agree with the dense one.
    """
    X, y = make_regression_with_outliers()
    huber = HuberRegressor()
    huber.fit(X, y)
    reference_coef = huber.coef_
    reference_intercept = huber.intercept_

    # Coefficients are rescaled before being compared with
    # assert_array_almost_equal, so that the number of decimal places checked
    # is somewhat insensitive to the amplitude of the coefficients and hence
    # to the scale of the data and the regularization parameter.
    scale = max(np.mean(np.abs(huber.coef_)), np.mean(np.abs(huber.intercept_)))

    # Unit weights vs. no weights.
    huber.fit(X, y, sample_weight=np.ones(y.shape[0]))
    assert_array_almost_equal(huber.coef_ / scale, reference_coef / scale)
    assert_array_almost_equal(huber.intercept_ / scale, reference_intercept / scale)

    # Integer weights vs. repeated samples: sample 1 appears three times and
    # sample 3 twice in the augmented data.
    X, y = make_regression_with_outliers(n_samples=5, n_features=20)
    X_repeated = np.vstack((X, X[[1, 1, 3]]))
    y_repeated = np.concatenate((y, y[[1, 1, 3]]))
    huber.fit(X_repeated, y_repeated)
    reference_coef = huber.coef_
    reference_intercept = huber.intercept_

    sample_weight = np.ones(X.shape[0])
    sample_weight[1] = 3
    sample_weight[3] = 2
    huber.fit(X, y, sample_weight=sample_weight)

    assert_array_almost_equal(huber.coef_ / scale, reference_coef / scale)
    assert_array_almost_equal(huber.intercept_ / scale, reference_intercept / scale)

    # Sparse implementation with sample weights.
    huber_sparse = HuberRegressor()
    huber_sparse.fit(csr_container(X), y, sample_weight=sample_weight)
    assert_array_almost_equal(huber_sparse.coef_ / scale, reference_coef / scale)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_huber_sparse(csr_container):
    """Check that dense and CSR inputs give the same HuberRegressor fit."""
    X, y = make_regression_with_outliers()

    huber_dense = HuberRegressor(alpha=0.1)
    huber_dense.fit(X, y)

    huber_sparse = HuberRegressor(alpha=0.1)
    huber_sparse.fit(csr_container(X), y)

    assert_array_almost_equal(huber_sparse.coef_, huber_dense.coef_)
    assert_array_equal(huber_dense.outliers_, huber_sparse.outliers_)
|
||||
|
||||
|
||||
def test_huber_scaling_invariant():
    """The outlier mask must not depend on the scale of the data."""
    X, y = make_regression_with_outliers()
    huber = HuberRegressor(fit_intercept=False, alpha=0.0)

    huber.fit(X, y)
    reference_mask = huber.outliers_
    # Make sure the reference is not the degenerate "everything is an
    # outlier" mask.
    assert not np.all(reference_mask)

    # First rescale only the target, then both the features and the target.
    for X_scaled, y_scaled in ((X, 2.0 * y), (2.0 * X, 2.0 * y)):
        huber.fit(X_scaled, y_scaled)
        assert_array_equal(huber.outliers_, reference_mask)
|
||||
|
||||
|
||||
def test_huber_and_sgd_same_results():
    """HuberRegressor and SGDRegressor(loss="huber") should agree.

    Both estimators are given the same parameters and their coefficients are
    compared to one decimal place.
    """
    X, y = make_regression_with_outliers(n_samples=10, n_features=2)

    # A first fit estimates the scale parameter. Dividing X and y by that
    # estimate makes the scale optimized by the second fit equal to 1.0.
    huber = HuberRegressor(fit_intercept=False, alpha=0.0, epsilon=1.35)
    huber.fit(X, y)
    estimated_scale = huber.scale_
    X_scale = X / estimated_scale
    y_scale = y / estimated_scale
    huber.fit(X_scale, y_scale)
    assert_almost_equal(huber.scale_, 1.0, 3)

    sgd_params = dict(
        alpha=0.0,
        loss="huber",
        shuffle=True,
        random_state=0,
        max_iter=10000,
        fit_intercept=False,
        epsilon=1.35,
        tol=None,
    )
    sgdreg = SGDRegressor(**sgd_params)
    sgdreg.fit(X_scale, y_scale)
    assert_array_almost_equal(huber.coef_, sgdreg.coef_, 1)
|
||||
|
||||
|
||||
def test_huber_warm_start():
    """A warm-started refit on the same data should need zero iterations."""
    X, y = make_regression_with_outliers()
    warm_model = HuberRegressor(
        alpha=1.0, max_iter=10000, warm_start=True, tol=1e-1
    )

    warm_model.fit(X, y)
    first_fit_coef = warm_model.coef_.copy()
    warm_model.fit(X, y)

    # SciPy performs the tol check after doing the coef updates, so the two
    # coefficient vectors are almost the same but not strictly equal.
    assert_array_almost_equal(warm_model.coef_, first_fit_coef, 1)

    assert warm_model.n_iter_ == 0
|
||||
|
||||
|
||||
def test_huber_better_r2_score():
    """Compare HuberRegressor and Ridge R2 scores on inliers and outliers.

    Samples are split by comparing the absolute residuals of the fitted
    HuberRegressor with ``epsilon * scale_``.
    """
    X, y = make_regression_with_outliers()

    huber = HuberRegressor(alpha=0.01)
    huber.fit(X, y)
    residuals = np.dot(X, huber.coef_) + huber.intercept_ - y
    inliers = np.abs(residuals) < huber.epsilon * huber.scale_
    outliers = ~inliers
    huber_score = huber.score(X[inliers], y[inliers])
    huber_outlier_score = huber.score(X[outliers], y[outliers])

    # Ridge should be influenced by the outliers and therefore score worse
    # than the Huber regressor on the non-outliers.
    ridge = Ridge(alpha=0.01)
    ridge.fit(X, y)
    ridge_score = ridge.score(X[inliers], y[inliers])
    ridge_outlier_score = ridge.score(X[outliers], y[outliers])
    assert huber_score > ridge_score

    # Conversely, the Huber model should fit the outliers worse than Ridge.
    assert ridge_outlier_score > huber_outlier_score
|
||||
|
||||
|
||||
def test_huber_bool():
    """Fitting HuberRegressor on boolean features must not crash."""
    features, target = make_regression(
        n_samples=200, n_features=2, noise=4.0, random_state=0
    )
    HuberRegressor().fit(features > 0, target)
|
||||
@@ -0,0 +1,870 @@
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy import linalg
|
||||
|
||||
from sklearn import datasets, linear_model
|
||||
from sklearn.base import clone
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from sklearn.linear_model import (
|
||||
Lars,
|
||||
LarsCV,
|
||||
LassoLars,
|
||||
LassoLarsCV,
|
||||
LassoLarsIC,
|
||||
lars_path,
|
||||
)
|
||||
from sklearn.linear_model._least_angle import _lars_path_residues
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.utils._testing import (
|
||||
TempMemmap,
|
||||
assert_allclose,
|
||||
assert_array_almost_equal,
|
||||
ignore_warnings,
|
||||
)
|
||||
|
||||
# Module-level data shared by the tests below: the diabetes dataset, its Gram
# matrix X^T X, and the product X^T y.
# TODO: use another dataset that has multiple drops
diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target
G = np.dot(X.T, X)
Xy = np.dot(X.T, y)
n_samples = y.size
|
||||
|
||||
|
||||
def test_simple():
    """Check that the Lars path keeps covariances tied and decreasing.

    At step ``i`` of the path, exactly ``i + 1`` features must share the
    maximal absolute covariance with the residual (within a tolerance), capped
    at the number of features. The path is computed with ``verbose=10`` so the
    verbose output code is exercised as well.
    """
    from contextlib import redirect_stdout
    from io import StringIO

    # Capture what lars_path prints in verbose mode. redirect_stdout restores
    # sys.stdout on exit, even if lars_path raises.
    with redirect_stdout(StringIO()):
        _, _, coef_path_ = linear_model.lars_path(X, y, method="lar", verbose=10)

    n_features = X.shape[1]
    eps = 1e-3
    for i, coef_ in enumerate(coef_path_.T):
        res = y - np.dot(X, coef_)
        cov = np.dot(X.T, res)
        C = np.max(abs(cov))
        # Number of features whose covariance is tied with the maximum.
        ocur = len(cov[C - eps < abs(cov)])
        if i < n_features:
            assert ocur == i + 1
        else:
            # no more than max_pred variables can go into the active set
            assert ocur == n_features
|
||||
|
||||
|
||||
def test_simple_precomputed():
    """Same tied-covariance check as ``test_simple``, with a precomputed Gram."""
    _, _, coef_path_ = linear_model.lars_path(X, y, Gram=G, method="lar")

    n_features = X.shape[1]
    tolerance = 1e-3
    for step, coefs in enumerate(coef_path_.T):
        residual = y - np.dot(X, coefs)
        covariance = np.dot(X.T, residual)
        abs_cov = abs(covariance)
        largest = np.max(abs_cov)
        n_tied = len(covariance[largest - tolerance < abs_cov])
        # The active set grows by one feature per step and can never hold
        # more than n_features variables.
        expected = step + 1 if step < n_features else n_features
        assert n_tied == expected
|
||||
|
||||
|
||||
def _assert_same_lars_path_result(output1, output2):
    """Assert that two result sequences have equal length and close items."""
    assert len(output1) == len(output2)
    for left, right in zip(output1, output2):
        assert_allclose(left, right)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["lar", "lasso"])
@pytest.mark.parametrize("return_path", [True, False])
def test_lars_path_gram_equivalent(method, return_path):
    """``lars_path_gram`` and ``lars_path`` with a Gram matrix must agree."""
    from_gram = linear_model.lars_path_gram(
        Xy=Xy, Gram=G, n_samples=n_samples, method=method, return_path=return_path
    )
    from_data = linear_model.lars_path(
        X, y, Gram=G, method=method, return_path=return_path
    )
    _assert_same_lars_path_result(from_gram, from_data)
|
||||
|
||||
|
||||
def test_x_none_gram_none_raises_value_error():
    """``lars_path`` must raise ValueError when both X and Gram are None."""
    precomputed_Xy = np.dot(X.T, y)
    expected_message = "X and Gram cannot both be unspecified"
    with pytest.raises(ValueError, match=expected_message):
        linear_model.lars_path(None, y, Gram=None, Xy=precomputed_Xy)
|
||||
|
||||
|
||||
def test_all_precomputed():
    """``lars_path`` with precomputed Gram and Xy must match the plain call."""
    gram = np.dot(X.T, X)
    X_dot_y = np.dot(X.T, y)
    for method in ("lar", "lasso"):
        reference = linear_model.lars_path(X, y, method=method)
        precomputed = linear_model.lars_path(
            X, y, Gram=gram, Xy=X_dot_y, method=method
        )
        for expected, got in zip(reference, precomputed):
            assert_array_almost_equal(expected, got)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore: `rcond` parameter will change")
# numpy deprecation
def test_lars_lstsq():
    """LassoLars with ``alpha=0`` must end at the least-squares solution."""
    X_unnormalized = 3 * X  # use un-normalized dataset
    model = linear_model.LassoLars(alpha=0.0)
    model.fit(X_unnormalized, y)
    expected_coef = np.linalg.lstsq(X_unnormalized, y, rcond=None)[0]
    assert_array_almost_equal(model.coef_, expected_coef)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:`rcond` parameter will change")
# numpy deprecation
def test_lasso_gives_lstsq_solution():
    """The last point of the Lars Lasso path must be the least-squares fit."""
    _, _, coef_path_ = linear_model.lars_path(X, y, method="lasso")
    # Pass rcond explicitly, as test_lars_lstsq does, instead of relying on
    # the default whose value changed between NumPy versions.
    coef_lstsq = np.linalg.lstsq(X, y, rcond=None)[0]
    assert_array_almost_equal(coef_lstsq, coef_path_[:, -1])
|
||||
|
||||
|
||||
def test_collinearity():
    """Check that ``lars_path`` is robust to collinearity in its input.

    The first case has two identical columns; the second has an all-zero
    target, for which every coefficient on the path must be zero.
    """
    X_collinear = np.array([[3.0, 3.0, 1.0], [2.0, 2.0, 0.0], [1.0, 1.0, 0]])
    y_collinear = np.array([1.0, 0.0, 0])

    quiet_lars_path = ignore_warnings(linear_model.lars_path)
    _, _, coef_path_ = quiet_lars_path(X_collinear, y_collinear, alpha_min=0.01)
    assert not np.isnan(coef_path_).any()
    residual = np.dot(X_collinear, coef_path_[:, -1]) - y_collinear
    assert (residual**2).sum() < 1.0  # just make sure it's bounded

    rng = np.random.RandomState(0)
    n_samples = 10
    X_random = rng.rand(n_samples, 5)
    y_zero = np.zeros(n_samples)
    path_options = dict(
        Gram="auto",
        copy_X=False,
        copy_Gram=False,
        alpha_min=0.0,
        method="lasso",
        verbose=0,
        max_iter=500,
    )
    _, _, coef_path_ = linear_model.lars_path(X_random, y_zero, **path_options)
    assert_array_almost_equal(coef_path_, np.zeros_like(coef_path_))
|
||||
|
||||
|
||||
def test_no_path():
    """``return_path=False`` must return the last point of the full path."""
    path_alphas, _, path_coefs = linear_model.lars_path(X, y, method="lar")
    last_alpha, _, last_coef = linear_model.lars_path(
        X, y, method="lar", return_path=False
    )

    assert_array_almost_equal(last_coef, path_coefs[:, -1])
    assert last_alpha == path_alphas[-1]
|
||||
|
||||
|
||||
def test_no_path_precomputed():
    """``return_path=False`` stays correct when a Gram matrix is supplied."""
    path_alphas, _, path_coefs = linear_model.lars_path(
        X, y, method="lar", Gram=G
    )
    last_alpha, _, last_coef = linear_model.lars_path(
        X, y, method="lar", Gram=G, return_path=False
    )

    assert_array_almost_equal(last_coef, path_coefs[:, -1])
    assert last_alpha == path_alphas[-1]
|
||||
|
||||
|
||||
def test_no_path_all_precomputed():
    """``return_path=False`` stays correct with both Gram and Xy supplied."""
    X_scaled = 3 * diabetes.data
    target = diabetes.target
    gram = np.dot(X_scaled.T, X_scaled)
    X_dot_y = np.dot(X_scaled.T, target)

    path_alphas, _, path_coefs = linear_model.lars_path(
        X_scaled, target, method="lasso", Xy=X_dot_y, Gram=gram, alpha_min=0.9
    )
    last_alpha, _, last_coef = linear_model.lars_path(
        X_scaled,
        target,
        method="lasso",
        Gram=gram,
        Xy=X_dot_y,
        alpha_min=0.9,
        return_path=False,
    )

    assert_array_almost_equal(last_coef, path_coefs[:, -1])
    assert last_alpha == path_alphas[-1]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "classifier", [linear_model.Lars, linear_model.LarsCV, linear_model.LassoLarsIC]
)
def test_lars_precompute(classifier):
    """Every accepted ``precompute`` value must give the same coefficients."""
    gram = np.dot(X.T, X)

    # Reference: an explicit Gram matrix, fitted with warnings silenced.
    gram_estimator = classifier(precompute=gram)
    reference_coef = ignore_warnings(gram_estimator.fit)(X, y).coef_

    for precompute in (True, False, "auto", None):
        estimator = classifier(precompute=precompute)
        coef = estimator.fit(X, y).coef_
        assert_array_almost_equal(reference_coef, coef, decimal=8)
|
||||
|
||||
|
||||
def test_singular_matrix():
    """Check ``lars_path`` on a singular 2x2 design of ones."""
    singular_X = np.array([[1.0, 1.0], [1.0, 1.0]])
    targets = np.array([1, 1])
    _, _, coef_path = linear_model.lars_path(singular_X, targets)
    expected_path = [[0, 0], [1, 0]]
    assert_array_almost_equal(coef_path.T, expected_path)
|
||||
|
||||
|
||||
def test_rank_deficient_design():
    """LassoLars must handle rank-deficient designs as well as Lasso (CD).

    For each design the Lasso objective reached by LassoLars is required to
    be no worse than the one reached by coordinate descent, up to a relative
    tolerance of 1e-8.
    """
    y = [5, 0, 5]

    def lasso_objective(design, coef):
        # Objective with n_samples = 3 and alpha = 0.1 written out explicitly.
        return 1.0 / (2.0 * 3.0) * linalg.norm(
            y - np.dot(design, coef)
        ) ** 2 + 0.1 * linalg.norm(coef, 1)

    designs = (
        [[5, 0], [0, 5], [10, 10]],
        [[10, 10, 0], [1e-32, 0, 0], [0, 0, 1]],
    )
    for design in designs:
        lars = linear_model.LassoLars(0.1)
        obj_lars = lasso_objective(design, lars.fit(design, y).coef_)

        coord_descent = linear_model.Lasso(0.1, tol=1e-6)
        obj_cd = lasso_objective(design, coord_descent.fit(design, y).coef_)

        assert obj_lars < obj_cd * (1.0 + 1e-8)
|
||||
|
||||
|
||||
def test_lasso_lars_vs_lasso_cd():
    """LassoLars and coordinate-descent Lasso must give the same results."""

    def check_path_against_cd(design):
        # Refit Lasso at every non-zero alpha of the Lars path and compare
        # its coefficients with the corresponding point of the path.
        alphas, _, coef_path = linear_model.lars_path(design, y, method="lasso")
        lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8)
        for path_coef, alpha in zip(coef_path.T, alphas):
            if alpha == 0:
                continue
            lasso_cd.alpha = alpha
            lasso_cd.fit(design, y)
            assert linalg.norm(path_coef - lasso_cd.coef_) < 0.01

    X_scaled = 3 * diabetes.data
    check_path_against_cd(X_scaled)

    # similar test, with the estimator classes
    for alpha in np.linspace(1e-2, 1 - 1e-2, 20):
        lars_model = linear_model.LassoLars(alpha=alpha).fit(X_scaled, y)
        cd_model = linear_model.Lasso(alpha=alpha, tol=1e-8).fit(X_scaled, y)
        assert linalg.norm(lars_model.coef_ - cd_model.coef_) < 1e-3

    # same test, with normalized data
    X_normalized = diabetes.data - diabetes.data.sum(axis=0)
    X_normalized /= np.linalg.norm(X_normalized, axis=0)
    check_path_against_cd(X_normalized)
|
||||
|
||||
|
||||
def test_lasso_lars_vs_lasso_cd_early_stopping():
    """LassoLars and Lasso (CD) must agree when the path is stopped early."""
    # Stop before, in the middle of, and in the last part of the path.
    alphas_min = [10, 0.9, 1e-4]

    def check_last_point(design, **lasso_kwargs):
        # Compare the last point of each truncated path with a Lasso fit at
        # the alpha where the path stopped.
        for alpha_min in alphas_min:
            alphas, _, coef_path = linear_model.lars_path(
                design, y, method="lasso", alpha_min=alpha_min
            )
            lasso_cd = linear_model.Lasso(tol=1e-8, **lasso_kwargs)
            lasso_cd.alpha = alphas[-1]
            lasso_cd.fit(design, y)
            assert linalg.norm(coef_path[:, -1] - lasso_cd.coef_) < 0.01

    check_last_point(diabetes.data, fit_intercept=False)

    # same test, with normalization
    X_normalized = diabetes.data - diabetes.data.sum(axis=0)
    X_normalized /= np.linalg.norm(X_normalized, axis=0)
    check_last_point(X_normalized)
|
||||
|
||||
|
||||
def test_lasso_lars_path_length():
    """A LassoLars stopped at the third alpha must have a three-point path."""
    full_model = linear_model.LassoLars()
    full_model.fit(X, y)

    truncated_model = linear_model.LassoLars(alpha=full_model.alphas_[2])
    truncated_model.fit(X, y)
    assert_array_almost_equal(full_model.alphas_[:3], truncated_model.alphas_)

    # Also check that the sequence of alphas is always decreasing
    alpha_steps = np.diff(full_model.alphas_)
    assert np.all(alpha_steps < 0)
|
||||
|
||||
|
||||
def test_lasso_lars_vs_lasso_cd_ill_conditioned():
    """Compare Lars Lasso with coordinate descent on an ill-conditioned design.

    The Lars path must not blow up and must stay close (one decimal) to the
    coefficients that ``lasso_path`` computes for the same alphas.
    """
    rng = np.random.RandomState(42)

    # Generate data: 70 samples, 100 features, 5 of them informative.
    n_rows, n_cols = 70, 100
    n_informative = 5
    X = rng.randn(n_rows, n_cols)
    true_coef = np.zeros((n_cols, 1))
    indices = np.arange(0, n_cols)
    rng.shuffle(indices)
    support = indices[:n_informative]
    # Draw the signs first and the magnitudes second to keep the random
    # stream in the same order.
    signs = np.sign(rng.randn(n_informative, 1))
    magnitudes = rng.rand(n_informative, 1) + 1
    true_coef[support] = signs * magnitudes
    y = np.dot(X, true_coef)
    sigma = 0.2
    y += sigma * rng.rand(*y.shape)
    y = y.squeeze()

    lars_alphas, _, lars_coef = linear_model.lars_path(X, y, method="lasso")
    _, cd_coef, _ = linear_model.lasso_path(X, y, alphas=lars_alphas, tol=1e-6)

    assert_array_almost_equal(lars_coef, cd_coef, decimal=1)
|
||||
|
||||
|
||||
def test_lasso_lars_vs_lasso_cd_ill_conditioned2():
    """LARS must reach an objective no worse than CD on a degenerate design.

    The design is ill-conditioned so that LARS has to go far along the path
    to converge. It used to be the case that Lars had to use the "drop for
    good" strategy here; this is no longer needed thanks to the
    equality_tolerance checks.
    """
    X = [[1e20, 1e20, 0], [-1e-32, 0, 0], [1, 1, 1]]
    y = [10, 10, 1]
    alpha = 0.0001

    def objective_function(coef):
        # Lasso objective: scaled squared residual norm plus L1 penalty.
        return 1.0 / (2.0 * len(X)) * linalg.norm(
            y - np.dot(X, coef)
        ) ** 2 + alpha * linalg.norm(coef, 1)

    lars = linear_model.LassoLars(alpha=alpha)
    warning_message = "Regressors in active set degenerate."
    with pytest.warns(ConvergenceWarning, match=warning_message):
        lars.fit(X, y)
    lars_obj = objective_function(lars.coef_)

    coord_descent = linear_model.Lasso(alpha=alpha, tol=1e-4)
    cd_obj = objective_function(coord_descent.fit(X, y).coef_)

    assert lars_obj < cd_obj * (1.0 + 1e-8)
|
||||
|
||||
|
||||
def test_lars_add_features():
    """Lars on a 5x5 Hilbert matrix must yield finite coefficients.

    Assures that at least some features get added if necessary
    (test for 6d2b4c).
    """
    size = 5
    # Hilbert matrix: entry (i, j) is 1 / (i + j + 1).
    hilbert = 1.0 / (np.arange(1, size + 1) + np.arange(size)[:, np.newaxis])
    target = np.arange(size)
    model = linear_model.Lars(fit_intercept=False).fit(hilbert, target)
    assert np.all(np.isfinite(model.coef_))
|
||||
|
||||
|
||||
def test_lars_n_nonzero_coefs(verbose=False):
    """``n_nonzero_coefs=6`` must give 6 non-zero coefs and a 7-point path."""
    model = linear_model.Lars(n_nonzero_coefs=6, verbose=verbose)
    model.fit(X, y)

    nonzero_indices = model.coef_.nonzero()[0]
    assert len(nonzero_indices) == 6
    # The path should be of length 6 + 1 in a Lars going down to 6
    # non-zero coefs
    assert len(model.alphas_) == 7
|
||||
|
||||
|
||||
@ignore_warnings
def test_multitarget():
    """Fitting a 2-D target must match fitting each target column separately."""
    Y = np.vstack([y, y**2]).T
    n_targets = Y.shape[1]
    estimators = [
        linear_model.LassoLars(),
        linear_model.Lars(),
        # regression test for gh-1615
        linear_model.LassoLars(fit_intercept=False),
        linear_model.Lars(fit_intercept=False),
    ]

    for estimator in estimators:
        estimator.fit(X, Y)
        Y_pred = estimator.predict(X)
        joint_attrs = (
            estimator.alphas_,
            estimator.active_,
            estimator.coef_,
            estimator.coef_path_,
        )
        for k in range(n_targets):
            estimator.fit(X, Y[:, k])
            y_pred = estimator.predict(X)
            single_attrs = (
                estimator.alphas_,
                estimator.active_,
                estimator.coef_,
                estimator.coef_path_,
            )
            # Entry k of each multi-target attribute must equal the
            # attribute obtained from the single-target fit.
            for joint_value, single_value in zip(joint_attrs, single_attrs):
                assert_array_almost_equal(joint_value[k], single_value)
            assert_array_almost_equal(Y_pred[:, k], y_pred)
|
||||
|
||||
|
||||
def test_lars_cv():
    """LassoLarsCV's selected alpha must grow as the sample count shrinks.

    The estimator is refit on the first 400, 200 and 100 samples and the
    selected alpha must strictly increase from one fit to the next. This is
    not guaranteed in general; it is a property of this dataset with these
    steps.
    """
    previous_alpha = 0
    lars_cv = linear_model.LassoLarsCV()
    for n_kept in (400, 200, 100):
        lars_cv.fit(diabetes.data[:n_kept], diabetes.target[:n_kept])
        np.testing.assert_array_less(previous_alpha, lars_cv.alpha_)
        previous_alpha = lars_cv.alpha_
    assert not hasattr(lars_cv, "n_nonzero_coefs")
|
||||
|
||||
|
||||
def test_lars_cv_max_iter(recwarn):
    """LassoLarsCV with a small ``max_iter`` must not emit any warning.

    The fit runs with NumPy configured to raise on division and invalid
    floating-point operations, on data to which two identical random
    columns have been appended.
    """
    warnings.simplefilter("always")
    with np.errstate(divide="raise", invalid="raise"):
        y = diabetes.target
        rng = np.random.RandomState(42)
        x = rng.randn(len(y))
        X = np.c_[diabetes.data, x, x]  # add correlated features
        X = StandardScaler().fit_transform(X)
        lars_cv = linear_model.LassoLarsCV(max_iter=5, cv=5)
        lars_cv.fit(X, y)

    # Check that there is no warning in general and no ConvergenceWarning
    # in particular.
    # Materialize the string representation of the warning to get a more
    # informative error message in case of AssertionError.
    recorded_warnings = [str(w) for w in recwarn]
    assert len(recorded_warnings) == 0
|
||||
|
||||
|
||||
def test_lasso_lars_ic():
    """Compare the BIC and AIC variants of LassoLarsIC.

    On the diabetes data extended with 5 random features:
    - alpha_bic > alpha_aic
    - n_nonzero_bic < n_nonzero_aic
    - BIC only selects original (non-random) features.
    """
    lars_bic = linear_model.LassoLarsIC("bic")
    lars_aic = linear_model.LassoLarsIC("aic")

    rng = np.random.RandomState(42)
    n_original_features = diabetes.data.shape[1]
    bad_features = rng.randn(diabetes.data.shape[0], 5)
    X_extended = np.c_[diabetes.data, bad_features]  # add 5 bad features
    X_extended = StandardScaler().fit_transform(X_extended)

    lars_bic.fit(X_extended, y)
    lars_aic.fit(X_extended, y)
    selected_bic = np.where(lars_bic.coef_)[0]
    selected_aic = np.where(lars_aic.coef_)[0]

    assert lars_bic.alpha_ > lars_aic.alpha_
    assert len(selected_bic) < len(selected_aic)
    assert np.max(selected_bic) < n_original_features
|
||||
|
||||
|
||||
def test_lars_path_readonly_data():
    """``_lars_path_residues`` must work on memory-mapped fold data.

    When using automated memory mapping on large input, the fold data is in
    read-only mode. Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/4597
    """
    folds = train_test_split(X, y, random_state=42)
    with TempMemmap(folds) as (X_train, X_test, y_train, y_test):
        # The following should not fail despite copy=False
        _lars_path_residues(X_train, y_train, X_test, y_test, copy=False)
|
||||
|
||||
|
||||
def test_lars_path_positive_constraint():
    """Main test for the ``positive`` parameter of ``lars_path``.

    The estimator classes just make use of this function. On the diabetes
    dataset:
    - method="lar" with positive=True must raise ValueError;
    - method="lasso" gives some negative coefficients with positive=False
      and only non-negative ones with positive=True.
    """
    err_msg = "Positive constraint not supported for 'lar' coding method."
    with pytest.raises(ValueError, match=err_msg):
        linear_model.lars_path(
            diabetes["data"], diabetes["target"], method="lar", positive=True
        )

    method = "lasso"

    _, _, unconstrained_coefs = linear_model.lars_path(
        X, y, return_path=True, method=method, positive=False
    )
    assert unconstrained_coefs.min() < 0

    _, _, positive_coefs = linear_model.lars_path(
        X, y, return_path=True, method=method, positive=True
    )
    assert positive_coefs.min() >= 0
|
||||
|
||||
|
||||
# Parameters for testing the ``positive`` option on all estimator classes:
# defaults shared by every estimator, then per-estimator overrides keyed by
# class name.

default_parameter = dict(fit_intercept=False)

estimator_parameter_map = dict(
    LassoLars=dict(alpha=0.1),
    LassoLarsCV={},
    LassoLarsIC={},
)
|
||||
|
||||
|
||||
def test_estimatorclasses_positive_constraint():
    """The ``positive`` option must be honoured by every estimator class.

    Each estimator must produce at least one negative coefficient with
    positive=False and only non-negative coefficients with positive=True.
    """
    default_parameter = {"fit_intercept": False}

    estimator_parameter_map = {
        "LassoLars": {"alpha": 0.1},
        "LassoLarsCV": {},
        "LassoLarsIC": {},
    }
    for estname, overrides in estimator_parameter_map.items():
        params = {**default_parameter, **overrides}
        estimator_class = getattr(linear_model, estname)

        unconstrained = estimator_class(positive=False, **params)
        unconstrained.fit(X, y)
        assert unconstrained.coef_.min() < 0

        constrained = estimator_class(positive=True, **params)
        constrained.fit(X, y)
        assert min(constrained.coef_) >= 0
|
||||
|
||||
|
||||
def test_lasso_lars_vs_lasso_cd_positive():
    """LassoLars and Lasso (CD) must agree when ``positive=True``.

    This is basically a copy of ``test_lasso_lars_vs_lasso_cd`` with the
    additional positive option. Only the middle part, which compares
    coefficients over a range of alphas, had to be adapted (see below).
    """
    # not normalized data
    X_scaled = 3 * diabetes.data

    alphas, _, coef_path = linear_model.lars_path(
        X_scaled, y, method="lasso", positive=True
    )
    lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8, positive=True)
    for path_coef, alpha in zip(coef_path.T, alphas):
        if alpha == 0:
            continue
        lasso_cd.alpha = alpha
        lasso_cd.fit(X_scaled, y)
        assert linalg.norm(path_coef - lasso_cd.coef_) < 0.01

    # The range of alphas used here is restricted compared with the test
    # without the positive option. The Lars-Lasso algorithm does not converge
    # to the least-squares solution for small alphas, see 'Least Angle
    # Regression' by Efron et al 2004. The coefficients typically agree up to
    # the smallest alpha reached by the Lars-Lasso algorithm and start to
    # diverge thereafter. See
    # https://gist.github.com/michigraber/7e7d7c75eca694c7a6ff
    for alpha in np.linspace(6e-1, 1 - 1e-2, 20):
        lars_model = linear_model.LassoLars(
            fit_intercept=False, alpha=alpha, positive=True
        ).fit(X_scaled, y)
        cd_model = linear_model.Lasso(
            fit_intercept=False, alpha=alpha, tol=1e-8, positive=True
        ).fit(X_scaled, y)
        assert linalg.norm(lars_model.coef_ - cd_model.coef_) < 1e-3

    # normalized data
    X_normalized = diabetes.data - diabetes.data.sum(axis=0)
    X_normalized /= np.linalg.norm(X_normalized, axis=0)
    alphas, _, coef_path = linear_model.lars_path(
        X_normalized, y, method="lasso", positive=True
    )
    lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8, positive=True)
    # don't include alpha=0
    for path_coef, alpha in zip(coef_path.T[:-1], alphas[:-1]):
        lasso_cd.alpha = alpha
        lasso_cd.fit(X_normalized, y)
        assert linalg.norm(path_coef - lasso_cd.coef_) < 0.01
|
||||
|
||||
|
||||
def test_lasso_lars_vs_R_implementation():
    """LassoLars must agree with the R ``lars`` library (no intercept).

    The data is the one used in bug report 7778. The reference path ``r`` was
    obtained in R with:

        library(lars)
        model_lasso_lars = lars(X, t(y), type="lasso", intercept=FALSE,
                                trace=TRUE, normalize=FALSE)
        r = t(model_lasso_lars$beta)
    """
    y = np.array([-6.45006793, -3.51251449, -8.52445396, 6.12277822, -19.42109366])
    x = np.array(
        [
            [0.47299829, 0, 0, 0, 0],
            [0.08239882, 0.85784863, 0, 0, 0],
            [0.30114139, -0.07501577, 0.80895216, 0, 0],
            [-0.01460346, -0.1015233, 0.0407278, 0.80338378, 0],
            [-0.69363927, 0.06754067, 0.18064514, -0.0803561, 0.40427291],
        ]
    )
    X = x.T

    # One row per feature, one column per step of the path.
    r_feature_0 = [
        0,
        0,
        0,
        0,
        0,
        -79.810362809499026,
        -83.528788732782829,
        -83.777653739190711,
        -83.784156932888934,
        -84.033390591756657,
    ]
    r_feature_1 = [0, 0, 0, 0, -0.476624256777266, 0, 0, 0, 0, 0.025219751009936]
    r_feature_2 = [
        0,
        -3.577397088285891,
        -4.702795355871871,
        -7.016748621359461,
        -7.614898471899412,
        -0.336938391359179,
        0,
        0,
        0.001213370600853,
        0.048162321585148,
    ]
    r_feature_3 = [
        0,
        0,
        0,
        2.231558436628169,
        2.723267514525966,
        2.811549786389614,
        2.813766976061531,
        2.817462468949557,
        2.817368178703816,
        2.816221090636795,
    ]
    r_feature_4 = [
        0,
        0,
        -1.218422599914637,
        -3.457726183014808,
        -4.021304522060710,
        -45.827461592423745,
        -47.776608869312305,
        -47.911561610746404,
        -47.914845922736234,
        -48.039562334265717,
    ]
    r = np.array([r_feature_0, r_feature_1, r_feature_2, r_feature_3, r_feature_4])

    model_lasso_lars = linear_model.LassoLars(alpha=0, fit_intercept=False)
    model_lasso_lars.fit(X, y)
    skl_betas = model_lasso_lars.coef_path_

    assert_array_almost_equal(r, skl_betas, decimal=12)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("copy_X", [True, False])
def test_lasso_lars_copyX_behaviour(copy_X):
    """Check that the ``copy_X`` given to the constructor is respected.

    User input regarding copy_X used to be overridden (until at least
    version 0.21). X must be left untouched exactly when copy_X is True.
    """
    model = LassoLarsIC(copy_X=copy_X, precompute=False)
    rng = np.random.RandomState(0)
    X = rng.normal(0, 1, (100, 5))
    X_before_fit = X.copy()
    y = X[:, 2]
    model.fit(X, y)
    X_unchanged = np.array_equal(X, X_before_fit)
    assert copy_X == X_unchanged
|
||||
|
||||
|
||||
@pytest.mark.parametrize("copy_X", [True, False])
def test_lasso_lars_fit_copyX_behaviour(copy_X):
    """Check that ``copy_X`` passed to ``fit`` overrides the __init__ default.

    X must be left untouched exactly when copy_X is True.
    """
    model = LassoLarsIC(precompute=False)
    rng = np.random.RandomState(0)
    X = rng.normal(0, 1, (100, 5))
    X_before_fit = X.copy()
    y = X[:, 2]
    model.fit(X, y, copy_X=copy_X)
    X_unchanged = np.array_equal(X, X_before_fit)
    assert copy_X == X_unchanged
|
||||
|
||||
|
||||
@pytest.mark.parametrize("est", (LassoLars(alpha=1e-3), Lars()))
def test_lars_with_jitter(est):
    """A small amount of jitter must help stability.

    Uses the example provided in issue #2746: the jittered fit must recover
    the expected coefficients while the plain fit differs markedly from it.
    """
    X = np.array([[0.0, 0.0, 0.0, -1.0, 0.0], [0.0, -1.0, 0.0, 0.0, 0.0]])
    y = [-2.5, -2.5]
    expected_coef = [0, 2.5, 0, 2.5, 0]

    # Disable the intercept: the target is constant, so coef would be all
    # zeros otherwise and there would be nothing to check.
    est.set_params(fit_intercept=False)
    est_jitter = clone(est).set_params(jitter=10e-8, random_state=0)

    est.fit(X, y)
    est_jitter.fit(X, y)

    mean_squared_gap = np.mean((est.coef_ - est_jitter.coef_) ** 2)
    assert mean_squared_gap > 0.1
    np.testing.assert_allclose(est_jitter.coef_, expected_coef, rtol=1e-3)
|
||||
|
||||
|
||||
def test_X_none_gram_not_none():
    """``lars_path`` must reject ``X=None`` combined with ``Gram=True``."""
    expected_message = "X cannot be None if Gram is not None"
    with pytest.raises(ValueError, match=expected_message):
        lars_path(X=None, y=np.array([1]), Gram=True)
|
||||
|
||||
|
||||
def test_copy_X_with_auto_gram():
    """``copy_X=True`` with ``Gram="auto"`` must not overwrite X.

    Non-regression test for #17789.
    """
    rng = np.random.RandomState(42)
    X = rng.rand(6, 6)
    y = rng.rand(6)

    X_snapshot = X.copy()
    linear_model.lars_path(X, y, Gram="auto", copy_X=True, method="lasso")
    # X did not change
    assert_allclose(X, X_snapshot)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "LARS, has_coef_path, args",
    (
        (Lars, True, {}),
        (LassoLars, True, {}),
        (LassoLarsIC, False, {}),
        (LarsCV, True, {}),
        # max_iter=5 is for avoiding ConvergenceWarning
        (LassoLarsCV, True, {"max_iter": 5}),
    ),
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_lars_dtype_match(LARS, has_coef_path, args, dtype):
    """The fitted attributes must keep the dtype of the input data."""
    rng = np.random.RandomState(0)
    X = rng.rand(20, 6).astype(dtype)
    y = rng.rand(20).astype(dtype)

    model = LARS(**args)
    model.fit(X, y)

    assert model.coef_.dtype == dtype
    if has_coef_path:
        assert model.coef_path_.dtype == dtype
    assert model.intercept_.dtype == dtype
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "LARS, has_coef_path, args",
    (
        (Lars, True, {}),
        (LassoLars, True, {}),
        (LassoLarsIC, False, {}),
        (LarsCV, True, {}),
        # max_iter=5 is for avoiding ConvergenceWarning
        (LassoLarsCV, True, {"max_iter": 5}),
    ),
)
def test_lars_numeric_consistency(LARS, has_coef_path, args):
    """Ensure float32 and float64 fits give numerically consistent results."""
    tolerances = {"rtol": 1e-5, "atol": 1e-5}

    random_gen = np.random.RandomState(0)
    X_64 = random_gen.rand(10, 6)
    y_64 = random_gen.rand(10)

    model_64 = LARS(**args).fit(X_64, y_64)
    X_32 = X_64.astype(np.float32)
    y_32 = y_64.astype(np.float32)
    model_32 = LARS(**args).fit(X_32, y_32)

    # `coef_path_` is only compared for estimators flagged as exposing it.
    compared_attributes = ["coef_"]
    if has_coef_path:
        compared_attributes.append("coef_path_")
    compared_attributes.append("intercept_")

    for attribute in compared_attributes:
        assert_allclose(
            getattr(model_64, attribute), getattr(model_32, attribute), **tolerances
        )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("criterion", ["aic", "bic"])
def test_lassolarsic_alpha_selection(criterion):
    """Check that the AIC and BIC scores are properly computed.

    This reproduces the example of Fig. 2 of Zou et al. (reference [1] in
    LassoLarsIC). In this example, only 7 features should be selected.
    """
    steps = (StandardScaler(), LassoLarsIC(criterion=criterion))
    model = make_pipeline(*steps)
    model.fit(X, y)

    # Position of the smallest information-criterion value.
    criterion_values = model[-1].criterion_
    best_alpha_selected = np.argmin(criterion_values)
    assert best_alpha_selected == 7
|
||||
|
||||
|
||||
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_lassolarsic_noise_variance(fit_intercept):
    """Check the `n_samples` < `n_features` case of LassoLarsIC.

    Fitting must fail with an explicit message until the noise variance is
    provided by the user.
    """
    random_gen = np.random.RandomState(0)
    # `fit_intercept` is a bool: 10 features with an intercept, 11 without.
    n_features = 11 - fit_intercept
    X, y = datasets.make_regression(
        n_samples=10, n_features=n_features, random_state=random_gen
    )

    estimator = LassoLarsIC(fit_intercept=fit_intercept)
    model = make_pipeline(StandardScaler(), estimator)

    err_msg = (
        "You are using LassoLarsIC in the case where the number of samples is "
        "smaller than the number of features"
    )
    with pytest.raises(ValueError, match=err_msg):
        model.fit(X, y)

    # Once the noise variance is given, fit and predict must both run.
    model.set_params(lassolarsic__noise_variance=1.0)
    fitted = model.fit(X, y)
    fitted.predict(X)
|
||||
@@ -0,0 +1,357 @@
|
||||
"""
|
||||
Tests for LinearModelLoss
|
||||
|
||||
Note that correctness of losses (which compose LinearModelLoss) is already well
|
||||
covered in the _loss module.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose
|
||||
from scipy import linalg, optimize
|
||||
|
||||
from sklearn._loss.loss import (
|
||||
HalfBinomialLoss,
|
||||
HalfMultinomialLoss,
|
||||
HalfPoissonLoss,
|
||||
)
|
||||
from sklearn.datasets import make_low_rank_matrix
|
||||
from sklearn.linear_model._linear_loss import LinearModelLoss
|
||||
from sklearn.utils.extmath import squared_norm
|
||||
from sklearn.utils.fixes import CSR_CONTAINERS
|
||||
|
||||
# We do not need to test all losses, just what LinearModelLoss does on top of the
# base losses. The list is used as the "base_loss" parametrization of the tests
# in this module; entries are loss classes, instantiated inside each test.
LOSSES = [HalfBinomialLoss, HalfMultinomialLoss, HalfPoissonLoss]
|
||||
|
||||
|
||||
def random_X_y_coef(
    linear_model_loss, n_samples, n_features, coef_bound=(-2, 2), seed=42
):
    """Randomly generate X, y and coef in a valid range.

    Parameters
    ----------
    linear_model_loss : object
        Loss wrapper providing `fit_intercept`, `base_loss` and
        `init_zero_coef` (a `LinearModelLoss` in the tests of this module).
    n_samples : int
        Number of rows of X.
    n_features : int
        Number of columns of X.
    coef_bound : tuple of (low, high), default=(-2, 2)
        Bounds of the uniform distribution the coefficients are drawn from.
    seed : int, default=42
        Seed of the `RandomState` driving every random draw.

    Returns
    -------
    X, y, coef
        The design matrix, the generated target and the coefficient array
        (same shape as returned by `init_zero_coef`).
    """
    rng = np.random.RandomState(seed)
    # One extra degree of freedom per target when an intercept is fitted.
    n_dof = n_features + linear_model_loss.fit_intercept
    X = make_low_rank_matrix(
        n_samples=n_samples,
        n_features=n_features,
        random_state=rng,
    )
    coef = linear_model_loss.init_zero_coef(X)

    if linear_model_loss.base_loss.is_multiclass:
        n_classes = linear_model_loss.base_loss.n_classes
        # Fill the zero-initialized array in place, keeping its shape/layout.
        coef.flat[:] = rng.uniform(
            low=coef_bound[0],
            high=coef_bound[1],
            size=n_classes * n_dof,
        )
        if linear_model_loss.fit_intercept:
            # The last column of coef holds the intercepts.
            raw_prediction = X @ coef[:, :-1].T + coef[:, -1]
        else:
            raw_prediction = X @ coef.T
        proba = linear_model_loss.base_loss.link.inverse(raw_prediction)

        # y = rng.choice(np.arange(n_classes), p=proba) does not work.
        # See https://stackoverflow.com/a/34190035/16761084
        def choice_vectorized(items, p):
            # Row-wise sampling: draw one uniform number per row and count
            # how many cumulative probabilities lie strictly below it.
            s = p.cumsum(axis=1)
            r = rng.rand(p.shape[0])[:, None]
            k = (s < r).sum(axis=1)
            return items[k]

        y = choice_vectorized(np.arange(n_classes), p=proba).astype(np.float64)
    else:
        coef.flat[:] = rng.uniform(
            low=coef_bound[0],
            high=coef_bound[1],
            size=n_dof,
        )
        if linear_model_loss.fit_intercept:
            # The last entry of coef is the intercept.
            raw_prediction = X @ coef[:-1] + coef[-1]
        else:
            raw_prediction = X @ coef
        # Perturb the raw prediction with uniform noise in [-1, 1) before
        # mapping it through the inverse link.
        y = linear_model_loss.base_loss.link.inverse(
            raw_prediction + rng.uniform(low=-1, high=1, size=n_samples)
        )

    return X, y, coef
|
||||
|
||||
|
||||
@pytest.mark.parametrize("base_loss", LOSSES)
@pytest.mark.parametrize("fit_intercept", [False, True])
@pytest.mark.parametrize("n_features", [0, 1, 10])
@pytest.mark.parametrize("dtype", [None, np.float32, np.float64, np.int64])
def test_init_zero_coef(base_loss, fit_intercept, n_features, dtype):
    """Test that init_zero_coef initializes coef correctly."""
    loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=fit_intercept)
    random_gen = np.random.RandomState(42)
    X = random_gen.normal(size=(5, n_features))
    coef = loss.init_zero_coef(X, dtype=dtype)

    # Shape: one row per class for multiclass losses, a flat vector otherwise.
    if not loss.base_loss.is_multiclass:
        assert coef.shape == (n_features + fit_intercept,)
    else:
        n_classes = loss.base_loss.n_classes
        assert coef.shape == (n_classes, n_features + fit_intercept)
        assert coef.flags["F_CONTIGUOUS"]

    # dtype: follows X unless an explicit dtype was requested.
    expected_dtype = X.dtype if dtype is None else dtype
    assert coef.dtype == expected_dtype

    assert np.count_nonzero(coef) == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("base_loss", LOSSES)
@pytest.mark.parametrize("fit_intercept", [False, True])
@pytest.mark.parametrize("sample_weight", [None, "range"])
@pytest.mark.parametrize("l2_reg_strength", [0, 1])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_loss_grad_hess_are_the_same(
    base_loss, fit_intercept, sample_weight, l2_reg_strength, csr_container
):
    """Test that loss and gradient are the same across different functions."""
    loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=fit_intercept)
    X, y, coef = random_X_y_coef(
        linear_model_loss=loss, n_samples=10, n_features=5, seed=42
    )

    if sample_weight == "range":
        sample_weight = np.linspace(1, y.shape[0], num=y.shape[0])

    # Keyword arguments shared by every call below.
    options = {"sample_weight": sample_weight, "l2_reg_strength": l2_reg_strength}

    # Dense X.
    l1 = loss.loss(coef, X, y, **options)
    g1 = loss.gradient(coef, X, y, **options)
    l2, g2 = loss.loss_gradient(coef, X, y, **options)
    g3, h3 = loss.gradient_hessian_product(coef, X, y, **options)
    if base_loss.is_multiclass:
        # The full hessian is expected to be unavailable for multiclass.
        with pytest.raises(NotImplementedError):
            loss.gradient_hessian(coef, X, y, **options)
    else:
        g4, h4, _ = loss.gradient_hessian(coef, X, y, **options)

    assert_allclose(l1, l2)
    assert_allclose(g1, g2)
    assert_allclose(g1, g3)
    if not base_loss.is_multiclass:
        assert_allclose(g1, g4)
        assert_allclose(h4 @ g4, h3(g3))

    # same for sparse X
    X = csr_container(X)
    l1_sp = loss.loss(coef, X, y, **options)
    g1_sp = loss.gradient(coef, X, y, **options)
    l2_sp, g2_sp = loss.loss_gradient(coef, X, y, **options)
    g3_sp, h3_sp = loss.gradient_hessian_product(coef, X, y, **options)
    if not base_loss.is_multiclass:
        g4_sp, h4_sp, _ = loss.gradient_hessian(coef, X, y, **options)

    assert_allclose(l1, l1_sp)
    assert_allclose(l1, l2_sp)
    assert_allclose(g1, g1_sp)
    assert_allclose(g1, g2_sp)
    assert_allclose(g1, g3_sp)
    assert_allclose(h3(g1), h3_sp(g1_sp))
    if not base_loss.is_multiclass:
        assert_allclose(g1, g4_sp)
        assert_allclose(h4 @ g4, h4_sp @ g1_sp)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("base_loss", LOSSES)
@pytest.mark.parametrize("sample_weight", [None, "range"])
@pytest.mark.parametrize("l2_reg_strength", [0, 1])
@pytest.mark.parametrize("X_container", CSR_CONTAINERS + [None])
def test_loss_gradients_hessp_intercept(
    base_loss, sample_weight, l2_reg_strength, X_container
):
    """Test that loss and gradient handle intercept correctly."""
    loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=False)
    loss_inter = LinearModelLoss(base_loss=base_loss(), fit_intercept=True)
    n_samples, n_features = 10, 5
    X, y, coef = random_X_y_coef(
        linear_model_loss=loss, n_samples=n_samples, n_features=n_features, seed=42
    )

    # Make the last column all ones to mimic an intercept term.
    X[:, -1] = 1
    # Exclude that column for loss_inter, which adds the intercept itself.
    X_inter = X[:, :-1]

    if X_container is not None:
        X = X_container(X)

    if sample_weight == "range":
        sample_weight = np.linspace(1, y.shape[0], num=y.shape[0])

    options = {"sample_weight": sample_weight, "l2_reg_strength": l2_reg_strength}

    l, g = loss.loss_gradient(coef, X, y, **options)
    _, hessp = loss.gradient_hessian_product(coef, X, y, **options)
    l_inter, g_inter = loss_inter.loss_gradient(coef, X_inter, y, **options)
    _, hessp_inter = loss_inter.gradient_hessian_product(coef, X_inter, y, **options)

    # Note, that intercept gets no L2 penalty.
    missing_penalty = 0.5 * l2_reg_strength * squared_norm(coef.T[-1])
    assert l == pytest.approx(l_inter + missing_penalty)

    # Add the missing penalty term to the intercept gradient, in place.
    g_inter.T[-1] += l2_reg_strength * coef.T[-1]
    assert_allclose(g, g_inter)

    # Same correction for the hessian-vector product on a random direction.
    s = np.random.RandomState(42).randn(*coef.shape)
    h = hessp(s)
    h_inter = hessp_inter(s)
    h_inter.T[-1] += l2_reg_strength * s.T[-1]
    assert_allclose(h, h_inter)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("base_loss", LOSSES)
@pytest.mark.parametrize("fit_intercept", [False, True])
@pytest.mark.parametrize("sample_weight", [None, "range"])
@pytest.mark.parametrize("l2_reg_strength", [0, 1])
def test_gradients_hessians_numerically(
    base_loss, fit_intercept, sample_weight, l2_reg_strength
):
    """Test gradients and hessians with numerical derivatives.

    Gradient should equal the numerical derivatives of the loss function.
    Hessians should equal the numerical derivatives of gradients.
    """
    loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=fit_intercept)
    n_samples, n_features = 10, 5
    X, y, coef = random_X_y_coef(
        linear_model_loss=loss, n_samples=n_samples, n_features=n_features, seed=42
    )
    coef = coef.ravel(order="F")  # this is important only for multinomial loss

    if sample_weight == "range":
        sample_weight = np.linspace(1, y.shape[0], num=y.shape[0])

    options = {"sample_weight": sample_weight, "l2_reg_strength": l2_reg_strength}

    def shifted_loss(offset):
        """Return the loss as a function of coef, evaluated at coef - offset."""
        return lambda c: loss.loss(c - offset, X, y, **options)

    # 1. Check gradients numerically
    eps = 1e-6
    g, hessp = loss.gradient_hessian_product(coef, X, y, **options)
    # Use a trick to get central finite difference of accuracy 4 (five-point stencil)
    # https://en.wikipedia.org/wiki/Numerical_differentiation
    # https://en.wikipedia.org/wiki/Finite_difference_coefficient
    # approx_g1 = (f(x + eps) - f(x - eps)) / (2*eps)
    approx_g1 = optimize.approx_fprime(coef, shifted_loss(eps), 2 * eps)
    # approx_g2 = (f(x + 2*eps) - f(x - 2*eps)) / (4*eps)
    approx_g2 = optimize.approx_fprime(coef, shifted_loss(2 * eps), 4 * eps)
    # Five-point stencil approximation
    # See: https://en.wikipedia.org/wiki/Five-point_stencil#1D_first_derivative
    approx_g = (4 * approx_g1 - approx_g2) / 3
    assert_allclose(g, approx_g, rtol=1e-2, atol=1e-8)

    # 2. Check hessp numerically along the second direction of the gradient
    vector = np.zeros_like(g)
    vector[1] = 1
    hess_col = hessp(vector)
    # Computation of the Hessian is particularly fragile to numerical errors when doing
    # simple finite differences. Here we compute the grad along a path in the direction
    # of the vector and then use a least-square regression to estimate the slope
    half_width = 1e-3
    d_x = np.linspace(-half_width, half_width, 30)
    gradients_along_path = [
        loss.gradient(coef + t * vector, X, y, **options) for t in d_x
    ]
    d_grad = np.array(gradients_along_path)
    d_grad -= d_grad.mean(axis=0)
    slope, *_ = linalg.lstsq(d_x[:, np.newaxis], d_grad)
    approx_hess_col = slope.ravel()
    assert_allclose(approx_hess_col, hess_col, rtol=1e-3)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("fit_intercept", [False, True])
def test_multinomial_coef_shape(fit_intercept):
    """Test that multinomial LinearModelLoss respects shape of coef."""
    loss = LinearModelLoss(base_loss=HalfMultinomialLoss(), fit_intercept=fit_intercept)
    n_samples, n_features = 10, 5
    X, y, coef = random_X_y_coef(
        linear_model_loss=loss, n_samples=n_samples, n_features=n_features, seed=42
    )
    s = np.random.RandomState(42).randn(*coef.shape)

    # 2d coef: every output must keep the shape of coef.
    l, g = loss.loss_gradient(coef, X, y)
    g1 = loss.gradient(coef, X, y)
    g2, hessp = loss.gradient_hessian_product(coef, X, y)
    h = hessp(s)
    assert g.shape == coef.shape
    assert h.shape == coef.shape
    assert_allclose(g, g1)
    assert_allclose(g, g2)

    # Same checks with coef and s raveled in F-order.
    coef_r = coef.ravel(order="F")
    s_r = s.ravel(order="F")
    l_r, g_r = loss.loss_gradient(coef_r, X, y)
    g1_r = loss.gradient(coef_r, X, y)
    g2_r, hessp_r = loss.gradient_hessian_product(coef_r, X, y)
    h_r = hessp_r(s_r)
    assert g_r.shape == coef_r.shape
    assert h_r.shape == coef_r.shape
    assert_allclose(g_r, g1_r)
    assert_allclose(g_r, g2_r)

    # Both layouts must agree once the 1d results are reshaped back.
    n_classes = loss.base_loss.n_classes
    assert_allclose(g, g_r.reshape(n_classes, -1, order="F"))
    assert_allclose(h, h_r.reshape(n_classes, -1, order="F"))
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,273 @@
|
||||
# Author: Vlad Niculae
|
||||
# License: BSD 3 clause
|
||||
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.datasets import make_sparse_coded_signal
|
||||
from sklearn.linear_model import (
|
||||
LinearRegression,
|
||||
OrthogonalMatchingPursuit,
|
||||
OrthogonalMatchingPursuitCV,
|
||||
orthogonal_mp,
|
||||
orthogonal_mp_gram,
|
||||
)
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils._testing import (
|
||||
assert_allclose,
|
||||
assert_array_almost_equal,
|
||||
assert_array_equal,
|
||||
ignore_warnings,
|
||||
)
|
||||
|
||||
# Module-level fixtures shared by the tests below.
n_samples, n_features, n_nonzero_coefs, n_targets = 25, 35, 5, 3
# Note the argument mapping: the generator's `n_samples` receives n_targets,
# `n_components` receives n_features and `n_features` receives n_samples.
y, X, gamma = make_sparse_coded_signal(
    n_samples=n_targets,
    n_components=n_features,
    n_features=n_samples,
    n_nonzero_coefs=n_nonzero_coefs,
    random_state=0,
)
# Transposing makes X (n_samples, n_features) and y (n_samples, 3).
y, X, gamma = y.T, X.T, gamma.T
# Make X not of norm 1 for testing
X *= 10
y *= 10
# Precomputed Gram matrix X.T @ X and product X.T @ y for the Gram solver.
G, Xy = np.dot(X.T, X), np.dot(X.T, y)
|
||||
|
||||
|
||||
def test_correct_shapes():
    """`orthogonal_mp` returns 1d coef for one target and 2d for several."""
    single_target_coef = orthogonal_mp(X, y[:, 0], n_nonzero_coefs=5)
    assert single_target_coef.shape == (n_features,)

    multi_target_coef = orthogonal_mp(X, y, n_nonzero_coefs=5)
    assert multi_target_coef.shape == (n_features, 3)
|
||||
|
||||
|
||||
def test_correct_shapes_gram():
    """`orthogonal_mp_gram` returns 1d coef for one target and 2d for several."""
    single_target_coef = orthogonal_mp_gram(G, Xy[:, 0], n_nonzero_coefs=5)
    assert single_target_coef.shape == (n_features,)

    multi_target_coef = orthogonal_mp_gram(G, Xy, n_nonzero_coefs=5)
    assert multi_target_coef.shape == (n_features, 3)
|
||||
|
||||
|
||||
def test_n_nonzero_coefs():
    """The solution has at most `n_nonzero_coefs` non-zero entries."""
    first_target = y[:, 0]
    # Once with the default `precompute`, once with it explicitly enabled.
    for extra_options in ({}, {"precompute": True}):
        coef = orthogonal_mp(X, first_target, n_nonzero_coefs=5, **extra_options)
        assert np.count_nonzero(coef) <= 5
|
||||
|
||||
|
||||
def test_tol():
    """The squared residual norm does not exceed the requested `tol`."""
    tol = 0.5
    first_target = y[:, 0]
    # Once with the default `precompute`, once with it explicitly enabled.
    for extra_options in ({}, {"precompute": True}):
        coef = orthogonal_mp(X, first_target, tol=tol, **extra_options)
        squared_residual = np.sum((first_target - np.dot(X, coef)) ** 2)
        assert squared_residual <= tol
|
||||
|
||||
|
||||
def test_with_without_gram():
    """Same solution with and without `precompute=True` for a fixed sparsity."""
    coef_plain = orthogonal_mp(X, y, n_nonzero_coefs=5)
    coef_precomputed = orthogonal_mp(X, y, n_nonzero_coefs=5, precompute=True)
    assert_array_almost_equal(coef_plain, coef_precomputed)
|
||||
|
||||
|
||||
def test_with_without_gram_tol():
    """Same solution with and without `precompute=True` for a fixed `tol`."""
    coef_plain = orthogonal_mp(X, y, tol=1.0)
    coef_precomputed = orthogonal_mp(X, y, tol=1.0, precompute=True)
    assert_array_almost_equal(coef_plain, coef_precomputed)
|
||||
|
||||
|
||||
def test_unreachable_accuracy():
    """`tol=0` must behave like selecting `n_features` atoms.

    With `precompute=True` the comparison is expected to emit the
    "ended prematurely" RuntimeWarning.
    """
    coef_zero_tol = orthogonal_mp(X, y, tol=0)
    coef_all_atoms = orthogonal_mp(X, y, n_nonzero_coefs=n_features)
    assert_array_almost_equal(coef_zero_tol, coef_all_atoms)

    warning_message = (
        "Orthogonal matching pursuit ended prematurely due to linear"
        " dependence in the dictionary. The requested precision might"
        " not have been met."
    )
    with pytest.warns(RuntimeWarning, match=warning_message):
        coef_zero_tol_gram = orthogonal_mp(X, y, tol=0, precompute=True)
        coef_all_atoms_gram = orthogonal_mp(
            X, y, precompute=True, n_nonzero_coefs=n_features
        )
        assert_array_almost_equal(coef_zero_tol_gram, coef_all_atoms_gram)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("positional_params", [(X, y), (G, Xy)])
@pytest.mark.parametrize(
    "keyword_params",
    [{"n_nonzero_coefs": n_features + 1}],
)
def test_bad_input(positional_params, keyword_params):
    """Requesting more atoms than available must raise a ValueError."""
    first_arg, second_arg = positional_params
    with pytest.raises(ValueError):
        orthogonal_mp(first_arg, second_arg, **keyword_params)
|
||||
|
||||
|
||||
def test_perfect_signal_recovery():
    """Both solvers recover the support and values of the first true code."""
    true_code = gamma[:, 0]
    (idx,) = true_code.nonzero()
    recovered_codes = (
        orthogonal_mp(X, y[:, 0], n_nonzero_coefs=5),
        orthogonal_mp_gram(G, Xy[:, 0], n_nonzero_coefs=5),
    )

    # Support first, then values, for the plain and the Gram solver.
    for recovered in recovered_codes:
        assert_array_equal(idx, np.flatnonzero(recovered))
    for recovered in recovered_codes:
        assert_array_almost_equal(true_code, recovered, decimal=2)
|
||||
|
||||
|
||||
def test_orthogonal_mp_gram_readonly():
    """Non-regression test for read-only Gram / Xy inputs.

    See https://github.com/scikit-learn/scikit-learn/issues/5956
    """
    true_code = gamma[:, 0]
    (idx,) = true_code.nonzero()

    # Read-only copies, passed with the copy options switched off.
    G_readonly = G.copy()
    G_readonly.setflags(write=False)
    Xy_readonly = Xy.copy()
    Xy_readonly.setflags(write=False)

    gamma_gram = orthogonal_mp_gram(
        G_readonly,
        Xy_readonly[:, 0],
        n_nonzero_coefs=5,
        copy_Gram=False,
        copy_Xy=False,
    )
    assert_array_equal(idx, np.flatnonzero(gamma_gram))
    assert_array_almost_equal(true_code, gamma_gram, decimal=2)
|
||||
|
||||
|
||||
def test_estimator():
    """Check fitted shapes and sparsity of OrthogonalMatchingPursuit.

    Covers single and multiple targets, with the default settings and with
    `fit_intercept` explicitly switched on and off.
    """
    max_nonzero_multi = n_targets * n_nonzero_coefs
    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=n_nonzero_coefs)

    # Default settings, single target.
    omp.fit(X, y[:, 0])
    assert omp.coef_.shape == (n_features,)
    assert omp.intercept_.shape == ()
    assert np.count_nonzero(omp.coef_) <= n_nonzero_coefs

    # Default settings, all targets.
    omp.fit(X, y)
    assert omp.coef_.shape == (n_targets, n_features)
    assert omp.intercept_.shape == (n_targets,)
    assert np.count_nonzero(omp.coef_) <= max_nonzero_multi

    # Refitting the first target with fit_intercept=True must reproduce the
    # coefficients obtained for it in the multi-target fit.
    coef_normalized = omp.coef_[0].copy()
    omp.set_params(fit_intercept=True)
    omp.fit(X, y[:, 0])
    assert_array_almost_equal(coef_normalized, omp.coef_)

    # Without intercept, single target.
    omp.set_params(fit_intercept=False)
    omp.fit(X, y[:, 0])
    assert np.count_nonzero(omp.coef_) <= n_nonzero_coefs
    assert omp.coef_.shape == (n_features,)
    assert omp.intercept_ == 0

    # Without intercept, all targets.
    omp.fit(X, y)
    assert omp.coef_.shape == (n_targets, n_features)
    assert omp.intercept_ == 0
    assert np.count_nonzero(omp.coef_) <= max_nonzero_multi
|
||||
|
||||
|
||||
def test_estimator_n_nonzero_coefs():
    """Check `n_nonzero_coefs_` correct when `tol` is and isn't set."""
    first_target = y[:, 0]

    without_tol = OrthogonalMatchingPursuit(n_nonzero_coefs=n_nonzero_coefs)
    without_tol.fit(X, first_target)
    assert without_tol.n_nonzero_coefs_ == n_nonzero_coefs

    with_tol = OrthogonalMatchingPursuit(n_nonzero_coefs=n_nonzero_coefs, tol=0.5)
    with_tol.fit(X, first_target)
    assert with_tol.n_nonzero_coefs_ is None
|
||||
|
||||
|
||||
def test_identical_regressors():
    """Two identical columns must trigger the "ended prematurely" warning."""
    # Duplicate the first column into the second one.
    duplicated_X = X.copy()
    duplicated_X[:, 1] = duplicated_X[:, 0]

    # Target built from exactly the two duplicated atoms.
    code = np.zeros(n_features)
    code[0] = code[1] = 1.0
    target = np.dot(duplicated_X, code)

    warning_message = (
        "Orthogonal matching pursuit ended prematurely due to linear"
        " dependence in the dictionary. The requested precision might"
        " not have been met."
    )
    with pytest.warns(RuntimeWarning, match=warning_message):
        orthogonal_mp(duplicated_X, target, n_nonzero_coefs=2)
|
||||
|
||||
|
||||
def test_swapped_regressors():
    """The recovered support must be reported with the original indices."""
    code = np.zeros(n_features)
    # X[:, 21] should be selected first, then X[:, 0] selected second,
    # which will take X[:, 21]'s place in case the algorithm does
    # column swapping for optimization (which is the case at the moment)
    code[21] = 1.0
    code[0] = 0.5
    new_y = np.dot(X, code)
    new_Xy = np.dot(X.T, new_y)

    expected_support = [0, 21]
    gamma_hat = orthogonal_mp(X, new_y, n_nonzero_coefs=2)
    gamma_hat_gram = orthogonal_mp_gram(G, new_Xy, n_nonzero_coefs=2)
    assert_array_equal(np.flatnonzero(gamma_hat), expected_support)
    assert_array_equal(np.flatnonzero(gamma_hat_gram), expected_support)
|
||||
|
||||
|
||||
def test_no_atoms():
    """An all-zero target must yield all-zero coefficients for both solvers.

    Warnings raised by the solvers are silenced with `ignore_warnings`.
    """
    y_empty = np.zeros_like(y)
    Xy_empty = np.dot(X.T, y_empty)
    gamma_empty = ignore_warnings(orthogonal_mp)(X, y_empty, n_nonzero_coefs=1)
    # The (Gram, Xy) pair must go through the Gram solver; calling
    # `orthogonal_mp` here would treat G as a design matrix and leave
    # `orthogonal_mp_gram` untested.
    gamma_empty_gram = ignore_warnings(orthogonal_mp_gram)(
        G, Xy_empty, n_nonzero_coefs=1
    )
    assert np.all(gamma_empty == 0)
    assert np.all(gamma_empty_gram == 0)
|
||||
|
||||
|
||||
def test_omp_path():
    """The last slice of the returned path equals the non-path solution."""
    expected_shape = (n_features, n_targets, 5)
    # Same check for the plain solver and for the Gram solver.
    solver_calls = (
        (orthogonal_mp, (X, y)),
        (orthogonal_mp_gram, (G, Xy)),
    )
    for solver, positional in solver_calls:
        path = solver(*positional, n_nonzero_coefs=5, return_path=True)
        last = solver(*positional, n_nonzero_coefs=5, return_path=False)
        assert path.shape == expected_shape
        assert_array_almost_equal(path[:, :, -1], last)
|
||||
|
||||
|
||||
def test_omp_return_path_prop_with_gram():
    """With `precompute=True`, the last path slice equals the solution."""
    shared_options = {"n_nonzero_coefs": 5, "precompute": True}
    path = orthogonal_mp(X, y, return_path=True, **shared_options)
    last = orthogonal_mp(X, y, return_path=False, **shared_options)
    assert path.shape == (n_features, n_targets, 5)
    assert_array_almost_equal(path[:, :, -1], last)
|
||||
|
||||
|
||||
def test_omp_cv():
    """OrthogonalMatchingPursuitCV selects the true sparsity and code."""
    first_target = y[:, 0]
    true_code = gamma[:, 0]

    ompcv = OrthogonalMatchingPursuitCV(fit_intercept=False, max_iter=10)
    ompcv.fit(X, first_target)
    assert ompcv.n_nonzero_coefs_ == n_nonzero_coefs
    assert_array_almost_equal(ompcv.coef_, true_code)

    # A plain OMP with the selected sparsity must give the same coefficients.
    selected_sparsity = ompcv.n_nonzero_coefs_
    omp = OrthogonalMatchingPursuit(
        fit_intercept=False, n_nonzero_coefs=selected_sparsity
    )
    omp.fit(X, first_target)
    assert_array_almost_equal(ompcv.coef_, omp.coef_)
|
||||
|
||||
|
||||
def test_omp_reaches_least_squares():
    """With all atoms allowed, OMP coefficients match LinearRegression."""
    # Use small simple data; it's a sanity check but OMP can stop early
    rng = check_random_state(0)
    # Local sizes; the names deliberately shadow the module-level ones.
    n_samples, n_features, n_targets = 10, 8, 3
    X = rng.randn(n_samples, n_features)
    Y = rng.randn(n_samples, n_targets)

    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=n_features)
    lstsq = LinearRegression()
    for estimator in (omp, lstsq):
        estimator.fit(X, Y)
    assert_array_almost_equal(omp.coef_, lstsq.coef_)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("data_type", (np.float32, np.float64))
def test_omp_gram_dtype_match(data_type):
    """The output dtype of `orthogonal_mp_gram` matches the input dtype."""
    gram_typed = G.astype(data_type)
    xy_typed = Xy.astype(data_type)
    coef = orthogonal_mp_gram(gram_typed, xy_typed, n_nonzero_coefs=5)
    assert coef.dtype == data_type
|
||||
|
||||
|
||||
def test_omp_gram_numerical_consistency():
    """Verify numerical consistency between np.float32 and np.float64."""
    coef_32 = orthogonal_mp_gram(
        G.astype(np.float32), Xy.astype(np.float32), n_nonzero_coefs=5
    )
    # Both inputs are cast to float64 so that this call really exercises the
    # double precision path (the Gram matrix used to be cast to float32 here
    # as well, so the two fits were never compared across precisions).
    coef_64 = orthogonal_mp_gram(
        G.astype(np.float64), Xy.astype(np.float64), n_nonzero_coefs=5
    )
    # The single precision result can only agree up to float32 accuracy.
    assert_allclose(coef_32, coef_64, rtol=1e-4, atol=1e-5)
|
||||
@@ -0,0 +1,278 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.base import ClassifierMixin
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.linear_model import PassiveAggressiveClassifier, PassiveAggressiveRegressor
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils._testing import (
|
||||
assert_almost_equal,
|
||||
assert_array_almost_equal,
|
||||
assert_array_equal,
|
||||
)
|
||||
from sklearn.utils.fixes import CSR_CONTAINERS
|
||||
|
||||
# Module-level fixtures: the iris samples and targets, with rows shuffled by
# a fixed-seed permutation so every test sees the same ordering.
iris = load_iris()
random_state = check_random_state(12)
indices = np.arange(iris.data.shape[0])
random_state.shuffle(indices)
X = iris.data[indices]
y = iris.target[indices]
|
||||
|
||||
|
||||
class MyPassiveAggressive(ClassifierMixin):
    """Plain NumPy passive-aggressive learner used as a test reference.

    After `fit`, the weights are stored in `w` and the bias in `b`.
    """

    def __init__(
        self,
        C=1.0,
        epsilon=0.01,
        loss="hinge",
        fit_intercept=True,
        n_iter=1,
        random_state=None,
    ):
        # `random_state` is accepted but not stored.
        self.C = C
        self.epsilon = epsilon
        self.loss = loss
        self.fit_intercept = fit_intercept
        self.n_iter = n_iter

    def fit(self, X, y):
        """Run `n_iter` passes over the rows of X, updating `w` and `b`."""
        n_samples, n_features = X.shape
        self.w = np.zeros(n_features, dtype=np.float64)
        self.b = 0.0

        for _ in range(self.n_iter):
            for i in range(n_samples):
                sample, target = X[i], y[i]
                p = self.project(sample)
                hinge_like = self.loss in ("hinge", "squared_hinge")

                # Loss suffered on the current sample.
                if hinge_like:
                    loss = max(1 - target * p, 0)
                else:
                    loss = max(np.abs(p - target) - self.epsilon, 0)

                sqnorm = np.dot(sample, sample)

                # Step size: capped by C for the plain losses, smoothed by
                # 1 / (2 * C) for the squared ones.
                if self.loss in ("hinge", "epsilon_insensitive"):
                    step = min(self.C, loss / sqnorm)
                elif self.loss in ("squared_hinge", "squared_epsilon_insensitive"):
                    step = loss / (sqnorm + 1.0 / (2 * self.C))

                # Direction of the update.
                if hinge_like:
                    step *= target
                else:
                    step *= np.sign(target - p)

                self.w += step * sample
                if self.fit_intercept:
                    self.b += step

    def project(self, X):
        """Return the linear score ``X . w + b``."""
        return np.dot(X, self.w) + self.b
|
||||
|
||||
|
||||
@pytest.mark.parametrize("average", [False, True])
@pytest.mark.parametrize("fit_intercept", [True, False])
@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS])
def test_classifier_accuracy(csr_container, fit_intercept, average):
    """The classifier reaches a training accuracy above 0.79 on iris."""
    data = X if csr_container is None else csr_container(X)
    settings = {
        "C": 1.0,
        "max_iter": 30,
        "fit_intercept": fit_intercept,
        "random_state": 1,
        "average": average,
        "tol": None,
    }
    clf = PassiveAggressiveClassifier(**settings)
    clf.fit(data, y)
    score = clf.score(data, y)
    assert score > 0.79
    if average:
        # Averaging must expose both the averaged and the standard weights.
        for attribute in (
            "_average_coef",
            "_average_intercept",
            "_standard_intercept",
            "_standard_coef",
        ):
            assert hasattr(clf, attribute)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("average", [False, True])
@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS])
def test_classifier_partial_fit(csr_container, average):
    """Thirty `partial_fit` passes reach a training accuracy above 0.79."""
    classes = np.unique(y)
    data = X if csr_container is None else csr_container(X)
    clf = PassiveAggressiveClassifier(random_state=0, average=average, max_iter=5)
    n_passes = 30
    for _ in range(n_passes):
        clf.partial_fit(data, y, classes)
    score = clf.score(data, y)
    assert score > 0.79
    if average:
        # Averaging must expose both the averaged and the standard weights.
        for attribute in (
            "_average_coef",
            "_average_intercept",
            "_standard_intercept",
            "_standard_coef",
        ):
            assert hasattr(clf, attribute)
|
||||
|
||||
|
||||
def test_classifier_refit():
    """Classifier can be retrained on different labels and features."""
    clf = PassiveAggressiveClassifier(max_iter=5)
    clf.fit(X, y)
    assert_array_equal(clf.classes_, np.unique(y))

    # Refit on one feature less and on string labels.
    fewer_features = X[:, :-1]
    string_labels = iris.target_names[y]
    clf.fit(fewer_features, string_labels)
    assert_array_equal(clf.classes_, iris.target_names)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS])
@pytest.mark.parametrize("loss", ("hinge", "squared_hinge"))
def test_classifier_correctness(loss, csr_container):
    """The estimator's weights match the reference implementation."""
    # Binary problem: class 1 against the rest, encoded as +1 / -1.
    y_bin = y.copy()
    y_bin[y != 1] = -1

    reference = MyPassiveAggressive(loss=loss, n_iter=2)
    reference.fit(X, y_bin)

    data = X if csr_container is None else csr_container(X)
    estimator = PassiveAggressiveClassifier(
        loss=loss, max_iter=2, shuffle=False, tol=None
    )
    estimator.fit(data, y_bin)

    assert_array_almost_equal(reference.w, estimator.coef_.ravel(), decimal=2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "response_method", ["predict_proba", "predict_log_proba", "transform"]
)
def test_classifier_undefined_methods(response_method):
    """Accessing an unsupported method must raise an AttributeError."""
    classifier = PassiveAggressiveClassifier(max_iter=100)
    with pytest.raises(AttributeError):
        getattr(classifier, response_method)
|
||||
|
||||
|
||||
def test_class_weights():
    """Down-weighting class 1 must flip the prediction for a fixed query point."""
    samples = np.array(
        [[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]
    )
    targets = [1, 1, 1, -1, -1]
    query = [[0.2, -1.0]]
    shared = dict(C=0.1, max_iter=100, random_state=100)

    # Without class weights the query point is assigned to class 1.
    unweighted = PassiveAggressiveClassifier(class_weight=None, **shared)
    unweighted.fit(samples, targets)
    assert_array_equal(unweighted.predict(query), np.array([1]))

    # With a very small weight on class 1 the hyperplane should rotate
    # clockwise, so the prediction for the same point shifts to -1.
    weighted = PassiveAggressiveClassifier(class_weight={1: 0.001}, **shared)
    weighted.fit(samples, targets)
    assert_array_equal(weighted.predict(query), np.array([-1]))
|
||||
|
||||
|
||||
def test_partial_fit_weight_class_balanced():
    """`partial_fit` with `class_weight="balanced"` must raise ValueError."""
    estimator = PassiveAggressiveClassifier(class_weight="balanced", max_iter=100)
    known_classes = np.unique(y)
    with pytest.raises(ValueError):
        estimator.partial_fit(X, y, classes=known_classes)
|
||||
|
||||
|
||||
def test_equal_class_weight():
    """No weights, "balanced" and explicit equal weights give close coefficients."""
    samples = [[1, 0], [1, 0], [0, 1], [0, 1]]
    targets = [0, 0, 1, 1]

    def fit_with(class_weight):
        model = PassiveAggressiveClassifier(
            C=0.1, tol=None, class_weight=class_weight
        )
        model.fit(samples, targets)
        return model

    # Fit order is kept: unweighted, balanced, explicit weights.
    plain = fit_with(None)
    # The data is already balanced, so "balanced" weights should have no effect.
    balanced = fit_with("balanced")
    explicit = fit_with({0: 0.5, 1: 0.5})

    # Should be similar up to some epsilon due to the learning rate schedule.
    assert_almost_equal(plain.coef_, explicit.coef_, decimal=2)
    assert_almost_equal(plain.coef_, balanced.coef_, decimal=2)
|
||||
|
||||
|
||||
def test_wrong_class_weight_label():
    """A `class_weight` key absent from the targets must raise ValueError."""
    samples = np.array(
        [[-1.0, -1.0], [-1.0, 0], [-0.8, -1.0], [1.0, 1.0], [1.0, 0.0]]
    )
    targets = [1, 1, 1, -1, -1]

    # Label 0 does not occur in `targets`.
    estimator = PassiveAggressiveClassifier(class_weight={0: 0.5}, max_iter=100)
    with pytest.raises(ValueError):
        estimator.fit(samples, targets)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("average", [False, True])
@pytest.mark.parametrize("fit_intercept", [True, False])
@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS])
def test_regressor_mse(csr_container, fit_intercept, average):
    """Training mean squared error of the regressor must stay below 1.7."""
    # Signed target: class 1 stays 1, every other class becomes -1.
    signed_target = np.where(y == 1, y, -1)
    features = X if csr_container is None else csr_container(X)

    settings = dict(
        C=1.0,
        fit_intercept=fit_intercept,
        random_state=0,
        average=average,
        max_iter=5,
    )
    regressor = PassiveAggressiveRegressor(**settings)
    regressor.fit(features, signed_target)

    residuals = regressor.predict(features) - signed_target
    assert np.mean(residuals**2) < 1.7
    if not average:
        return
    for attr in (
        "_average_coef",
        "_average_intercept",
        "_standard_intercept",
        "_standard_coef",
    ):
        assert hasattr(regressor, attr)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("average", [False, True])
@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS])
def test_regressor_partial_fit(csr_container, average):
    """After 50 `partial_fit` passes the training MSE must stay below 1.7."""
    # Signed target: class 1 stays 1, every other class becomes -1.
    signed_target = np.where(y == 1, y, -1)
    features = X if csr_container is None else csr_container(X)

    regressor = PassiveAggressiveRegressor(
        random_state=0, average=average, max_iter=100
    )
    for _ in range(50):
        regressor.partial_fit(features, signed_target)

    residuals = regressor.predict(features) - signed_target
    assert np.mean(residuals**2) < 1.7
    if not average:
        return
    for attr in (
        "_average_coef",
        "_average_intercept",
        "_standard_intercept",
        "_standard_coef",
    ):
        assert hasattr(regressor, attr)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", [None, *CSR_CONTAINERS])
@pytest.mark.parametrize("loss", ("epsilon_insensitive", "squared_epsilon_insensitive"))
def test_regressor_correctness(loss, csr_container):
    """Library weights must match the reference implementation to 2 decimals."""
    # Signed target: class 1 stays 1, every other class becomes -1.
    signed_target = np.where(y == 1, y, -1)

    reference = MyPassiveAggressive(loss=loss, n_iter=2)
    reference.fit(X, signed_target)

    features = X if csr_container is None else csr_container(X)
    regressor = PassiveAggressiveRegressor(
        tol=None, loss=loss, max_iter=2, shuffle=False
    )
    regressor.fit(features, signed_target)

    assert_array_almost_equal(reference.w, regressor.coef_.ravel(), decimal=2)
|
||||
|
||||
|
||||
def test_regressor_undefined_methods():
    """Calling `transform` on the regressor must raise AttributeError."""
    regressor = PassiveAggressiveRegressor(max_iter=100)
    with pytest.raises(AttributeError):
        regressor.transform(X)
|
||||
|
||||
|
||||
# TODO(1.7): remove
@pytest.mark.parametrize(
    "Estimator", [PassiveAggressiveClassifier, PassiveAggressiveRegressor]
)
def test_passive_aggressive_deprecated_average(Estimator):
    """Fitting with `average=0` must emit a FutureWarning mentioning it."""
    estimator = Estimator(average=0)
    with pytest.warns(FutureWarning, match="average=0"):
        estimator.fit(X, y)
|
||||
@@ -0,0 +1,88 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.linear_model import Perceptron
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils._testing import assert_allclose, assert_array_almost_equal
|
||||
from sklearn.utils.fixes import CSR_CONTAINERS
|
||||
|
||||
# Shared fixture data: the iris dataset with its rows shuffled by a
# fixed-seed random state so every test sees the same ordering.
iris = load_iris()
random_state = check_random_state(12)
indices = np.arange(len(iris.data))
random_state.shuffle(indices)
X, y = iris.data[indices], iris.target[indices]
|
||||
|
||||
|
||||
class MyPerceptron:
    """Minimal reference perceptron used to cross-check the library estimator.

    Parameters
    ----------
    n_iter : int, default=1
        Number of full passes over the training samples.

    After `fit`, the weight vector is stored in `w` and the bias in `b`.
    """

    def __init__(self, n_iter=1):
        self.n_iter = n_iter

    def fit(self, X, y):
        """Run `n_iter` passes, updating on every sample that is mispredicted."""
        n_samples, n_features = X.shape
        self.w = np.zeros(n_features, dtype=np.float64)
        self.b = 0.0

        for _epoch in range(self.n_iter):
            for row in range(n_samples):
                sample, target = X[row], y[row]
                if self.predict(sample)[0] == target:
                    continue
                # Mistake-driven update: move the hyperplane toward the sample.
                self.w += target * sample
                self.b += target

    def project(self, X):
        """Return the raw decision values ``X . w + b``."""
        return self.b + np.dot(X, self.w)

    def predict(self, X):
        """Return the sign of the decision value for each row of `X`."""
        rows = np.atleast_2d(X)
        return np.sign(self.project(rows))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("container", CSR_CONTAINERS + [np.array])
def test_perceptron_accuracy(container):
    """The perceptron must score above 0.7 on its own training data."""
    features = container(X)
    model = Perceptron(max_iter=100, tol=None, shuffle=False)
    model.fit(features, y)
    assert model.score(features, y) > 0.7
|
||||
|
||||
|
||||
def test_perceptron_correctness():
    """Library weights must match the reference `MyPerceptron` weights."""
    # Signed target: class 1 stays 1, every other class becomes -1.
    signed_target = np.where(y == 1, y, -1)

    reference = MyPerceptron(n_iter=2)
    reference.fit(X, signed_target)

    estimator = Perceptron(max_iter=2, shuffle=False, tol=None)
    estimator.fit(X, signed_target)

    assert_array_almost_equal(reference.w, estimator.coef_.ravel())
|
||||
|
||||
|
||||
def test_undefined_methods():
    """Probability methods are not exposed by the perceptron."""
    model = Perceptron(max_iter=100)
    unsupported = ("predict_proba", "predict_log_proba")
    for method_name in unsupported:
        with pytest.raises(AttributeError):
            getattr(model, method_name)
|
||||
|
||||
|
||||
def test_perceptron_l1_ratio():
    """Check that `l1_ratio` has an impact when `penalty='elasticnet'`"""
    pure_l2_mix = Perceptron(l1_ratio=0, penalty="elasticnet")
    pure_l2_mix.fit(X, y)

    partial_mix = Perceptron(l1_ratio=0.15, penalty="elasticnet")
    partial_mix.fit(X, y)

    assert pure_l2_mix.score(X, y) != partial_mix.score(X, y)

    # At the bounds of `l1_ratio`, elastic net should coincide with a plain
    # l1 penalty (ratio 1) or a plain l2 penalty (ratio 0).
    l1_model = Perceptron(penalty="l1").fit(X, y)
    enet_as_l1 = Perceptron(l1_ratio=1, penalty="elasticnet").fit(X, y)
    assert_allclose(l1_model.coef_, enet_as_l1.coef_)

    l2_model = Perceptron(penalty="l2").fit(X, y)
    enet_as_l2 = Perceptron(l1_ratio=0, penalty="elasticnet").fit(X, y)
    assert_allclose(l2_model.coef_, enet_as_l2.coef_)
|
||||
@@ -0,0 +1,306 @@
|
||||
# Authors: David Dale <dale.david@mail.ru>
|
||||
# Christian Lorentzen <lorentzen.ch@gmail.com>
|
||||
# License: BSD 3 clause
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from pytest import approx
|
||||
from scipy.optimize import minimize
|
||||
|
||||
from sklearn.datasets import make_regression
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from sklearn.linear_model import HuberRegressor, QuantileRegressor
|
||||
from sklearn.metrics import mean_pinball_loss
|
||||
from sklearn.utils._testing import assert_allclose, skip_if_32bit
|
||||
from sklearn.utils.fixes import (
|
||||
COO_CONTAINERS,
|
||||
CSC_CONTAINERS,
|
||||
CSR_CONTAINERS,
|
||||
parse_version,
|
||||
sp_version,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
def X_y_data():
    """Return a small single-feature regression problem as ``(X, y)``."""
    return make_regression(n_samples=10, n_features=1, random_state=0, noise=1)
|
||||
|
||||
|
||||
@pytest.fixture
def default_solver():
    """Return "highs" for SciPy >= 1.6.0, otherwise "interior-point"."""
    if sp_version < parse_version("1.6.0"):
        return "interior-point"
    return "highs"
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    parse_version(sp_version.base_version) >= parse_version("1.11"),
    reason="interior-point solver is not available in SciPy 1.11",
)
@pytest.mark.parametrize("solver", ["interior-point", "revised simplex"])
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_incompatible_solver_for_sparse_input(X_y_data, solver, csc_container):
    """Solvers without sparse support must reject sparse `X` with ValueError."""
    dense_X, target = X_y_data
    sparse_X = csc_container(dense_X)
    regressor = QuantileRegressor(solver=solver)
    expected = (
        f"Solver {solver} does not support sparse X. Use solver 'highs' for example."
    )
    with pytest.raises(ValueError, match=expected):
        regressor.fit(sparse_X, target)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("solver", ("highs-ds", "highs-ipm", "highs"))
@pytest.mark.skipif(
    sp_version >= parse_version("1.6.0"),
    reason="Solvers are available as of scipy 1.6.0",
)
def test_too_new_solver_methods_raise_error(X_y_data, solver):
    """Test that highs solver raises for scipy<1.6.0."""
    features, target = X_y_data
    regressor = QuantileRegressor(solver=solver)
    with pytest.raises(ValueError, match="scipy>=1.6.0"):
        regressor.fit(features, target)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "quantile, alpha, intercept, coef",
    [
        # for 50% quantile w/o regularization, any slope in [1, 10] is okay
        [0.5, 0, 1, None],
        # if positive error costs more, the slope is maximal
        [0.51, 0, 1, 10],
        # if negative error costs more, the slope is minimal
        [0.49, 0, 1, 1],
        # for a small lasso penalty, the slope is also minimal
        [0.5, 0.01, 1, 1],
        # for a large lasso penalty, the model predicts the constant median
        [0.5, 100, 2, 0],
    ],
)
def test_quantile_toy_example(quantile, alpha, intercept, coef, default_solver):
    """Check intercept and slope on a three-point example for several settings."""
    features = [[0], [1], [1]]
    targets = [1, 2, 11]
    regressor = QuantileRegressor(
        quantile=quantile, alpha=alpha, solver=default_solver
    )
    regressor.fit(features, targets)

    slope = regressor.coef_[0]
    assert_allclose(regressor.intercept_, intercept, atol=1e-2)
    if coef is not None:
        assert_allclose(slope, coef, atol=1e-2)
    # The lower bound on the slope only applies below the large-penalty case.
    if alpha < 100:
        assert slope >= 1
    assert slope <= 10
|
||||
|
||||
|
||||
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_quantile_equals_huber_for_low_epsilon(fit_intercept, default_solver):
    """Quantile regression must be close to Huber regression with epsilon ~ 1."""
    features, target = make_regression(
        n_samples=100, n_features=20, random_state=0, noise=1.0
    )
    penalty = 1e-4
    huber = HuberRegressor(
        epsilon=1 + 1e-4, alpha=penalty, fit_intercept=fit_intercept
    )
    huber.fit(features, target)
    quant = QuantileRegressor(
        alpha=penalty, fit_intercept=fit_intercept, solver=default_solver
    )
    quant.fit(features, target)

    assert_allclose(huber.coef_, quant.coef_, atol=1e-1)
    if fit_intercept:
        assert huber.intercept_ == approx(quant.intercept_, abs=1e-1)
        # check that we still predict fraction
        share_below = np.mean(target < quant.predict(features))
        assert share_below == approx(0.5, abs=1e-1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("q", [0.5, 0.9, 0.05])
def test_quantile_estimates_calibration(q, default_solver):
    """The share of targets below the prediction must be close to `q`."""
    features, target = make_regression(
        n_samples=1000, n_features=20, random_state=0, noise=1.0
    )
    regressor = QuantileRegressor(quantile=q, alpha=0, solver=default_solver)
    regressor.fit(features, target)

    share_below = np.mean(target < regressor.predict(features))
    assert share_below == approx(q, abs=1e-2)
|
||||
|
||||
|
||||
def test_quantile_sample_weight(default_solver):
    """With unequal sample weights the weighted fraction below is still 0.5."""
    n_samples = 1000
    features, target = make_regression(
        n_samples=n_samples, n_features=5, random_state=0, noise=10.0
    )
    # Up-weighting the upper observations should push the estimated
    # quantile upwards.
    weights = np.ones(n_samples)
    weights[target > target.mean()] = 100

    regressor = QuantileRegressor(quantile=0.5, alpha=1e-8, solver=default_solver)
    regressor.fit(features, target, sample_weight=weights)

    is_below = target < regressor.predict(features)
    # Unweighted, more than half of the targets end up below the prediction ...
    assert np.mean(is_below) > 0.5
    # ... while the weighted share stays close to the requested quantile.
    assert np.average(is_below, weights=weights) == approx(0.5, abs=3e-2)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    sp_version < parse_version("1.6.0"),
    reason="The `highs` solver is available from the 1.6.0 scipy version",
)
@pytest.mark.parametrize("quantile", [0.2, 0.5, 0.8])
def test_asymmetric_error(quantile, default_solver):
    """Test quantile regression for asymmetric distributed targets."""
    n_samples = 1000
    rng = np.random.RandomState(42)
    # Draw order matters for reproducibility: normal feature first, then the
    # binary one.
    positive_feature = np.abs(rng.randn(n_samples)[:, None])
    negated_binary_feature = -rng.randint(2, size=(n_samples, 1))
    X = np.concatenate((positive_feature, negated_binary_feature), axis=1)

    intercept = 1.23
    coef = np.array([0.5, -2])
    true_quantile = X @ coef + intercept
    # Take care that X @ coef + intercept > 0
    assert np.min(true_quantile) > 0

    # For an exponential distribution with rate lambda, e.g. exp(-lambda * x),
    # the quantile at level q is:
    #   quantile(q) = - log(1 - q) / lambda
    #   scale = 1/lambda = -quantile(q) / log(1 - q)
    y = rng.exponential(scale=-true_quantile / np.log(1 - quantile), size=n_samples)

    # This test can be made to pass with any solver but in the interest
    # of sparing continuous integration resources, the test is performed
    # with the fastest solver only.
    model = QuantileRegressor(quantile=quantile, alpha=0, solver=default_solver)
    model.fit(X, y)

    assert model.intercept_ == approx(intercept, rel=0.2)
    assert_allclose(model.coef_, coef, rtol=0.6)
    assert_allclose(np.mean(model.predict(X) > y), quantile, atol=1e-2)

    # Now compare to Nelder-Mead optimization with L1 penalty
    alpha = 0.01
    model.set_params(alpha=alpha).fit(X, y)
    model_coef = np.r_[model.intercept_, model.coef_]

    def func(coef):
        # coef[0] is the intercept; only the slopes coef[1:] are penalized.
        loss = mean_pinball_loss(y, X @ coef[1:] + coef[0], alpha=quantile)
        L1 = np.sum(np.abs(coef[1:]))
        return loss + alpha * L1

    res = minimize(
        fun=func,
        x0=[1, 0, -1],
        method="Nelder-Mead",
        tol=1e-12,
        options={"maxiter": 2000},
    )
    reference_intercept, reference_coef = res.x[0], res.x[1:]

    assert func(model_coef) == approx(func(res.x))
    assert_allclose(model.intercept_, reference_intercept)
    assert_allclose(model.coef_, reference_coef)
    assert_allclose(np.mean(model.predict(X) > y), quantile, atol=1e-2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("quantile", [0.2, 0.5, 0.8])
def test_equivariance(quantile, default_solver):
    """Test equivariance of quantile regression.

    See Koenker (2005) Quantile Regression, Chapter 2.2.3.
    """
    rng = np.random.RandomState(42)
    n_samples, n_features = 100, 5
    X, y = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_features,
        noise=0,
        random_state=rng,
        shuffle=False,
    )
    # make y asymmetric
    y = y + rng.exponential(scale=100, size=y.shape)

    def new_model(level):
        return QuantileRegressor(quantile=level, alpha=0, solver=default_solver)

    baseline = new_model(quantile).fit(X, y)

    # coef(q; a*y, X) = a * coef(q; y, X)
    a = 2.5
    scaled = new_model(quantile).fit(X, a * y)
    assert scaled.intercept_ == approx(a * baseline.intercept_, rel=1e-5)
    assert_allclose(scaled.coef_, a * baseline.coef_, rtol=1e-5)

    # coef(1-q; -a*y, X) = -a * coef(q; y, X)
    mirrored = new_model(1 - quantile).fit(X, -a * y)
    assert mirrored.intercept_ == approx(-a * baseline.intercept_, rel=1e-5)
    assert_allclose(mirrored.coef_, -a * baseline.coef_, rtol=1e-5)

    # coef(q; y + X @ g, X) = coef(q; y, X) + g
    g_intercept, g_coef = rng.randn(), rng.randn(n_features)
    shifted = new_model(quantile)
    shifted.fit(X, y + X @ g_coef + g_intercept)
    assert shifted.intercept_ == approx(baseline.intercept_ + g_intercept)
    assert_allclose(shifted.coef_, baseline.coef_ + g_coef, rtol=1e-6)

    # coef(q; y, X @ A) = A^-1 @ coef(q; y, X)
    A = rng.randn(n_features, n_features)
    reparametrized = new_model(quantile)
    reparametrized.fit(X @ A, y)
    assert reparametrized.intercept_ == approx(baseline.intercept_, rel=1e-5)
    assert_allclose(
        reparametrized.coef_, np.linalg.solve(A, baseline.coef_), rtol=1e-5
    )
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    parse_version(sp_version.base_version) >= parse_version("1.11"),
    reason="interior-point solver is not available in SciPy 1.11",
)
@pytest.mark.filterwarnings("ignore:`method='interior-point'` is deprecated")
def test_linprog_failure():
    """A solver capped at one iteration must emit a ConvergenceWarning."""
    grid = np.linspace(0, 10, num=10)
    features = grid.reshape(-1, 1)
    target = np.linspace(0, 10, num=10)
    regressor = QuantileRegressor(
        alpha=0, solver="interior-point", solver_options={"maxiter": 1}
    )

    expected = "Linear programming for QuantileRegressor did not succeed."
    with pytest.warns(ConvergenceWarning, match=expected):
        regressor.fit(features, target)
|
||||
|
||||
|
||||
@skip_if_32bit
@pytest.mark.skipif(
    # Strict comparison: the highs solvers exist in SciPy 1.6.0 itself, so
    # that version must run the test (consistent with `default_solver`).
    sp_version < parse_version("1.6.0"),
    reason="Solvers are available as of scipy 1.6.0",
)
@pytest.mark.parametrize(
    "sparse_container", CSC_CONTAINERS + CSR_CONTAINERS + COO_CONTAINERS
)
@pytest.mark.parametrize("solver", ["highs", "highs-ds", "highs-ipm"])
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_sparse_input(sparse_container, solver, fit_intercept, default_solver):
    """Test that sparse and dense X give same results.

    The dense fit uses `default_solver`; the sparse fit uses the
    parametrized highs `solver`.
    """
    X, y = make_regression(n_samples=100, n_features=20, random_state=1, noise=1.0)
    X_sparse = sparse_container(X)
    alpha = 1e-4
    quant_dense = QuantileRegressor(
        alpha=alpha, fit_intercept=fit_intercept, solver=default_solver
    ).fit(X, y)
    quant_sparse = QuantileRegressor(
        alpha=alpha, fit_intercept=fit_intercept, solver=solver
    ).fit(X_sparse, y)
    assert_allclose(quant_sparse.coef_, quant_dense.coef_, rtol=1e-2)
    if fit_intercept:
        assert quant_sparse.intercept_ == approx(quant_dense.intercept_)
        # check that we still predict fraction
        assert 0.45 <= np.mean(y < quant_sparse.predict(X_sparse)) <= 0.57
|
||||
|
||||
|
||||
def test_error_interior_point_future(X_y_data, monkeypatch):
    """Check that we will raise a proper error when requesting
    `solver='interior-point'` in SciPy >= 1.11.
    """
    import sklearn.linear_model._quantile as quantile_module

    features, target = X_y_data
    expected = "Solver interior-point is not anymore available in SciPy >= 1.11.0."

    with monkeypatch.context() as patcher:
        # Make the estimator module see SciPy 1.11.0.
        patcher.setattr(quantile_module, "sp_version", parse_version("1.11.0"))
        with pytest.raises(ValueError, match=expected):
            QuantileRegressor(solver="interior-point").fit(features, target)
|
||||
@@ -0,0 +1,545 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_array_almost_equal, assert_array_equal
|
||||
|
||||
from sklearn.datasets import make_regression
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from sklearn.linear_model import (
|
||||
LinearRegression,
|
||||
OrthogonalMatchingPursuit,
|
||||
RANSACRegressor,
|
||||
Ridge,
|
||||
)
|
||||
from sklearn.linear_model._ransac import _dynamic_max_trials
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS
|
||||
|
||||
# Points lying exactly on the line y = 0.2 * x + 20, stored as (x, y) rows.
X = np.arange(-200, 200)
y = 20 + X * 0.2
data = np.column_stack((X, y))

# Corrupt a random subset of rows: shift both coordinates by 50 plus
# uniform noise drawn from [0, 10).
rng = np.random.RandomState(1000)
outliers = np.unique(rng.randint(X.shape[0], size=200))
data[outliers] += 10 * rng.rand(outliers.shape[0], 2) + 50

# Split back into a single-column feature matrix and a 1-D target.
X = data[:, 0].reshape(-1, 1)
y = data[:, 1]
|
||||
|
||||
|
||||
def test_ransac_inliers_outliers():
    """The fitted inlier mask must exclude exactly the corrupted rows."""
    ransac = RANSACRegressor(
        LinearRegression(), min_samples=2, residual_threshold=5, random_state=0
    )

    # Estimate parameters of corrupted data
    ransac.fit(X, y)

    # Ground truth / reference inlier mask
    expected_mask = np.ones_like(ransac.inlier_mask_, dtype=np.bool_)
    expected_mask[outliers] = False

    assert_array_equal(ransac.inlier_mask_, expected_mask)
|
||||
|
||||
|
||||
def test_ransac_is_data_valid():
    """If `is_data_valid` always rejects the subset, `fit` raises ValueError."""

    def is_data_valid(X, y):
        # Each candidate subset must contain exactly `min_samples` rows.
        assert X.shape[0] == 2
        assert y.shape[0] == 2
        return False

    local_rng = np.random.RandomState(0)
    features = local_rng.rand(10, 2)
    target = local_rng.rand(10, 1)

    ransac = RANSACRegressor(
        LinearRegression(),
        min_samples=2,
        residual_threshold=5,
        is_data_valid=is_data_valid,
        random_state=0,
    )
    with pytest.raises(ValueError):
        ransac.fit(features, target)
|
||||
|
||||
|
||||
def test_ransac_is_model_valid():
    """If `is_model_valid` always rejects the model, `fit` raises ValueError."""

    def is_model_valid(estimator, X, y):
        # Each candidate subset must contain exactly `min_samples` rows.
        assert X.shape[0] == 2
        assert y.shape[0] == 2
        return False

    ransac = RANSACRegressor(
        LinearRegression(),
        min_samples=2,
        residual_threshold=5,
        is_model_valid=is_model_valid,
        random_state=0,
    )
    with pytest.raises(ValueError):
        ransac.fit(X, y)
|
||||
|
||||
|
||||
def test_ransac_max_trials():
    """`max_trials=0` is rejected and `n_trials_` respects the dynamic bound."""
    base = LinearRegression()

    no_trials = RANSACRegressor(
        base,
        min_samples=2,
        residual_threshold=5,
        max_trials=0,
        random_state=0,
    )
    with pytest.raises(ValueError):
        no_trials.fit(X, y)

    # There is a 1e-9 chance it will take this many trials. A looser
    # probability such as 1e-2 is not enough: exceeding it can still happen.
    # 2 is what RANSAC defines as min_samples = X.shape[1] + 1.
    n_inliers = len(X) - len(outliers)
    max_trials = _dynamic_max_trials(n_inliers, X.shape[0], 2, 1 - 1e-9)

    ransac = RANSACRegressor(base, min_samples=2)
    for seed in range(50):
        ransac.set_params(min_samples=2, random_state=seed)
        ransac.fit(X, y)
        assert ransac.n_trials_ < max_trials + 1
|
||||
|
||||
|
||||
def test_ransac_stop_n_inliers():
    """With `stop_n_inliers=2` the fit must finish after a single trial."""
    settings = dict(
        min_samples=2,
        residual_threshold=5,
        stop_n_inliers=2,
        random_state=0,
    )
    ransac = RANSACRegressor(LinearRegression(), **settings)
    ransac.fit(X, y)

    assert ransac.n_trials_ == 1
|
||||
|
||||
|
||||
def test_ransac_stop_score():
    """With `stop_score=0` the fit must finish after a single trial."""
    settings = dict(
        min_samples=2,
        residual_threshold=5,
        stop_score=0,
        random_state=0,
    )
    ransac = RANSACRegressor(LinearRegression(), **settings)
    ransac.fit(X, y)

    assert ransac.n_trials_ == 1
|
||||
|
||||
|
||||
def test_ransac_score():
    """`score` is 1 on the all-zero rows and below 1 on the two perturbed rows."""
    features = np.arange(100)[:, None]
    # All-zero target except for the first two entries.
    target = np.zeros((100,))
    target[:2] = (1, 100)

    ransac = RANSACRegressor(
        LinearRegression(), min_samples=2, residual_threshold=0.5, random_state=0
    )
    ransac.fit(features, target)

    assert ransac.score(features[2:], target[2:]) == 1
    assert ransac.score(features[:2], target[:2]) < 1
|
||||
|
||||
|
||||
def test_ransac_predict():
    """Predictions must be all zeros despite the two perturbed targets."""
    features = np.arange(100)[:, None]
    # All-zero target except for the first two entries.
    target = np.zeros((100,))
    target[:2] = (1, 100)

    ransac = RANSACRegressor(
        LinearRegression(), min_samples=2, residual_threshold=0.5, random_state=0
    )
    ransac.fit(features, target)

    assert_array_equal(ransac.predict(features), np.zeros(100))
|
||||
|
||||
|
||||
def test_ransac_no_valid_data():
    """Rejecting every subset fails the fit and counts 5 invalid-data skips."""

    def is_data_valid(X, y):
        return False

    ransac = RANSACRegressor(
        LinearRegression(), is_data_valid=is_data_valid, max_trials=5
    )

    with pytest.raises(ValueError, match="RANSAC could not find a valid consensus set"):
        ransac.fit(X, y)

    skip_counts = (
        ransac.n_skips_no_inliers_,
        ransac.n_skips_invalid_data_,
        ransac.n_skips_invalid_model_,
    )
    assert skip_counts == (0, 5, 0)
|
||||
|
||||
|
||||
def test_ransac_no_valid_model():
    """Rejecting every model fails the fit and counts 5 invalid-model skips."""

    def is_model_valid(estimator, X, y):
        return False

    ransac = RANSACRegressor(
        LinearRegression(), is_model_valid=is_model_valid, max_trials=5
    )

    with pytest.raises(ValueError, match="RANSAC could not find a valid consensus set"):
        ransac.fit(X, y)

    skip_counts = (
        ransac.n_skips_no_inliers_,
        ransac.n_skips_invalid_data_,
        ransac.n_skips_invalid_model_,
    )
    assert skip_counts == (0, 0, 5)
|
||||
|
||||
|
||||
def test_ransac_exceed_max_skips():
    """Exceeding `max_skips=3` fails the fit after 4 invalid-data skips."""

    def is_data_valid(X, y):
        return False

    ransac = RANSACRegressor(
        LinearRegression(), is_data_valid=is_data_valid, max_trials=5, max_skips=3
    )

    expected = "RANSAC skipped more iterations than `max_skips`"
    with pytest.raises(ValueError, match=expected):
        ransac.fit(X, y)

    skip_counts = (
        ransac.n_skips_no_inliers_,
        ransac.n_skips_invalid_data_,
        ransac.n_skips_invalid_model_,
    )
    assert skip_counts == (0, 4, 0)
|
||||
|
||||
|
||||
def test_ransac_warn_exceed_max_skips():
    """A consensus set followed by too many skips must warn, not raise.

    `is_data_valid` accepts only the first subset it is given, so one valid
    trial is followed by skips until `max_skips` is exceeded.
    """
    # Closure state instead of a module-level `global`, so the test does not
    # leak a `cause_skip` name into the module namespace.
    cause_skip = False

    def is_data_valid(X, y):
        nonlocal cause_skip
        if not cause_skip:
            cause_skip = True
            return True
        return False

    estimator = LinearRegression()
    ransac_estimator = RANSACRegressor(
        estimator, is_data_valid=is_data_valid, max_skips=3, max_trials=5
    )
    warning_message = (
        "RANSAC found a valid consensus set but exited "
        "early due to skipping more iterations than "
        "`max_skips`. See estimator attributes for "
        "diagnostics."
    )
    with pytest.warns(ConvergenceWarning, match=warning_message):
        ransac_estimator.fit(X, y)
    assert ransac_estimator.n_skips_no_inliers_ == 0
    assert ransac_estimator.n_skips_invalid_data_ == 4
    assert ransac_estimator.n_skips_invalid_model_ == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "sparse_container", COO_CONTAINERS + CSR_CONTAINERS + CSC_CONTAINERS
)
def test_ransac_sparse(sparse_container):
    """Fitting on sparse `X` must exclude exactly the corrupted rows."""
    sparse_features = sparse_container(X)

    ransac = RANSACRegressor(
        LinearRegression(), min_samples=2, residual_threshold=5, random_state=0
    )
    ransac.fit(sparse_features, y)

    expected_mask = np.ones_like(ransac.inlier_mask_, dtype=np.bool_)
    expected_mask[outliers] = False

    assert_array_equal(ransac.inlier_mask_, expected_mask)
|
||||
|
||||
|
||||
def test_ransac_none_estimator():
    """`estimator=None` must predict the same as an explicit LinearRegression."""
    settings = dict(min_samples=2, residual_threshold=5, random_state=0)
    explicit = RANSACRegressor(LinearRegression(), **settings)
    implicit = RANSACRegressor(None, **settings)

    explicit.fit(X, y)
    implicit.fit(X, y)

    assert_array_almost_equal(explicit.predict(X), implicit.predict(X))
|
||||
|
||||
|
||||
def test_ransac_min_n_samples():
    """Check the accepted and rejected forms of `min_samples`."""
    base = LinearRegression()
    shared = dict(residual_threshold=5, random_state=0)

    absolute = RANSACRegressor(base, min_samples=2, **shared)
    # Same subset size, expressed as a fraction of the number of samples.
    relative = RANSACRegressor(base, min_samples=2.0 / X.shape[0], **shared)
    absolute_again = RANSACRegressor(base, min_samples=2, **shared)
    unspecified = RANSACRegressor(base, **shared)
    # More samples requested than are available.
    too_many = RANSACRegressor(base, min_samples=X.shape[0] + 1, **shared)
    # GH #19390
    ridge_without_min_samples = RANSACRegressor(Ridge(), min_samples=None, **shared)

    for candidate in (absolute, relative, absolute_again, unspecified):
        candidate.fit(X, y)

    reference_prediction = absolute.predict(X)
    for candidate in (relative, absolute_again, unspecified):
        assert_array_almost_equal(reference_prediction, candidate.predict(X))

    with pytest.raises(ValueError):
        too_many.fit(X, y)

    with pytest.raises(ValueError, match="`min_samples` needs to be explicitly set"):
        ridge_without_min_samples.fit(X, y)
|
||||
|
||||
|
||||
def test_ransac_multi_dimensional_targets():
    """A three-column target must yield the same inlier mask as expected."""
    ransac = RANSACRegressor(
        LinearRegression(), min_samples=2, residual_threshold=5, random_state=0
    )

    # 3-D target values: the 1-D target repeated in three columns.
    stacked_target = np.column_stack([y, y, y])

    # Estimate parameters of corrupted data
    ransac.fit(X, stacked_target)

    # Ground truth / reference inlier mask
    expected_mask = np.ones_like(ransac.inlier_mask_, dtype=np.bool_)
    expected_mask[outliers] = False

    assert_array_equal(ransac.inlier_mask_, expected_mask)
|
||||
|
||||
|
||||
def test_ransac_residual_loss():
    """Check that alternative residual losses give the same predictions.

    A RANSAC model fitted without an explicit ``loss`` is the baseline. It is
    compared against models fitted with callable losses (on a three-column
    target and on a one-dimensional target) and against a model fitted with
    the ``"squared_error"`` string loss.
    """

    def loss_multi1(y_true, y_pred):
        # Per-sample sum of absolute errors over the target columns.
        return np.sum(np.abs(y_true - y_pred), axis=1)

    def loss_multi2(y_true, y_pred):
        # Per-sample sum of squared errors over the target columns.
        return np.sum((y_true - y_pred) ** 2, axis=1)

    def loss_mono(y_true, y_pred):
        # Per-sample absolute error for a one-dimensional target.
        return np.abs(y_true - y_pred)

    yyy = np.column_stack([y, y, y])

    estimator = LinearRegression()
    ransac_estimator0 = RANSACRegressor(
        estimator, min_samples=2, residual_threshold=5, random_state=0
    )
    ransac_estimator1 = RANSACRegressor(
        estimator,
        min_samples=2,
        residual_threshold=5,
        random_state=0,
        loss=loss_multi1,
    )
    ransac_estimator2 = RANSACRegressor(
        estimator,
        min_samples=2,
        residual_threshold=5,
        random_state=0,
        loss=loss_multi2,
    )

    # multi-dimensional
    ransac_estimator0.fit(X, yyy)
    ransac_estimator1.fit(X, yyy)
    ransac_estimator2.fit(X, yyy)
    assert_array_almost_equal(
        ransac_estimator0.predict(X), ransac_estimator1.predict(X)
    )
    assert_array_almost_equal(
        ransac_estimator0.predict(X), ransac_estimator2.predict(X)
    )

    # one-dimensional
    ransac_estimator0.fit(X, y)
    ransac_estimator2.loss = loss_mono
    ransac_estimator2.fit(X, y)
    assert_array_almost_equal(
        ransac_estimator0.predict(X), ransac_estimator2.predict(X)
    )
    ransac_estimator3 = RANSACRegressor(
        estimator,
        min_samples=2,
        residual_threshold=5,
        random_state=0,
        loss="squared_error",
    )
    ransac_estimator3.fit(X, y)
    # Compare the estimator that was just fitted. Checking ransac_estimator2
    # again here would leave the "squared_error" fit unverified.
    assert_array_almost_equal(
        ransac_estimator0.predict(X), ransac_estimator3.predict(X)
    )
|
||||
|
||||
|
||||
def test_ransac_default_residual_threshold():
    """The inlier mask is as expected when ``residual_threshold`` is not passed."""
    model = RANSACRegressor(LinearRegression(), min_samples=2, random_state=0)

    # Estimate parameters of corrupted data
    model.fit(X, y)

    # Reference mask: everything is an inlier except the indices in `outliers`.
    expected_mask = np.ones_like(model.inlier_mask_).astype(np.bool_)
    expected_mask[outliers] = False

    assert_array_equal(model.inlier_mask_, expected_mask)
|
||||
|
||||
|
||||
def test_ransac_dynamic_max_trials():
    """Compare ``_dynamic_max_trials`` with tabulated reference values.

    Numbers hand-calculated and confirmed on page 119 (Table 4.3) in
    Hartley, R.~I. and Zisserman, A., 2004,
    Multiple View Geometry in Computer Vision, Second Edition,
    Cambridge University Press, ISBN: 0521540518
    """
    # Each entry pairs the positional arguments with the expected result.
    # The third argument is min_samples; "e" follows the first two arguments
    # (e.g. 95 out of 100 is listed as e = 5%).
    reference_table = [
        # e = 0%, min_samples = 2
        ((100, 100, 2, 0.99), 1),
        # e = 5%, min_samples = 2
        ((95, 100, 2, 0.99), 2),
        # e = 10%, min_samples = 2
        ((90, 100, 2, 0.99), 3),
        # e = 30%, min_samples = 2
        ((70, 100, 2, 0.99), 7),
        # e = 50%, min_samples = 2
        ((50, 100, 2, 0.99), 17),
        # e = 5%, min_samples = 8
        ((95, 100, 8, 0.99), 5),
        # e = 10%, min_samples = 8
        ((90, 100, 8, 0.99), 9),
        # e = 30%, min_samples = 8
        ((70, 100, 8, 0.99), 78),
        # e = 50%, min_samples = 8
        ((50, 100, 8, 0.99), 1177),
        # 1 out of 100, min_samples = 10, with the last argument at 0 and at 1
        ((1, 100, 10, 0), 0),
        ((1, 100, 10, 1), float("inf")),
    ]

    for call_args, expected in reference_table:
        assert _dynamic_max_trials(*call_args) == expected
|
||||
|
||||
|
||||
def test_ransac_fit_sample_weight():
    """Exercise the ``sample_weight`` argument of ``RANSACRegressor.fit``."""
    n_samples = y.shape[0]
    unit_weights = np.ones(n_samples)

    model = RANSACRegressor(random_state=0)
    model.fit(X, y, sample_weight=unit_weights)
    # sanity check: one mask entry per sample
    assert model.inlier_mask_.shape[0] == n_samples

    # check that mask is correct
    expected_mask = np.ones_like(model.inlier_mask_).astype(np.bool_)
    expected_mask[outliers] = False
    assert_array_equal(model.inlier_mask_, expected_mask)

    # check that fit(X) = fit([X1, X2, X3], sample_weight=[n1, n2, n3]) where
    # X = X1 repeated n1 times, X2 repeated n2 times and so forth.
    # The draws below consume the generator in a fixed order; keep it.
    rng = check_random_state(0)
    X_small = rng.randint(0, 200, [10, 1])
    y_small = (0.2 * X_small + 2).flatten()
    counts = rng.randint(0, 10, 10)
    outlier_X = rng.randint(0, 1000, [1, 1])
    outlier_count = rng.randint(0, 10, 1)
    outlier_y = rng.randint(-1000, 0, 1)

    X_repeated = np.append(
        np.repeat(X_small, counts, axis=0),
        np.repeat(outlier_X, outlier_count, axis=0),
        axis=0,
    )
    y_repeated = np.append(
        np.repeat(y_small, counts, axis=0),
        np.repeat(outlier_y, outlier_count, axis=0),
        axis=0,
    ).flatten()
    model.fit(X_repeated, y_repeated)
    coef_from_repeats = model.estimator_.coef_

    model.fit(
        np.append(X_small, outlier_X, axis=0),
        np.append(y_small, outlier_y),
        sample_weight=np.append(counts, outlier_count),
    )
    assert_allclose(model.estimator_.coef_, coef_from_repeats)

    # check that if estimator.fit doesn't support
    # sample_weight, raises error
    unsupported = OrthogonalMatchingPursuit()
    model = RANSACRegressor(unsupported, min_samples=10)

    expected_msg = f"{unsupported.__class__.__name__} does not support sample_weight."
    with pytest.raises(ValueError, match=expected_msg):
        model.fit(X, y, sample_weight=unit_weights)
|
||||
|
||||
|
||||
def test_ransac_final_model_fit_sample_weight():
    """The final RANSAC estimator equals a weighted fit on the inliers alone."""
    X, y = make_regression(n_samples=1000, random_state=10)

    # Integer weights drawn from randint(1, 4), normalised to sum to one.
    raw_weights = check_random_state(42).randint(1, 4, size=y.shape[0])
    sample_weight = raw_weights / raw_weights.sum()

    ransac = RANSACRegressor(random_state=0)
    ransac.fit(X, y, sample_weight=sample_weight)

    # Refit a plain linear model on the samples RANSAC kept as inliers.
    inliers = ransac.inlier_mask_
    reference = LinearRegression()
    reference.fit(X[inliers], y[inliers], sample_weight=sample_weight[inliers])

    assert_allclose(ransac.estimator_.coef_, reference.coef_, atol=1e-12)
|
||||
|
||||
|
||||
def test_perfect_horizontal_line():
    """Check that we can fit a line where all samples are inliers.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19497
    """
    n_points = 100
    X = np.arange(n_points)[:, None]
    y = np.zeros((n_points,))

    model = RANSACRegressor(LinearRegression(), random_state=0)
    model.fit(X, y)
    fitted = model.estimator_

    # A constant-zero target must give a zero slope and a zero intercept.
    assert_allclose(fitted.coef_, 0.0)
    assert_allclose(fitted.intercept_, 0.0)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,947 @@
|
||||
# Authors: Danny Sullivan <dbsullivan23@gmail.com>
|
||||
# Tom Dupre la Tour <tom.dupre-la-tour@m4x.org>
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
import math
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy.special import logsumexp
|
||||
|
||||
from sklearn._loss.loss import HalfMultinomialLoss
|
||||
from sklearn.base import clone
|
||||
from sklearn.datasets import load_iris, make_blobs, make_classification
|
||||
from sklearn.linear_model import LogisticRegression, Ridge
|
||||
from sklearn.linear_model._base import make_dataset
|
||||
from sklearn.linear_model._linear_loss import LinearModelLoss
|
||||
from sklearn.linear_model._sag import get_auto_step_size
|
||||
from sklearn.linear_model._sag_fast import _multinomial_grad_loss_all_samples
|
||||
from sklearn.multiclass import OneVsRestClassifier
|
||||
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
|
||||
from sklearn.utils import check_random_state, compute_class_weight
|
||||
from sklearn.utils._testing import (
|
||||
assert_allclose,
|
||||
assert_almost_equal,
|
||||
assert_array_almost_equal,
|
||||
)
|
||||
from sklearn.utils.extmath import row_norms
|
||||
from sklearn.utils.fixes import CSR_CONTAINERS
|
||||
|
||||
iris = load_iris()
|
||||
|
||||
|
||||
# this is used for sag classification
|
||||
def log_dloss(p, y):
    """Return the derivative of the logistic loss at prediction ``p``, label ``y``.

    The general formula is ``-y / (exp(p * y) + 1)``; the two tails are
    handled separately once ``|p * y|`` exceeds 18.
    """
    margin = p * y
    if margin > 18.0:
        # approximately equal and saves the computation of the log
        return math.exp(-margin) * -y
    elif margin < -18.0:
        return -y
    else:
        return -y / (math.exp(margin) + 1.0)
|
||||
|
||||
|
||||
def log_loss(p, y):
    """Return the mean of ``log(1 + exp(-y * p))`` over the samples."""
    negative_margin = -y * p
    per_sample = np.log(1.0 + np.exp(negative_margin))
    return np.mean(per_sample)
|
||||
|
||||
|
||||
# this is used for sag regression
|
||||
def squared_dloss(p, y):
    """Return ``p - y``, the derivative of ``0.5 * (p - y) ** 2`` w.r.t. ``p``."""
    residual = p - y
    return residual
|
||||
|
||||
|
||||
def squared_loss(p, y):
    """Return the mean of ``0.5 * (p - y) ** 2`` over the samples."""
    residual = p - y
    return np.mean(0.5 * residual * residual)
|
||||
|
||||
|
||||
# function for measuring the penalized objective: loss plus alpha * ||w||^2 / 2
|
||||
def get_pobj(w, alpha, myX, myy, loss):
    """Return ``loss(myX @ w, myy)`` plus the penalty ``alpha * (w . w) / 2``.

    ``w`` is flattened first, so any array shape with the right number of
    entries is accepted.
    """
    flat_w = w.ravel()
    data_term = loss(np.dot(myX, flat_w), myy)
    penalty = alpha * flat_w.dot(flat_w) / 2.0
    return data_term + penalty
|
||||
|
||||
|
||||
def sag(
    X,
    y,
    step_size,
    alpha,
    n_iter=1,
    dloss=None,
    sparse=False,
    sample_weight=None,
    fit_intercept=True,
    saga=False,
):
    """Dense reference implementation of the SAG / SAGA loop used by the tests.

    Runs ``n_iter * n_samples`` iterations. Each one draws a sample index from
    a ``RandomState(77)`` stream, stores that sample's latest gradient
    (including the ``alpha * weights`` term) and steps along the average of
    the stored gradients over the samples seen so far.

    Parameters
    ----------
    X, y : indexable data and targets; ``X[idx]`` must be a row vector.
    step_size : step applied to the averaged gradient.
    alpha : coefficient of the ``alpha * weights`` term added to each gradient.
    n_iter : number of passes over ``n_samples`` draws.
    dloss : callable ``dloss(prediction, target)`` returning the loss derivative.
    sparse : when true, the intercept step is scaled by 0.01 instead of 1.
    sample_weight : optional per-sample multipliers for the gradient.
    fit_intercept : whether the intercept is updated.
    saga : when true, apply the additional SAGA correction step.

    Returns
    -------
    (weights, intercept)
    """
    n_samples, n_features = X.shape[0], X.shape[1]

    coef = np.zeros(n_features)
    grad_sum = np.zeros(n_features)
    grad_table = np.zeros((n_samples, n_features))

    bias = 0.0
    bias_grad_sum = 0.0
    bias_grad_table = np.zeros(n_samples)

    sampler = np.random.RandomState(77)
    visited = set()

    # sparse data has a fixed decay of .01
    decay = 0.01 if sparse else 1.0

    for _ in range(n_iter * n_samples):
        idx = int(sampler.rand() * n_samples)
        row = X[idx]
        visited.add(idx)
        n_seen = len(visited)

        grad = dloss(np.dot(row, coef) + bias, y[idx])
        if sample_weight is not None:
            grad *= sample_weight[idx]

        # Replace this sample's stored gradient and keep the running sum current.
        new_entry = row * grad + alpha * coef
        delta = new_entry - grad_table[idx]
        grad_sum += delta
        grad_table[idx] = new_entry
        if saga:
            coef -= delta * step_size * (1 - 1.0 / n_seen)

        if fit_intercept:
            bias_delta = grad - bias_grad_table[idx]
            bias_grad_table[idx] = grad
            bias_grad_sum += bias_delta
            averaged_step = step_size * bias_grad_sum / n_seen * decay
            if saga:
                bias -= averaged_step + bias_delta * (
                    step_size * (1.0 - 1.0 / n_seen)
                )
            else:
                bias -= averaged_step

        coef -= step_size * grad_sum / n_seen

    return coef, bias
|
||||
|
||||
|
||||
def sag_sparse(
    X,
    y,
    step_size,
    alpha,
    n_iter=1,
    dloss=None,
    sample_weight=None,
    sparse=False,
    fit_intercept=True,
    saga=False,
    random_state=0,
):
    """Reference SAG / SAGA loop that keeps the weights in scaled form.

    The weights are stored up to a scalar factor ``wscale`` and the averaged
    gradient step is deferred: cumulative step sizes are accumulated in
    ``c_sum`` and applied to each weight at the start of the next iteration
    (and once more after the last one).

    Parameters
    ----------
    X, y : indexable data and targets; ``X[idx]`` must be a row vector.
    step_size : base step size.
    alpha : enters only through ``wscale *= 1.0 - alpha * step_size``.
    n_iter : number of passes over ``n_samples`` draws.
    dloss : callable ``dloss(prediction, target)`` returning the loss derivative.
    sample_weight : optional per-sample multipliers for the gradient.
    sparse : when true, the intercept step is scaled by 0.01 instead of 1.
    fit_intercept : whether the intercept is updated.
    saga : when true, apply the additional SAGA correction step.
    random_state : passed to ``check_random_state`` to draw sample indices.

    Returns
    -------
    (weights, intercept)

    Raises
    ------
    ZeroDivisionError
        If ``step_size * alpha == 1.0`` (``wscale`` would become zero).
    """
    if step_size * alpha == 1.0:
        raise ZeroDivisionError(
            "Sparse sag does not handle the case step_size * alpha == 1"
        )
    n_samples, n_features = X.shape[0], X.shape[1]

    weights = np.zeros(n_features)
    sum_gradient = np.zeros(n_features)
    # Iteration counter at which each feature was last brought up to date.
    last_updated = np.zeros(n_features, dtype=int)
    # Last scalar gradient stored per sample (not the full gradient vector).
    gradient_memory = np.zeros(n_samples)
    rng = check_random_state(random_state)
    intercept = 0.0
    intercept_sum_gradient = 0.0
    # Scalar factor: the prediction uses wscale * dot(entry, weights).
    wscale = 1.0
    decay = 1.0
    seen = set()

    # c_sum[t] accumulates step_size / (wscale * len(seen)) up to iteration t.
    c_sum = np.zeros(n_iter * n_samples)

    # sparse data has a fixed decay of .01
    if sparse:
        decay = 0.01

    counter = 0
    for epoch in range(n_iter):
        for k in range(n_samples):
            # idx = k
            idx = int(rng.rand() * n_samples)
            entry = X[idx]
            seen.add(idx)

            # Apply to every weight the steps accumulated since it was last
            # brought up to date.
            if counter >= 1:
                for j in range(n_features):
                    if last_updated[j] == 0:
                        weights[j] -= c_sum[counter - 1] * sum_gradient[j]
                    else:
                        weights[j] -= (
                            c_sum[counter - 1] - c_sum[last_updated[j] - 1]
                        ) * sum_gradient[j]
                    last_updated[j] = counter

            p = (wscale * np.dot(entry, weights)) + intercept
            gradient = dloss(p, y[idx])

            if sample_weight is not None:
                gradient *= sample_weight[idx]

            # NOTE(review): there is no explicit alpha term here; it appears
            # to be accounted for through the wscale update below - confirm.
            update = entry * gradient
            gradient_correction = update - (gradient_memory[idx] * entry)
            sum_gradient += gradient_correction
            if saga:
                for j in range(n_features):
                    weights[j] -= (
                        gradient_correction[j]
                        * step_size
                        * (1 - 1.0 / len(seen))
                        / wscale
                    )

            if fit_intercept:
                gradient_correction = gradient - gradient_memory[idx]
                intercept_sum_gradient += gradient_correction
                gradient_correction *= step_size * (1.0 - 1.0 / len(seen))
                if saga:
                    intercept -= (
                        step_size * intercept_sum_gradient / len(seen) * decay
                    ) + gradient_correction
                else:
                    intercept -= step_size * intercept_sum_gradient / len(seen) * decay

            # Must happen after the intercept block, which still reads the
            # previous stored gradient for this sample.
            gradient_memory[idx] = gradient

            wscale *= 1.0 - alpha * step_size
            if counter == 0:
                c_sum[0] = step_size / (wscale * len(seen))
            else:
                c_sum[counter] = c_sum[counter - 1] + step_size / (wscale * len(seen))

            # When the scale factor gets tiny, flush the pending steps, fold
            # the scale into the weights and restart from wscale = 1.
            if counter >= 1 and wscale < 1e-9:
                for j in range(n_features):
                    if last_updated[j] == 0:
                        weights[j] -= c_sum[counter] * sum_gradient[j]
                    else:
                        weights[j] -= (
                            c_sum[counter] - c_sum[last_updated[j] - 1]
                        ) * sum_gradient[j]
                    last_updated[j] = counter + 1
                c_sum[counter] = 0
                weights *= wscale
                wscale = 1.0

            counter += 1

    # Final flush of the steps still pending after the last iteration.
    for j in range(n_features):
        if last_updated[j] == 0:
            weights[j] -= c_sum[counter - 1] * sum_gradient[j]
        else:
            weights[j] -= (
                c_sum[counter - 1] - c_sum[last_updated[j] - 1]
            ) * sum_gradient[j]
    weights *= wscale
    return weights, intercept
|
||||
|
||||
|
||||
def get_step_size(X, alpha, fit_intercept, classification=True):
    """Return the step size derived from the largest squared row norm of ``X``.

    Classification uses ``4 / (max_norm + fit_intercept + 4 * alpha)`` and
    regression uses ``1 / (max_norm + fit_intercept + alpha)``.
    """
    max_squared_norm = np.max(np.sum(X * X, axis=1))
    if not classification:
        return 1.0 / (max_squared_norm + fit_intercept + alpha)
    return 4.0 / (max_squared_norm + fit_intercept + 4.0 * alpha)
|
||||
|
||||
|
||||
def test_classifier_matching():
    """LogisticRegression with sag / saga matches both reference loops."""
    n_samples = 20
    alpha = 1.1
    fit_intercept = True

    X, y = make_blobs(n_samples=n_samples, centers=2, random_state=0, cluster_std=0.1)
    y[y == 0] = -1
    step_size = get_step_size(X, alpha, fit_intercept)

    # SAGA variance w.r.t. stream order is higher, hence the larger budget.
    iterations_per_solver = {"sag": 80, "saga": 300}
    for solver, n_iter in iterations_per_solver.items():
        clf = LogisticRegression(
            solver=solver,
            fit_intercept=fit_intercept,
            tol=1e-11,
            C=1.0 / alpha / n_samples,
            max_iter=n_iter,
            random_state=10,
        )
        clf.fit(X, y)

        reference_kwargs = dict(
            n_iter=n_iter,
            dloss=log_dloss,
            fit_intercept=fit_intercept,
            saga=solver == "saga",
        )
        scaled_result = sag_sparse(X, y, step_size, alpha, **reference_kwargs)
        dense_result = sag(X, y, step_size, alpha, **reference_kwargs)

        for ref_coef, ref_intercept in (scaled_result, dense_result):
            assert_array_almost_equal(np.atleast_2d(ref_coef), clf.coef_, decimal=9)
            assert_array_almost_equal(
                np.atleast_1d(ref_intercept), clf.intercept_, decimal=9
            )
|
||||
|
||||
|
||||
def test_regressor_matching():
    """Ridge with the sag solver matches both reference loops."""
    n_samples, n_features = 10, 5
    alpha = 1.0
    n_iter = 100
    fit_intercept = True

    rng = np.random.RandomState(10)
    X = rng.normal(size=(n_samples, n_features))
    true_w = rng.normal(size=n_features)
    y = X.dot(true_w)

    step_size = get_step_size(X, alpha, fit_intercept, classification=False)
    clf = Ridge(
        fit_intercept=fit_intercept,
        tol=0.00000000001,
        solver="sag",
        alpha=alpha * n_samples,
        max_iter=n_iter,
    )
    clf.fit(X, y)

    reference_kwargs = dict(
        n_iter=n_iter, dloss=squared_dloss, fit_intercept=fit_intercept
    )
    # Run both references before asserting anything.
    reference_results = [
        reference(X, y, step_size, alpha, **reference_kwargs)
        for reference in (sag_sparse, sag)
    ]

    for ref_coef, ref_intercept in reference_results:
        assert_allclose(ref_coef, clf.coef_)
        assert_allclose(ref_intercept, clf.intercept_)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:The max_iter was reached")
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sag_pobj_matches_logistic_regression(csr_container):
    """tests if the sag pobj matches log reg"""
    n_samples = 100
    alpha = 1.0
    max_iter = 20
    X, y = make_blobs(n_samples=n_samples, centers=2, random_state=0, cluster_std=0.1)

    shared_params = dict(
        fit_intercept=False,
        tol=0.0000001,
        C=1.0 / alpha / n_samples,
        max_iter=max_iter,
        random_state=10,
    )
    sag_dense = LogisticRegression(solver="sag", **shared_params)
    sag_csr = clone(sag_dense)
    # Same settings, but with the estimator's default solver.
    default_solver = LogisticRegression(**shared_params)

    sag_dense.fit(X, y)
    sag_csr.fit(csr_container(X), y)
    default_solver.fit(X, y)

    pobj_dense, pobj_csr, pobj_default = [
        get_pobj(model.coef_, alpha, X, y, log_loss)
        for model in (sag_dense, sag_csr, default_solver)
    ]

    assert_array_almost_equal(pobj_dense, pobj_csr, decimal=4)
    assert_array_almost_equal(pobj_csr, pobj_default, decimal=4)
    assert_array_almost_equal(pobj_default, pobj_dense, decimal=4)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:The max_iter was reached")
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sag_pobj_matches_ridge_regression(csr_container):
    """tests if the sag pobj matches ridge reg"""
    n_samples, n_features = 100, 10
    alpha = 1.0
    n_iter = 100
    fit_intercept = False

    rng = np.random.RandomState(10)
    X = rng.normal(size=(n_samples, n_features))
    true_w = rng.normal(size=n_features)
    y = X.dot(true_w)

    shared_params = dict(
        fit_intercept=fit_intercept,
        alpha=alpha,
        max_iter=n_iter,
        random_state=42,
    )
    sag_dense = Ridge(tol=0.00000000001, solver="sag", **shared_params)
    sag_csr = clone(sag_dense)
    lsqr = Ridge(tol=0.00001, solver="lsqr", **shared_params)

    sag_dense.fit(X, y)
    sag_csr.fit(csr_container(X), y)
    lsqr.fit(X, y)

    pobj_dense, pobj_csr, pobj_lsqr = [
        get_pobj(model.coef_, alpha, X, y, squared_loss)
        for model in (sag_dense, sag_csr, lsqr)
    ]

    assert_array_almost_equal(pobj_dense, pobj_csr, decimal=4)
    assert_array_almost_equal(pobj_dense, pobj_lsqr, decimal=4)
    assert_array_almost_equal(pobj_lsqr, pobj_csr, decimal=4)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:The max_iter was reached")
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sag_regressor_computed_correctly(csr_container):
    """tests if the sag regressor is computed correctly"""
    alpha = 0.1
    n_features = 10
    n_samples = 40
    max_iter = 100
    tol = 0.000001
    fit_intercept = True

    rng = np.random.RandomState(0)
    X = rng.normal(size=(n_samples, n_features))
    w = rng.normal(size=n_features)
    y = np.dot(X, w) + 2.0
    step_size = get_step_size(X, alpha, fit_intercept, classification=False)

    dense_clf = Ridge(
        fit_intercept=fit_intercept,
        tol=tol,
        solver="sag",
        alpha=alpha * n_samples,
        max_iter=max_iter,
        random_state=rng,
    )
    csr_clf = clone(dense_clf)

    dense_clf.fit(X, y)
    csr_clf.fit(csr_container(X), y)

    reference_kwargs = dict(
        n_iter=max_iter,
        dloss=squared_dloss,
        fit_intercept=fit_intercept,
        random_state=rng,
    )
    ref_coef, ref_intercept = sag_sparse(X, y, step_size, alpha, **reference_kwargs)

    # Reference for the sparse comparison, which is still disabled below.
    ref_coef_sparse, ref_intercept_sparse = sag_sparse(
        X, y, step_size, alpha, sparse=True, **reference_kwargs
    )

    assert_array_almost_equal(dense_clf.coef_.ravel(), ref_coef.ravel(), decimal=3)
    assert_almost_equal(dense_clf.intercept_, ref_intercept, decimal=1)

    # TODO: uncomment when sparse Ridge with intercept will be fixed (#4710)
    # assert_array_almost_equal(csr_clf.coef_.ravel(),
    #                          ref_coef_sparse.ravel(),
    #                          decimal=3)
    # assert_almost_equal(csr_clf.intercept_, ref_intercept_sparse, decimal=1)
|
||||
|
||||
|
||||
def test_get_auto_step_size():
    """``get_auto_step_size`` agrees with step sizes recomputed by hand."""
    X = np.array([[1, 2, 3], [2, 3, 4], [2, 3, 2]], dtype=np.float64)
    alpha = 1.2
    n_samples = X.shape[0]

    # sum the squares of the second sample because that's the largest
    max_squared_sum = 4 + 9 + 16
    max_squared_sum_ = row_norms(X, squared=True).max()
    assert_almost_equal(max_squared_sum, max_squared_sum_, decimal=4)

    for saga in [True, False]:
        for fit_intercept in (True, False):
            intercept_term = int(fit_intercept)
            if not saga:
                step_size_sqr = 1.0 / (max_squared_sum + alpha + intercept_term)
                step_size_log = 4.0 / (
                    max_squared_sum + 4.0 * alpha + intercept_term
                )
            else:
                L_sqr = max_squared_sum + alpha + intercept_term
                L_log = (max_squared_sum + 4.0 * alpha + intercept_term) / 4.0
                mun_sqr = min(2 * n_samples * alpha, L_sqr)
                mun_log = min(2 * n_samples * alpha, L_log)
                step_size_sqr = 1 / (2 * L_sqr + mun_sqr)
                step_size_log = 1 / (2 * L_log + mun_log)

            # Query both losses before asserting anything.
            computed = {
                loss_name: get_auto_step_size(
                    max_squared_sum_,
                    alpha,
                    loss_name,
                    fit_intercept,
                    n_samples=n_samples,
                    is_saga=saga,
                )
                for loss_name in ("squared", "log")
            }

            assert_almost_equal(step_size_sqr, computed["squared"], decimal=4)
            assert_almost_equal(step_size_log, computed["log"], decimal=4)

    # `fit_intercept` still holds the last value taken in the loops above.
    msg = "Unknown loss function for SAG solver, got wrong instead of"
    with pytest.raises(ValueError, match=msg):
        get_auto_step_size(max_squared_sum_, alpha, "wrong", fit_intercept)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("seed", range(3))  # locally tested with 1000 seeds
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sag_regressor(seed, csr_container):
    """tests if the sag regressor performs well

    Fits Ridge with the sag solver on dense and CSR input, first on a
    noiseless linear target and then on the same target with added Gaussian
    noise, and checks that the scores stay above fixed thresholds.
    """
    xmin, xmax = -5, 5
    n_samples = 300
    tol = 0.001
    max_iter = 100
    alpha = 0.1
    rng = np.random.RandomState(seed)
    X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1)

    # simple linear function without noise
    y = 0.5 * X.ravel()

    clf1 = Ridge(
        tol=tol,
        solver="sag",
        max_iter=max_iter,
        alpha=alpha * n_samples,
        random_state=rng,
    )
    clf2 = clone(clf1)
    clf1.fit(X, y)
    clf2.fit(csr_container(X), y)
    score1 = clf1.score(X, y)
    score2 = clf2.score(X, y)
    assert score1 > 0.98
    assert score2 > 0.98

    # simple linear function with noise
    y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel()

    # Seed the solver here as well, like the noiseless case above, so that
    # each parametrized `seed` gives a reproducible run.
    clf1 = Ridge(
        tol=tol,
        solver="sag",
        max_iter=max_iter,
        alpha=alpha * n_samples,
        random_state=rng,
    )
    clf2 = clone(clf1)
    clf1.fit(X, y)
    clf2.fit(csr_container(X), y)
    score1 = clf1.score(X, y)
    score2 = clf2.score(X, y)
    assert score1 > 0.45
    assert score2 > 0.45
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:The max_iter was reached")
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sag_classifier_computed_correctly(csr_container):
    """tests if the binary classifier is computed correctly"""
    alpha = 0.1
    n_samples = 50
    n_iter = 50
    tol = 0.00001
    fit_intercept = True

    X, y = make_blobs(n_samples=n_samples, centers=2, random_state=0, cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)

    # Recode the labels: +1 for the second unique class, -1 for everything else.
    positive_label = np.unique(y)[1]
    y = np.where(y != positive_label, -1.0, 1.0)

    dense_clf = LogisticRegression(
        solver="sag",
        C=1.0 / alpha / n_samples,
        max_iter=n_iter,
        tol=tol,
        random_state=77,
        fit_intercept=fit_intercept,
    )
    csr_clf = clone(dense_clf)

    dense_clf.fit(X, y)
    csr_clf.fit(csr_container(X), y)

    reference_kwargs = dict(
        n_iter=n_iter, dloss=log_dloss, fit_intercept=fit_intercept
    )
    ref_coef, ref_intercept = sag_sparse(X, y, step_size, alpha, **reference_kwargs)
    ref_coef_sparse, ref_intercept_sparse = sag_sparse(
        X, y, step_size, alpha, sparse=True, **reference_kwargs
    )

    assert_array_almost_equal(dense_clf.coef_.ravel(), ref_coef.ravel(), decimal=2)
    assert_almost_equal(dense_clf.intercept_, ref_intercept, decimal=1)

    assert_array_almost_equal(
        csr_clf.coef_.ravel(), ref_coef_sparse.ravel(), decimal=2
    )
    assert_almost_equal(csr_clf.intercept_, ref_intercept_sparse, decimal=1)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:The max_iter was reached")
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sag_multiclass_computed_correctly(csr_container):
    """tests if the multiclass classifier is computed correctly"""
    alpha = 0.1
    n_samples = 20
    tol = 1e-5
    max_iter = 70
    fit_intercept = True

    X, y = make_blobs(n_samples=n_samples, centers=3, random_state=0, cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
    classes = np.unique(y)

    dense_ovr = OneVsRestClassifier(
        LogisticRegression(
            solver="sag",
            C=1.0 / alpha / n_samples,
            max_iter=max_iter,
            tol=tol,
            random_state=77,
            fit_intercept=fit_intercept,
        )
    )
    csr_ovr = clone(dense_ovr)

    dense_ovr.fit(X, y)
    csr_ovr.fit(csr_container(X), y)

    reference_kwargs = dict(
        dloss=log_dloss, n_iter=max_iter, fit_intercept=fit_intercept
    )
    coef_rows, intercepts = [], []
    coef_rows_sparse, intercepts_sparse = [], []
    for cl in classes:
        # One-vs-rest target: +1 for the current class, -1 for the others.
        y_encoded = np.where(y != cl, -1.0, 1.0)

        ref_coef, ref_intercept = sag_sparse(
            X, y_encoded, step_size, alpha, **reference_kwargs
        )
        ref_coef_sparse, ref_intercept_sparse = sag_sparse(
            X, y_encoded, step_size, alpha, sparse=True, **reference_kwargs
        )

        coef_rows.append(ref_coef)
        intercepts.append(ref_intercept)
        coef_rows_sparse.append(ref_coef_sparse)
        intercepts_sparse.append(ref_intercept_sparse)

    coef1 = np.vstack(coef_rows)
    intercept1 = np.array(intercepts)
    coef2 = np.vstack(coef_rows_sparse)
    intercept2 = np.array(intercepts_sparse)

    for i, _ in enumerate(classes):
        dense_est = dense_ovr.estimators_[i]
        assert_allclose(dense_est.coef_.ravel(), coef1[i], rtol=1e-2)
        assert_allclose(dense_est.intercept_, intercept1[i], rtol=1e-1)

        csr_est = csr_ovr.estimators_[i]
        assert_allclose(csr_est.coef_.ravel(), coef2[i], rtol=1e-2)
        # Note the very crude accuracy, i.e. high rtol.
        assert_allclose(csr_est.intercept_, intercept2[i], rtol=5e-1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_classifier_results(csr_container):
    """tests if classifier results match target"""
    alpha = 0.1
    n_features = 20
    n_samples = 10
    tol = 0.01
    max_iter = 200

    rng = np.random.RandomState(0)
    X = rng.normal(size=(n_samples, n_features))
    w = rng.normal(size=n_features)
    # Labels are the sign of a random linear function of X.
    y = np.sign(np.dot(X, w))

    dense_clf = LogisticRegression(
        solver="sag",
        C=1.0 / alpha / n_samples,
        max_iter=max_iter,
        tol=tol,
        random_state=77,
    )
    csr_clf = clone(dense_clf)

    dense_clf.fit(X, y)
    csr_clf.fit(csr_container(X), y)

    # Predict with both models before asserting anything.
    predictions = [fitted.predict(X) for fitted in (dense_clf, csr_clf)]
    for predicted in predictions:
        assert_almost_equal(predicted, y, decimal=12)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:The max_iter was reached")
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_binary_classifier_class_weight(csr_container):
    """tests binary classifier with classweights for each class"""
    alpha = 0.1
    n_samples = 50
    n_iter = 20
    tol = 0.00001
    fit_intercept = True

    X, y = make_blobs(n_samples=n_samples, centers=2, random_state=10, cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)

    # Recode the labels: +1 for the second unique class, -1 for everything else.
    positive_label = np.unique(y)[1]
    y = np.where(y != positive_label, -1.0, 1.0)

    class_weight = {1: 0.45, -1: 0.55}
    dense_clf = LogisticRegression(
        solver="sag",
        C=1.0 / alpha / n_samples,
        max_iter=n_iter,
        tol=tol,
        random_state=77,
        fit_intercept=fit_intercept,
        class_weight=class_weight,
    )
    csr_clf = clone(dense_clf)

    dense_clf.fit(X, y)
    csr_clf.fit(csr_container(X), y)

    # Expand the per-class weights into per-sample weights for the reference.
    encoder = LabelEncoder()
    per_class = compute_class_weight(class_weight, classes=np.unique(y), y=y)
    sample_weight = per_class[encoder.fit_transform(y)]

    reference_kwargs = dict(
        n_iter=n_iter,
        dloss=log_dloss,
        sample_weight=sample_weight,
        fit_intercept=fit_intercept,
    )
    ref_coef, ref_intercept = sag_sparse(X, y, step_size, alpha, **reference_kwargs)
    ref_coef_sparse, ref_intercept_sparse = sag_sparse(
        X, y, step_size, alpha, sparse=True, **reference_kwargs
    )

    assert_array_almost_equal(dense_clf.coef_.ravel(), ref_coef.ravel(), decimal=2)
    assert_almost_equal(dense_clf.intercept_, ref_intercept, decimal=1)

    assert_array_almost_equal(
        csr_clf.coef_.ravel(), ref_coef_sparse.ravel(), decimal=2
    )
    assert_almost_equal(csr_clf.intercept_, ref_intercept_sparse, decimal=1)
|
||||
|
||||
|
||||
def test_classifier_single_class():
    """Fitting the SAG solver on single-class targets must raise ValueError."""
    features = [[1, 2], [3, 4]]
    labels = [1, 1]  # only one distinct class

    expected_message = "This solver needs samples of at least 2 classes in the data"
    with pytest.raises(ValueError, match=expected_message):
        LogisticRegression(solver="sag").fit(features, labels)
|
||||
|
||||
|
||||
def test_step_size_alpha_error():
    """Both SAG-based estimators must raise when step_size * alpha_scaled == 1."""
    X = [[0, 0], [0, 0]]
    y = [1, -1]
    fit_intercept = False
    alpha = 1.0
    expected_message = re.escape(
        "Current sag implementation does not handle the case"
        " step_size * alpha_scaled == 1"
    )

    # The classifier is checked first, then the regressor, as two separate fits.
    estimators = (
        LogisticRegression(solver="sag", C=1.0 / alpha, fit_intercept=fit_intercept),
        Ridge(fit_intercept=fit_intercept, solver="sag", alpha=alpha),
    )
    for estimator in estimators:
        with pytest.raises(ZeroDivisionError, match=expected_message):
            estimator.fit(X, y)
|
||||
|
||||
|
||||
def test_multinomial_loss():
    """The SAG multinomial loss/gradient must agree with LinearModelLoss."""
    X = iris.data
    y = iris.target.astype(np.float64)
    n_samples, n_features = X.shape
    n_classes = len(np.unique(y))

    # Random parameters and non-negative sample weights (draw order is fixed).
    rng = check_random_state(42)
    weights = rng.randn(n_features, n_classes)
    intercept = rng.randn(n_classes)
    sample_weights = np.abs(rng.randn(n_samples))

    # Loss and gradient the way multinomial SAG computes them.
    dataset, _ = make_dataset(X, y, sample_weights, random_state=42)
    sag_loss, sag_grad = _multinomial_grad_loss_all_samples(
        dataset, weights, intercept, n_samples, n_features, n_classes
    )

    # Loss and gradient the way multinomial LogisticRegression computes them.
    linear_loss = LinearModelLoss(
        base_loss=HalfMultinomialLoss(n_classes=n_classes),
        fit_intercept=True,
    )
    packed_params = np.vstack((weights, intercept)).T
    lm_loss, lm_grad = linear_loss.loss_gradient(
        packed_params, X, y, l2_reg_strength=0.0, sample_weight=sample_weights
    )
    # Drop the intercept column and transpose to the (features, classes) layout.
    lm_grad = lm_grad[:, :-1].T
    # LinearModelLoss uses average(loss, weight=sw); rescale to the same
    # convention as the SAG helper.
    lm_loss *= np.sum(sample_weights)
    lm_grad *= np.sum(sample_weights)

    assert_array_almost_equal(sag_grad, lm_grad)
    assert_almost_equal(sag_loss, lm_loss)
|
||||
|
||||
|
||||
def test_multinomial_loss_ground_truth():
    """Compare a hand-computed multinomial loss with LinearModelLoss and constants.

    Problem size: n_samples, n_features, n_classes = 4, 2, 3.
    """
    n_classes = 3
    X = np.array([[1.1, 2.2], [2.2, -4.4], [3.3, -2.2], [1.1, 1.1]])
    y = np.array([0, 1, 2, 0], dtype=np.float64)
    Y_bin = LabelBinarizer().fit_transform(y)

    weights = np.array([[0.1, 0.2, 0.3], [1.1, 1.2, -1.3]])
    intercept = np.array([1.0, 0, -0.2])
    sample_weights = np.array([0.8, 1, 1, 0.8])
    sw_column = sample_weights[:, np.newaxis]

    # Manual computation: log-softmax of the raw predictions.
    raw_scores = np.dot(X, weights) + intercept
    log_norm = logsumexp(raw_scores, axis=1)
    log_proba = raw_scores - log_norm[:, np.newaxis]
    manual_loss = -(sw_column * log_proba * Y_bin).sum()
    residual = sw_column * (np.exp(log_proba) - Y_bin)
    manual_grad = np.dot(X.T, residual)

    # Same quantities through LinearModelLoss.
    linear_loss = LinearModelLoss(
        base_loss=HalfMultinomialLoss(n_classes=n_classes),
        fit_intercept=True,
    )
    packed_params = np.vstack((weights, intercept)).T
    lm_loss, lm_grad = linear_loss.loss_gradient(
        packed_params, X, y, l2_reg_strength=0.0, sample_weight=sample_weights
    )
    lm_grad = lm_grad[:, :-1].T
    # LinearModelLoss uses average(loss, weight=sw); rescale to sums.
    lm_loss *= np.sum(sample_weights)
    lm_grad *= np.sum(sample_weights)

    assert_almost_equal(manual_loss, lm_loss)
    assert_array_almost_equal(manual_grad, lm_grad)

    # Hard-coded ground truth values.
    loss_gt = 11.680360354325961
    grad_gt = np.array(
        [[-0.557487, -1.619151, +2.176638], [-0.903942, +5.258745, -4.354803]]
    )
    assert_almost_equal(manual_loss, loss_gt)
    assert_array_almost_equal(manual_grad, grad_gt)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("solver", ["sag", "saga"])
def test_sag_classifier_raises_error(solver):
    """Non-regression test: numerical errors in cython sag are raised.

    Following #13316 the error handling behavior changed in cython sag; this
    test only makes sure such errors still surface as a ValueError.
    """
    # Fit once on a simple classification problem.
    rng = np.random.RandomState(42)
    features, labels = make_classification(random_state=rng)
    model = LogisticRegression(solver=solver, random_state=rng, warm_start=True)
    model.fit(features, labels)

    # Corrupt the fitted coefficients; with warm_start the next fit starts
    # from this state, which triggers the numerical error.
    model.coef_[:] = np.nan

    with pytest.raises(ValueError, match="Floating-point under-/overflow"):
        model.fit(features, labels)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,384 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
import scipy.sparse as sp
|
||||
from numpy.testing import assert_allclose
|
||||
|
||||
from sklearn.datasets import make_regression
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from sklearn.linear_model import ElasticNet, ElasticNetCV, Lasso, LassoCV
|
||||
from sklearn.utils._testing import (
|
||||
assert_almost_equal,
|
||||
assert_array_almost_equal,
|
||||
create_memmap_backed_data,
|
||||
ignore_warnings,
|
||||
)
|
||||
from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, LIL_CONTAINERS
|
||||
|
||||
|
||||
def test_sparse_coef():
    """The `sparse_coef_` property exposes `coef_` as a sparse matrix."""
    model = ElasticNet()
    model.coef_ = [1, 2, 3]

    sparse_view = model.sparse_coef_
    assert sp.issparse(sparse_view)

    first_row = sparse_view.toarray().tolist()[0]
    assert first_row == model.coef_
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_lasso_zero(csc_container):
    """Sparse Lasso must handle all-zero data without crashing."""
    empty_X = csc_container((3, 1))  # 3x1 sparse container with no stored values
    zero_y = [0, 0, 0]
    queries = np.array([[1], [2], [3]])

    model = Lasso().fit(empty_X, zero_y)
    predictions = model.predict(queries)

    assert_array_almost_equal(model.coef_, [0])
    assert_array_almost_equal(predictions, [0, 0, 0])
    assert_almost_equal(model.dual_gap_, 0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("with_sample_weight", [True, False])
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_enet_toy_list_input(with_sample_weight, csc_container):
    """Check ElasticNet on a three-point line for several alpha / l1_ratio values."""
    X = csc_container(np.array([[-1], [0], [1]]))
    Y = [-1, 0, 1]  # just a straight line
    T = np.array([[2], [3], [4]])  # test sample
    sw = np.array([2.0, 2, 2]) if with_sample_weight else None

    # With alpha=0 this should be the same as unregularized least squares.
    # alpha=0 is discouraged (hence the ignored warning) but should work.
    unpenalized = ElasticNet(alpha=0, l1_ratio=1.0)
    ignore_warnings(unpenalized.fit)(X, Y, sample_weight=sw)
    unpenalized_pred = unpenalized.predict(T)
    assert_array_almost_equal(unpenalized.coef_, [1])
    assert_array_almost_equal(unpenalized_pred, [2, 3, 4])
    assert_almost_equal(unpenalized.dual_gap_, 0)

    # (l1_ratio, expected coefficient, expected predictions) for alpha=0.5.
    penalized_cases = (
        (0.3, [0.50819], [1.0163, 1.5245, 2.0327]),
        (0.5, [0.45454], [0.9090, 1.3636, 1.8181]),
    )
    for l1_ratio, expected_coef, expected_pred in penalized_cases:
        model = ElasticNet(alpha=0.5, l1_ratio=l1_ratio)
        model.fit(X, Y, sample_weight=sw)
        model_pred = model.predict(T)
        assert_array_almost_equal(model.coef_, expected_coef, decimal=3)
        assert_array_almost_equal(model_pred, expected_pred, decimal=3)
        assert_almost_equal(model.dual_gap_, 0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("lil_container", LIL_CONTAINERS)
def test_enet_toy_explicit_sparse_input(lil_container):
    """Check ElasticNet for several alpha / l1_ratio values with sparse X and T."""
    # Training samples: the entry at row 1 is left as an implicit zero.
    X = lil_container((3, 1))
    for row, value in ((0, -1), (2, 1)):
        X[row, 0] = value
    Y = [-1, 0, 1]  # just a straight line (the identity function)

    # Test samples, also built entry by entry.
    T = lil_container((3, 1))
    for row, value in enumerate((2, 3, 4)):
        T[row, 0] = value

    # alpha=0 removes the penalty; the fit is wrapped to ignore its warnings.
    unpenalized = ElasticNet(alpha=0, l1_ratio=1.0)
    ignore_warnings(unpenalized.fit)(X, Y)
    unpenalized_pred = unpenalized.predict(T)
    assert_array_almost_equal(unpenalized.coef_, [1])
    assert_array_almost_equal(unpenalized_pred, [2, 3, 4])
    assert_almost_equal(unpenalized.dual_gap_, 0)

    # (l1_ratio, expected coefficient, expected predictions) for alpha=0.5.
    penalized_cases = (
        (0.3, [0.50819], [1.0163, 1.5245, 2.0327]),
        (0.5, [0.45454], [0.9090, 1.3636, 1.8181]),
    )
    for l1_ratio, expected_coef, expected_pred in penalized_cases:
        model = ElasticNet(alpha=0.5, l1_ratio=l1_ratio)
        model.fit(X, Y)
        model_pred = model.predict(T)
        assert_array_almost_equal(model.coef_, expected_coef, decimal=3)
        assert_array_almost_equal(model_pred, expected_pred, decimal=3)
        assert_almost_equal(model.dual_gap_, 0)
|
||||
|
||||
|
||||
def make_sparse_data(
    sparse_container,
    n_samples=100,
    n_features=100,
    n_informative=10,
    seed=42,
    positive=False,
    n_targets=1,
):
    """Build a seeded linear regression problem with a sparsified design matrix.

    Parameters
    ----------
    sparse_container : callable
        Applied to the dense design matrix before it is returned.
    n_samples, n_features : int
        Shape of the design matrix.
    n_informative : int
        Number of leading rows of the true coefficients that are kept
        non-zero; all later rows are set to zero.
    seed : int
        Seed for the `np.random.RandomState` driving every draw.
    positive : bool
        If True, the true coefficients are replaced by their absolute values.
    n_targets : int
        Number of target columns; with a single target `y` is flattened.

    Returns
    -------
    X : result of ``sparse_container`` applied to the design matrix
    y : ndarray, noise-free targets ``np.dot(X_dense, w)``
    """
    rng = np.random.RandomState(seed)

    # The problem is ill-posed: many noisy features, comparatively few samples.

    # Ground-truth model in which only the first rows of w matter.
    true_coef = rng.randn(n_features, n_targets)
    true_coef[n_informative:] = 0.0
    if positive:
        true_coef = np.abs(true_coef)

    # Dense design matrix with roughly half of its entries zeroed out.
    dense_X = rng.randn(n_samples, n_features)
    zero_mask = rng.uniform(size=(n_samples, n_features)) > 0.5
    dense_X[zero_mask] = 0.0

    # Targets are computed from the dense matrix before conversion.
    targets = np.dot(dense_X, true_coef)
    if n_targets == 1:
        targets = np.ravel(targets)
    return sparse_container(dense_X), targets
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
@pytest.mark.parametrize(
    "alpha, fit_intercept, positive",
    [(0.1, False, False), (0.1, True, False), (1e-3, False, True), (1e-3, True, True)],
)
def test_sparse_enet_not_as_toy_dataset(csc_container, alpha, fit_intercept, positive):
    """Sparse and dense ElasticNet must converge to the same sparse solution."""
    n_samples, n_features, max_iter = 100, 100, 1000
    n_informative = 10

    X, y = make_sparse_data(
        csc_container, n_samples, n_features, n_informative, positive=positive
    )

    # Second half is used for training, first half for scoring.
    half = n_samples // 2
    X_train, X_test = X[half:], X[:half]
    y_train, y_test = y[half:], y[:half]

    enet_params = {
        "alpha": alpha,
        "l1_ratio": 0.8,
        "fit_intercept": fit_intercept,
        "max_iter": max_iter,
        "tol": 1e-7,
        "positive": positive,
        "warm_start": True,
    }

    s_clf = ElasticNet(**enet_params)
    s_clf.fit(X_train, y_train)
    assert_almost_equal(s_clf.dual_gap_, 0, 4)
    assert s_clf.score(X_test, y_test) > 0.85

    # The dense fit must converge the same way as the sparse one.
    d_clf = ElasticNet(**enet_params)
    d_clf.fit(X_train.toarray(), y_train)
    assert_almost_equal(d_clf.dual_gap_, 0, 4)
    assert d_clf.score(X_test, y_test) > 0.85

    assert_almost_equal(s_clf.coef_, d_clf.coef_, 5)
    assert_almost_equal(s_clf.intercept_, d_clf.intercept_, 5)

    # The fitted coefficients must be sparse.
    n_nonzero = np.sum(s_clf.coef_ != 0.0)
    assert n_nonzero < 2 * n_informative
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_sparse_lasso_not_as_toy_dataset(csc_container):
    """Sparse and dense Lasso must both converge and score well on held-out data."""
    n_samples = 100
    max_iter = 1000
    n_informative = 10
    X, y = make_sparse_data(
        csc_container, n_samples=n_samples, n_informative=n_informative
    )

    # Second half is used for training, first half for scoring.
    half = n_samples // 2
    X_train, X_test = X[half:], X[:half]
    y_train, y_test = y[half:], y[:half]

    lasso_params = {
        "alpha": 0.1,
        "fit_intercept": False,
        "max_iter": max_iter,
        "tol": 1e-7,
    }

    s_clf = Lasso(**lasso_params)
    s_clf.fit(X_train, y_train)
    assert_almost_equal(s_clf.dual_gap_, 0, 4)
    assert s_clf.score(X_test, y_test) > 0.85

    # The dense fit must converge the same way as the sparse one.
    d_clf = Lasso(**lasso_params)
    d_clf.fit(X_train.toarray(), y_train)
    assert_almost_equal(d_clf.dual_gap_, 0, 4)
    assert d_clf.score(X_test, y_test) > 0.85

    # Exactly the informative features must have non-zero coefficients.
    n_nonzero = np.sum(s_clf.coef_ != 0.0)
    assert n_nonzero == n_informative
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_enet_multitarget(csc_container):
    """A multi-target fit must equal fitting each target column separately."""
    n_targets = 3
    X, y = make_sparse_data(csc_container, n_targets=n_targets)

    # XXX: There is a bug when precompute is not False!
    estimator = ElasticNet(alpha=0.01, precompute=False)
    estimator.fit(X, y)

    # Keep the joint-fit results; the estimator is refitted below.
    joint_coef = estimator.coef_
    joint_intercept = estimator.intercept_
    joint_dual_gap = estimator.dual_gap_

    for target_idx in range(n_targets):
        estimator.fit(X, y[:, target_idx])
        assert_array_almost_equal(joint_coef[target_idx, :], estimator.coef_)
        assert_array_almost_equal(joint_intercept[target_idx], estimator.intercept_)
        assert_array_almost_equal(joint_dual_gap[target_idx], estimator.dual_gap_)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_path_parameters(csc_container):
    """ElasticNetCV keeps its path parameters and matches dense MSE paths."""
    X, y = make_sparse_data(csc_container)
    max_iter = 50
    n_alphas = 10
    clf = ElasticNetCV(
        n_alphas=n_alphas,
        eps=1e-3,
        max_iter=max_iter,
        l1_ratio=0.5,
        fit_intercept=False,
    )
    fit_quietly = ignore_warnings(clf.fit)

    # Fit on the sparse input and check the parameters.
    fit_quietly(X, y)
    assert_almost_equal(0.5, clf.l1_ratio)
    assert n_alphas == clf.n_alphas
    assert n_alphas == len(clf.alphas_)
    sparse_mse_path = clf.mse_path_

    # Refit on the dense data and compare the MSE paths.
    fit_quietly(X.toarray(), y)
    assert_almost_equal(clf.mse_path_, sparse_mse_path)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("Model", [Lasso, ElasticNet, LassoCV, ElasticNetCV])
@pytest.mark.parametrize("fit_intercept", [False, True])
@pytest.mark.parametrize("n_samples, n_features", [(24, 6), (6, 24)])
@pytest.mark.parametrize("with_sample_weight", [True, False])
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_sparse_dense_equality(
    Model, fit_intercept, n_samples, n_features, with_sample_weight, csc_container
):
    """Fitting on sparse or dense X must give the same model."""
    half_features = n_features // 2
    X, y = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        effective_rank=half_features,
        n_informative=half_features,
        bias=4 * fit_intercept,
        noise=1,
        random_state=42,
    )
    sw = (
        np.abs(np.random.RandomState(42).normal(scale=10, size=y.shape))
        if with_sample_weight
        else None
    )
    Xs = csc_container(X)

    params = {"fit_intercept": fit_intercept}
    reg_dense = Model(**params).fit(X, y, sample_weight=sw)
    reg_sparse = Model(**params).fit(Xs, y, sample_weight=sw)

    if fit_intercept:
        assert reg_sparse.intercept_ == pytest.approx(reg_dense.intercept_)
        # Balance property: weighted mean prediction equals weighted mean target.
        mean_prediction = np.average(reg_sparse.predict(X), weights=sw)
        assert mean_prediction == pytest.approx(np.average(y, weights=sw))
    assert_allclose(reg_sparse.coef_, reg_dense.coef_)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_same_output_sparse_dense_lasso_and_enet_cv(csc_container):
    """ElasticNetCV and LassoCV must give the same results on sparse and dense X."""
    X, y = make_sparse_data(csc_container, n_samples=40, n_features=10)

    # ElasticNetCV is checked first, then LassoCV with 4 folds.
    configurations = (
        (ElasticNetCV, {"max_iter": 100}),
        (LassoCV, {"max_iter": 100, "cv": 4}),
    )
    for cv_estimator, params in configurations:
        clfs = cv_estimator(**params)
        clfs.fit(X, y)
        clfd = cv_estimator(**params)
        clfd.fit(X.toarray(), y)

        assert_almost_equal(clfs.alpha_, clfd.alpha_, 7)
        assert_almost_equal(clfs.intercept_, clfd.intercept_, 7)
        assert_array_almost_equal(clfs.mse_path_, clfd.mse_path_)
        assert_array_almost_equal(clfs.alphas_, clfd.alphas_)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_same_multiple_output_sparse_dense(coo_container):
    """Multi-output ElasticNet predictions must match for dense and sparse input."""
    X = [
        [0, 1, 2, 3, 4],
        [0, 2, 5, 8, 11],
        [9, 10, 11, 12, 13],
        [10, 11, 12, 13, 14],
    ]
    y = [
        [1, 2, 3, 4, 5],
        [1, 3, 6, 9, 12],
        [10, 11, 12, 13, 14],
        [11, 12, 13, 14, 15],
    ]
    sample = np.array([1, 2, 3, 4, 5]).reshape(1, -1)

    # Dense training data and dense query.
    dense_model = ElasticNet()
    dense_model.fit(X, y)
    predict_dense = dense_model.predict(sample)

    # The same data and query wrapped in the sparse container.
    sparse_model = ElasticNet()
    sparse_model.fit(coo_container(X), y)
    predict_sparse = sparse_model.predict(coo_container(sample))

    assert_array_almost_equal(predict_sparse, predict_dense)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_sparse_enet_coordinate_descent(csc_container):
    """A ConvergenceWarning must be issued when the model does not converge."""
    n_samples = 5
    n_features = 2
    X = csc_container((n_samples, n_features)) * 1e50
    y = np.ones(n_samples)

    model = Lasso(max_iter=2)  # only two iterations allowed
    expected_warning = (
        "Objective did not converge. You might want "
        "to increase the number of iterations."
    )
    with pytest.warns(ConvergenceWarning, match=expected_warning):
        model.fit(X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("copy_X", (True, False))
def test_sparse_read_only_buffer(copy_X):
    """Sparse coordinate descent must work when X.data is a read-only buffer."""
    rng = np.random.RandomState(0)
    model = ElasticNet(alpha=0.1, copy_X=copy_X, random_state=rng)

    # Draw order from the shared generator: X first, then y.
    X = sp.random(100, 20, format="csc", random_state=rng)
    # Swap the data buffer for a memmap-backed copy to make it read-only.
    X.data = create_memmap_backed_data(X.data)
    y = rng.rand(100)

    model.fit(X, y)
|
||||
@@ -0,0 +1,294 @@
|
||||
"""
|
||||
Testing for Theil-Sen module (sklearn.linear_model._theil_sen)
|
||||
"""
|
||||
|
||||
# Author: Florian Wilhelm <florian.wilhelm@gmail.com>
|
||||
# License: BSD 3 clause
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from contextlib import contextmanager
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import (
|
||||
assert_array_almost_equal,
|
||||
assert_array_equal,
|
||||
assert_array_less,
|
||||
)
|
||||
from scipy.linalg import norm
|
||||
from scipy.optimize import fmin_bfgs
|
||||
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from sklearn.linear_model import LinearRegression, TheilSenRegressor
|
||||
from sklearn.linear_model._theil_sen import (
|
||||
_breakdown_point,
|
||||
_modified_weiszfeld_step,
|
||||
_spatial_median,
|
||||
)
|
||||
from sklearn.utils._testing import assert_almost_equal
|
||||
|
||||
|
||||
@contextmanager
def no_stdout_stderr():
    """Temporarily send ``sys.stdout`` and ``sys.stderr`` to ``os.devnull``.

    The previous stream objects are put back when the ``with`` block exits,
    including when the block raises, so a failing body cannot leave the
    process writing to a closed devnull handle.
    """
    saved_stdout = sys.stdout
    saved_stderr = sys.stderr
    with open(os.devnull, "w") as devnull:
        sys.stdout = devnull
        sys.stderr = devnull
        try:
            yield
        finally:
            # Restore first so the streams are reset even if flushing fails.
            sys.stdout = saved_stdout
            sys.stderr = saved_stderr
            devnull.flush()
|
||||
|
||||
|
||||
def gen_toy_problem_1d(intercept=True):
    """Generate a seeded 1-D linear problem with a few hand-placed outliers.

    The clean model is ``y = 3 * x + c + noise`` with noise of scale 0.1.
    With ``intercept=True`` there are 50 samples and ``c = 2.0``; otherwise
    there are 100 samples and ``c = 0.1``.

    Returns
    -------
    X : ndarray of shape (n_samples, 1)
    y : ndarray of shape (n_samples,)
    w : float, the true slope
    c : float, the true offset
    """
    rng = np.random.RandomState(0)
    slope = 3.0

    # Offset, sample count and outlier positions depend on the variant.
    if intercept:
        offset, n_points = 2.0, 50
        outliers = ((42, -2, 4), (43, -2.5, 8), (33, 2.5, 1), (49, 2.1, 2))
    else:
        offset, n_points = 0.1, 100
        outliers = (
            (42, -2, 4),
            (43, -2.5, 8),
            (53, 2.5, 1),
            (60, 2.1, 2),
            (72, 1.8, -7),
        )

    x = rng.normal(size=n_points)
    noise = 0.1 * rng.normal(size=n_points)
    y = slope * x + offset + noise

    # Overwrite selected samples with outliers.
    for position, x_value, y_value in outliers:
        x[position] = x_value
        y[position] = y_value

    return x[:, np.newaxis], y, slope, offset
|
||||
|
||||
|
||||
def gen_toy_problem_2d():
    """Generate a seeded 2-D linear problem with randomly placed outliers.

    The clean model is ``y = 5 * x_1 + 10 * x_2 + 1 + noise`` (noise scale
    0.1) over 100 samples. The targets at ``n_samples // 10`` randomly drawn
    indices are then overwritten with values of scale 50.

    Returns
    -------
    X : ndarray of shape (100, 2)
    y : ndarray of shape (100,)
    w : ndarray, the true coefficients
    c : float, the true intercept
    """
    rng = np.random.RandomState(0)
    n_samples = 100

    # Draw order is fixed: features, noise, outlier indices, outlier values.
    X = rng.normal(size=(n_samples, 2))
    coef = np.array([5.0, 10.0])
    intercept = 1.0
    noise = 0.1 * rng.normal(size=n_samples)
    y = np.dot(X, coef) + intercept + noise

    # Overwrite the targets at randomly drawn indices with outliers.
    n_outliers = n_samples // 10
    outlier_idx = rng.randint(0, n_samples, size=n_outliers)
    y[outlier_idx] = 50 * rng.normal(size=n_outliers)
    return X, y, coef, intercept
|
||||
|
||||
|
||||
def gen_toy_problem_4d():
    """Generate a seeded 4-D linear problem with randomly placed outliers.

    The clean model is ``y = 5*x_1 + 10*x_2 + 42*x_3 + 7*x_4 + 1 + noise``
    (noise scale 0.1) over 10000 samples. The targets at ``n_samples // 10``
    randomly drawn indices are then overwritten with values of scale 50.

    Returns
    -------
    X : ndarray of shape (10000, 4)
    y : ndarray of shape (10000,)
    w : ndarray, the true coefficients
    c : float, the true intercept
    """
    rng = np.random.RandomState(0)
    n_samples = 10000

    # Draw order is fixed: features, noise, outlier indices, outlier values.
    X = rng.normal(size=(n_samples, 4))
    coef = np.array([5.0, 10.0, 42.0, 7.0])
    intercept = 1.0
    noise = 0.1 * rng.normal(size=n_samples)
    y = np.dot(X, coef) + intercept + noise

    # Overwrite the targets at randomly drawn indices with outliers.
    n_outliers = n_samples // 10
    outlier_idx = rng.randint(0, n_samples, size=n_outliers)
    y[outlier_idx] = 50 * rng.normal(size=n_outliers)
    return X, y, coef, intercept
|
||||
|
||||
|
||||
def test_modweiszfeld_step_1d():
    """Check single modified Weiszfeld steps on 1-D data."""
    X = np.array([1.0, 2.0, 3.0]).reshape(3, 1)
    median = 2.0

    # Start value is an element of X and already the solution.
    stepped = _modified_weiszfeld_step(X, median)
    assert_array_almost_equal(stepped, median)

    # Start values that are not the solution: 2.5 is not an element of X,
    # 3.0 is. In both cases the step must land strictly between the median
    # and the start value.
    for start in (2.5, 3.0):
        stepped = _modified_weiszfeld_step(X, start)
        assert_array_less(median, stepped)
        assert_array_less(stepped, start)

    # For a single vector the step is the identity.
    single_row = np.array([1.0, 2.0, 3.0]).reshape(1, 3)
    start = single_row[0]
    stepped = _modified_weiszfeld_step(single_row, start)
    assert_array_equal(start, stepped)
|
||||
|
||||
|
||||
def test_modweiszfeld_step_2d():
    """Check the first iterations and the fixed point of the step in 2-D."""
    X = np.array([0.0, 0.0, 1.0, 1.0, 0.0, 1.0]).reshape(3, 2)
    start = np.array([0.5, 0.5])

    # First two iterations.
    first = _modified_weiszfeld_step(X, start)
    assert_array_almost_equal(first, np.array([1 / 3, 2 / 3]))
    second = _modified_weiszfeld_step(X, first)
    assert_array_almost_equal(second, np.array([0.2792408, 0.7207592]))

    # Stepping from the fixed point must return the fixed point.
    fixed_point = np.array([0.21132505, 0.78867497])
    stepped = _modified_weiszfeld_step(X, fixed_point)
    assert_array_almost_equal(stepped, fixed_point)
|
||||
|
||||
|
||||
def test_spatial_median_1d():
    """In 1-D the spatial median must equal the ordinary median."""
    small_X = np.array([1.0, 2.0, 3.0]).reshape(3, 1)
    _, small_median = _spatial_median(small_X)
    assert_array_almost_equal(small_median, 2.0)

    # Larger problem: the 1-D result is expected to be exact.
    rng = np.random.RandomState(0)
    large_X = rng.randint(100, size=(1000, 1))
    expected_median = np.median(large_X.ravel())
    _, large_median = _spatial_median(large_X)
    assert_array_equal(large_median, expected_median)
|
||||
|
||||
|
||||
def test_spatial_median_2d():
    """The 2-D spatial median solves the Fermat-Weber problem and warns on max_iter."""
    X = np.array([0.0, 0.0, 1.0, 1.0, 0.0, 1.0]).reshape(3, 2)
    _, median = _spatial_median(X, max_iter=100, tol=1.0e-6)

    def total_distance(point):
        """Sum of Euclidean distances from `point` to every row of X."""
        distances = np.array([norm(row - point) for row in X])
        return np.sum(distances)

    # The median must minimise the Fermat-Weber location cost.
    fermat_weber = fmin_bfgs(total_distance, median, disp=False)
    assert_array_almost_equal(median, fermat_weber)

    # Exceeding the maximum number of iterations must emit a warning.
    expected_warning = "Maximum number of iterations 30 reached in spatial median."
    with pytest.warns(ConvergenceWarning, match=expected_warning):
        _spatial_median(X, max_iter=30, tol=0.0)
|
||||
|
||||
|
||||
def test_theil_sen_1d():
    """Theil-Sen recovers the 1-D model where least squares is thrown off."""
    X, y, true_slope, true_offset = gen_toy_problem_1d()

    # Least squares is expected to miss the true slope by a wide margin.
    least_squares = LinearRegression().fit(X, y)
    assert np.abs(least_squares.coef_ - true_slope) > 0.9

    # Theil-Sen is expected to match slope and intercept to one decimal.
    robust = TheilSenRegressor(random_state=0).fit(X, y)
    assert_array_almost_equal(robust.coef_, true_slope, 1)
    assert_array_almost_equal(robust.intercept_, true_offset, 1)
|
||||
|
||||
|
||||
def test_theil_sen_1d_no_intercept():
    """Theil-Sen without intercept must beat least squares on the 1-D problem."""
    X, y, true_slope, true_offset = gen_toy_problem_1d(intercept=False)

    # Least squares without intercept is expected to fail.
    least_squares = LinearRegression(fit_intercept=False).fit(X, y)
    assert np.abs(least_squares.coef_ - true_slope - true_offset) > 0.5

    # Theil-Sen is compared against slope + offset; its intercept stays zero.
    robust = TheilSenRegressor(fit_intercept=False, random_state=0).fit(X, y)
    assert_array_almost_equal(robust.coef_, true_slope + true_offset, 1)
    assert_almost_equal(robust.intercept_, 0.0)

    # Non-regression test for #18104: scoring must not fail.
    robust.score(X, y)
|
||||
|
||||
|
||||
def test_theil_sen_2d():
    """Theil-Sen recovers the 2-D model where least squares is thrown off."""
    X, y, true_coef, true_intercept = gen_toy_problem_2d()

    # Least squares is expected to be far from the true coefficients.
    least_squares = LinearRegression().fit(X, y)
    assert norm(least_squares.coef_ - true_coef) > 1.0

    # Theil-Sen with a capped subpopulation is expected to work.
    robust = TheilSenRegressor(max_subpopulation=1e3, random_state=0)
    robust.fit(X, y)
    assert_array_almost_equal(robust.coef_, true_coef, 1)
    assert_array_almost_equal(robust.intercept_, true_intercept, 1)
|
||||
|
||||
|
||||
def test_calc_breakdown_point():
    """For a huge sample count and 2 subsamples the value is near 1 - 1/sqrt(2)."""
    breakdown = _breakdown_point(1e10, 2)
    deviation = np.abs(breakdown - 1 + 1 / (np.sqrt(2)))
    assert deviation < 1.0e-6
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "param, ExceptionCls, match",
    [
        (
            {"n_subsamples": 1},
            ValueError,
            re.escape("Invalid parameter since n_features+1 > n_subsamples (2 > 1)"),
        ),
        (
            {"n_subsamples": 101},
            ValueError,
            re.escape("Invalid parameter since n_subsamples > n_samples (101 > 50)"),
        ),
    ],
)
def test_checksubparams_invalid_input(param, ExceptionCls, match):
    """Invalid `n_subsamples` values must be rejected when fitting."""
    X, y, _, _ = gen_toy_problem_1d()
    regressor = TheilSenRegressor(**param, random_state=0)
    with pytest.raises(ExceptionCls, match=match):
        regressor.fit(X, y)
|
||||
|
||||
|
||||
def test_checksubparams_n_subsamples_if_less_samples_than_features():
    """With 10 samples and 20 features, n_subsamples=9 must raise ValueError."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 10, 20
    # Draw order is fixed: features first, then targets.
    X = rng.normal(size=(n_samples, n_features))
    y = rng.normal(size=n_samples)

    regressor = TheilSenRegressor(n_subsamples=9, random_state=0)
    with pytest.raises(ValueError):
        regressor.fit(X, y)
|
||||
|
||||
|
||||
def test_subpopulation():
    """Theil-Sen with max_subpopulation=250 still recovers the 4-D model."""
    X, y, true_coef, true_intercept = gen_toy_problem_4d()
    robust = TheilSenRegressor(max_subpopulation=250, random_state=0)
    robust.fit(X, y)
    assert_array_almost_equal(robust.coef_, true_coef, 1)
    assert_array_almost_equal(robust.intercept_, true_intercept, 1)
|
||||
|
||||
|
||||
def test_subsamples():
    """Using every sample as the subsample must reproduce least squares."""
    X, y, _, _ = gen_toy_problem_4d()
    n_samples = X.shape[0]
    robust = TheilSenRegressor(n_subsamples=n_samples, random_state=0).fit(X, y)
    least_squares = LinearRegression().fit(X, y)
    # Coefficients must agree with least squares to 9 decimals.
    assert_array_almost_equal(robust.coef_, least_squares.coef_, 9)
|
||||
|
||||
|
||||
def test_verbosity():
    """Theil-Sen must fit with verbose=True; its output is silenced here."""
    X, y, _, _ = gen_toy_problem_1d()
    verbose_configs = (
        {"verbose": True, "random_state": 0},
        {"verbose": True, "max_subpopulation": 10, "random_state": 0},
    )
    with no_stdout_stderr():
        for config in verbose_configs:
            TheilSenRegressor(**config).fit(X, y)
|
||||
|
||||
|
||||
def test_theil_sen_parallel():
    """Theil-Sen with n_jobs=2 recovers the 2-D model where least squares fails."""
    X, y, true_coef, true_intercept = gen_toy_problem_2d()

    # Least squares is expected to be far from the true coefficients.
    least_squares = LinearRegression().fit(X, y)
    assert norm(least_squares.coef_ - true_coef) > 1.0

    # Theil-Sen run with two jobs is expected to work.
    robust = TheilSenRegressor(n_jobs=2, random_state=0, max_subpopulation=2e3)
    robust.fit(X, y)
    assert_array_almost_equal(robust.coef_, true_coef, 1)
    assert_array_almost_equal(robust.intercept_, true_intercept, 1)
|
||||
|
||||
|
||||
def test_less_samples_than_features():
    """Check Theil-Sen when there are fewer samples than features."""
    rng = np.random.RandomState(0)
    n_samples, n_features = 10, 20
    # Draw order is fixed: features first, then targets.
    X = rng.normal(size=(n_samples, n_features))
    y = rng.normal(size=n_samples)

    # Without intercept, Theil-Sen falls back to least squares.
    robust = TheilSenRegressor(fit_intercept=False, random_state=0).fit(X, y)
    least_squares = LinearRegression(fit_intercept=False).fit(X, y)
    assert_array_almost_equal(robust.coef_, least_squares.coef_, 12)

    # With intercept the solution differs from least squares because the
    # intercept is calculated differently; the training targets must still
    # be reproduced.
    robust = TheilSenRegressor(fit_intercept=True, random_state=0).fit(X, y)
    fitted_values = robust.predict(X)
    assert_array_almost_equal(fitted_values, y, 12)
|
||||
Reference in New Issue
Block a user