library packages
44 binary files not shown.
@@ -0,0 +1,16 @@
import pytest
from numpy.testing import assert_allclose

from sklearn.utils import check_random_state
from sklearn.utils._arpack import _init_arpack_v0


@pytest.mark.parametrize("seed", range(100))
def test_init_arpack_v0(seed):
    # check that the initialization is a sampling from a uniform distribution
    # where we can fix the random state
    size = 1000
    v0 = _init_arpack_v0(size, seed)

    rng = check_random_state(seed)
    assert_allclose(v0, rng.uniform(-1, 1, size=size))
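The test above pins down the contract of `_init_arpack_v0`: for a given seed it must reproduce a uniform sample on [-1, 1). A minimal standalone sketch of a helper satisfying that contract (the real implementation lives in `sklearn.utils._arpack`; this version is only for illustration):

import numpy as np
from sklearn.utils import check_random_state

def init_arpack_v0_sketch(size, random_state):
    # Draw the ARPACK starting vector from U(-1, 1) with a fixed seed so
    # that eigen-solver results are reproducible across calls.
    rng = check_random_state(random_state)
    return rng.uniform(-1, 1, size=size)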
@@ -0,0 +1,580 @@
import re
from functools import partial

import numpy
import pytest
from numpy.testing import assert_allclose

from sklearn._config import config_context
from sklearn.base import BaseEstimator
from sklearn.utils._array_api import (
    _ArrayAPIWrapper,
    _asarray_with_order,
    _atol_for_type,
    _average,
    _convert_to_numpy,
    _count_nonzero,
    _estimator_with_converted_arrays,
    _is_numpy_namespace,
    _nanmax,
    _nanmin,
    _NumPyAPIWrapper,
    _ravel,
    device,
    get_namespace,
    get_namespace_and_device,
    indexing_dtype,
    supported_float_dtypes,
    yield_namespace_device_dtype_combinations,
)
from sklearn.utils._testing import (
    _array_api_for_tests,
    skip_if_array_api_compat_not_configured,
)
from sklearn.utils.fixes import _IS_32BIT, CSR_CONTAINERS, np_version, parse_version


@pytest.mark.parametrize("X", [numpy.asarray([1, 2, 3]), [1, 2, 3]])
def test_get_namespace_ndarray_default(X):
    """Check that get_namespace returns NumPy wrapper"""
    xp_out, is_array_api_compliant = get_namespace(X)
    assert isinstance(xp_out, _NumPyAPIWrapper)
    assert not is_array_api_compliant


def test_get_namespace_ndarray_creation_device():
    """Check expected behavior with device and creation functions."""
    X = numpy.asarray([1, 2, 3])
    xp_out, _ = get_namespace(X)

    full_array = xp_out.full(10, fill_value=2.0, device="cpu")
    assert_allclose(full_array, [2.0] * 10)

    with pytest.raises(ValueError, match="Unsupported device"):
        xp_out.zeros(10, device="cuda")


@skip_if_array_api_compat_not_configured
def test_get_namespace_ndarray_with_dispatch():
    """Test get_namespace on NumPy ndarrays."""
    array_api_compat = pytest.importorskip("array_api_compat")

    X_np = numpy.asarray([[1, 2, 3]])

    with config_context(array_api_dispatch=True):
        xp_out, is_array_api_compliant = get_namespace(X_np)
        assert is_array_api_compliant
        if np_version >= parse_version("2.0.0"):
            # NumPy 2.0+ is an array API compliant library.
            assert xp_out is numpy
        else:
            # Older NumPy versions require the compatibility layer.
            assert xp_out is array_api_compat.numpy


@skip_if_array_api_compat_not_configured
def test_get_namespace_array_api():
    """Test get_namespace for ArrayAPI arrays."""
    xp = pytest.importorskip("array_api_strict")

    X_np = numpy.asarray([[1, 2, 3]])
    X_xp = xp.asarray(X_np)
    with config_context(array_api_dispatch=True):
        xp_out, is_array_api_compliant = get_namespace(X_xp)
        assert is_array_api_compliant

        with pytest.raises(TypeError):
            xp_out, is_array_api_compliant = get_namespace(X_xp, X_np)


class _AdjustableNameAPITestWrapper(_ArrayAPIWrapper):
    """API wrapper that has an adjustable name. Used for testing."""

    def __init__(self, array_namespace, name):
        super().__init__(array_namespace=array_namespace)
        self.__name__ = name


def test_array_api_wrapper_astype():
    """Test _ArrayAPIWrapper for ArrayAPIs that are not NumPy."""
    array_api_strict = pytest.importorskip("array_api_strict")
    xp_ = _AdjustableNameAPITestWrapper(array_api_strict, "array_api_strict")
    xp = _ArrayAPIWrapper(xp_)

    X = xp.asarray(([[1, 2, 3], [3, 4, 5]]), dtype=xp.float64)
    X_converted = xp.astype(X, xp.float32)
    assert X_converted.dtype == xp.float32

    X_converted = xp.asarray(X, dtype=xp.float32)
    assert X_converted.dtype == xp.float32


@pytest.mark.parametrize("array_api", ["numpy", "array_api_strict"])
def test_asarray_with_order(array_api):
    """Test _asarray_with_order passes along order for NumPy arrays."""
    xp = pytest.importorskip(array_api)

    X = xp.asarray([1.2, 3.4, 5.1])
    X_new = _asarray_with_order(X, order="F", xp=xp)

    X_new_np = numpy.asarray(X_new)
    assert X_new_np.flags["F_CONTIGUOUS"]


def test_asarray_with_order_ignored():
    """Test _asarray_with_order ignores order for generic ArrayAPI."""
    xp = pytest.importorskip("array_api_strict")
    xp_ = _AdjustableNameAPITestWrapper(xp, "array_api_strict")

    X = numpy.asarray([[1.2, 3.4, 5.1], [3.4, 5.5, 1.2]], order="C")
    X = xp_.asarray(X)

    X_new = _asarray_with_order(X, order="F", xp=xp_)

    X_new_np = numpy.asarray(X_new)
    assert X_new_np.flags["C_CONTIGUOUS"]
    assert not X_new_np.flags["F_CONTIGUOUS"]


@pytest.mark.parametrize(
    "array_namespace, device_, dtype_name", yield_namespace_device_dtype_combinations()
)
@pytest.mark.parametrize(
    "weights, axis, normalize, expected",
    [
        # normalize = True
        (None, None, True, 3.5),
        (None, 0, True, [2.5, 3.5, 4.5]),
        (None, 1, True, [2, 5]),
        ([True, False], 0, True, [1, 2, 3]),  # boolean weights
        ([True, True, False], 1, True, [1.5, 4.5]),  # boolean weights
        ([0.4, 0.1], 0, True, [1.6, 2.6, 3.6]),
        ([0.4, 0.2, 0.2], 1, True, [1.75, 4.75]),
        ([1, 2], 0, True, [3, 4, 5]),
        ([1, 1, 2], 1, True, [2.25, 5.25]),
        ([[1, 2, 3], [1, 2, 3]], 0, True, [2.5, 3.5, 4.5]),
        ([[1, 2, 1], [2, 2, 2]], 1, True, [2, 5]),
        # normalize = False
        (None, None, False, 21),
        (None, 0, False, [5, 7, 9]),
        (None, 1, False, [6, 15]),
        ([True, False], 0, False, [1, 2, 3]),  # boolean weights
        ([True, True, False], 1, False, [3, 9]),  # boolean weights
        ([0.4, 0.1], 0, False, [0.8, 1.3, 1.8]),
        ([0.4, 0.2, 0.2], 1, False, [1.4, 3.8]),
        ([1, 2], 0, False, [9, 12, 15]),
        ([1, 1, 2], 1, False, [9, 21]),
        ([[1, 2, 3], [1, 2, 3]], 0, False, [5, 14, 27]),
        ([[1, 2, 1], [2, 2, 2]], 1, False, [8, 30]),
    ],
)
def test_average(
    array_namespace, device_, dtype_name, weights, axis, normalize, expected
):
    xp = _array_api_for_tests(array_namespace, device_)
    array_in = numpy.asarray([[1, 2, 3], [4, 5, 6]], dtype=dtype_name)
    array_in = xp.asarray(array_in, device=device_)
    if weights is not None:
        weights = numpy.asarray(weights, dtype=dtype_name)
        weights = xp.asarray(weights, device=device_)

    with config_context(array_api_dispatch=True):
        result = _average(array_in, axis=axis, weights=weights, normalize=normalize)

    if np_version < parse_version("2.0.0") or np_version >= parse_version("2.1.0"):
        # NumPy 2.0 has a problem with the device attribute of scalar arrays:
        # https://github.com/numpy/numpy/issues/26850
        assert device(array_in) == device(result)

    result = _convert_to_numpy(result, xp)
    assert_allclose(result, expected, atol=_atol_for_type(dtype_name))


@pytest.mark.parametrize(
    "array_namespace, device, dtype_name",
    yield_namespace_device_dtype_combinations(include_numpy_namespaces=False),
)
def test_average_raises_with_wrong_dtype(array_namespace, device, dtype_name):
    xp = _array_api_for_tests(array_namespace, device)

    array_in = numpy.asarray([2, 0], dtype=dtype_name) + 1j * numpy.asarray(
        [4, 3], dtype=dtype_name
    )
    complex_type_name = array_in.dtype.name
    if not hasattr(xp, complex_type_name):
        # This is the case for cupy as of March 2024 for instance.
        pytest.skip(f"{array_namespace} does not support {complex_type_name}")

    array_in = xp.asarray(array_in, device=device)

    err_msg = "Complex floating point values are not supported by average."
    with (
        config_context(array_api_dispatch=True),
        pytest.raises(NotImplementedError, match=err_msg),
    ):
        _average(array_in)


@pytest.mark.parametrize(
    "array_namespace, device, dtype_name",
    yield_namespace_device_dtype_combinations(include_numpy_namespaces=True),
)
@pytest.mark.parametrize(
    "axis, weights, error, error_msg",
    (
        (
            None,
            [1, 2],
            TypeError,
            "Axis must be specified",
        ),
        (
            0,
            [[1, 2]],
            # NumPy 2 raises ValueError, NumPy 1 raises TypeError
            (ValueError, TypeError),
            "weights",  # the message is different for NumPy 1 and 2...
        ),
        (
            0,
            [1, 2, 3, 4],
            ValueError,
            "weights",
        ),
        (0, [-1, 1], ZeroDivisionError, "Weights sum to zero, can't be normalized"),
    ),
)
def test_average_raises_with_invalid_parameters(
    array_namespace, device, dtype_name, axis, weights, error, error_msg
):
    xp = _array_api_for_tests(array_namespace, device)

    array_in = numpy.asarray([[1, 2, 3], [4, 5, 6]], dtype=dtype_name)
    array_in = xp.asarray(array_in, device=device)

    weights = numpy.asarray(weights, dtype=dtype_name)
    weights = xp.asarray(weights, device=device)

    with config_context(array_api_dispatch=True), pytest.raises(error, match=error_msg):
        _average(array_in, axis=axis, weights=weights)


def test_device_raises_if_no_input():
    err_msg = re.escape(
        "At least one input array expected after filtering with remove_none=True, "
        "remove_types=[str]. Got none. Original types: []."
    )
    with pytest.raises(ValueError, match=err_msg):
        device()

    err_msg = re.escape(
        "At least one input array expected after filtering with remove_none=True, "
        "remove_types=[str]. Got none. Original types: [NoneType, str]."
    )
    with pytest.raises(ValueError, match=err_msg):
        device(None, "name")


def test_device_inspection():
    class Device:
        def __init__(self, name):
            self.name = name

        def __eq__(self, device):
            return self.name == device.name

        def __hash__(self):
            raise TypeError("Device object is not hashable")

        def __str__(self):
            return self.name

    class Array:
        def __init__(self, device_name):
            self.device = Device(device_name)

    # Sanity check: ensure our Device mock class is non-hashable, to
    # accurately account for non-hashable device objects in some array
    # libraries, because of which the `device` inspection function shouldn't
    # make use of hash lookup tables (in particular, not use `set`)
    with pytest.raises(TypeError):
        hash(Array("device").device)

    # Test that it raises when arrays are on different devices
    err_msg = "Input arrays use different devices: cpu, mygpu"
    with pytest.raises(ValueError, match=err_msg):
        device(Array("cpu"), Array("mygpu"))

    # Test that the expected value is returned otherwise
    array1 = Array("device")
    array2 = Array("device")

    assert array1.device == device(array1)
    assert array1.device == device(array1, array2)
    assert array1.device == device(array1, array1, array2)


# TODO: add cupy and cupy.array_api to the list of libraries once the
# following upstream issue has been fixed:
# https://github.com/cupy/cupy/issues/8180
@skip_if_array_api_compat_not_configured
@pytest.mark.parametrize("library", ["numpy", "array_api_strict", "torch"])
@pytest.mark.parametrize(
    "X,reduction,expected",
    [
        ([1, 2, numpy.nan], _nanmin, 1),
        ([1, -2, -numpy.nan], _nanmin, -2),
        ([numpy.inf, numpy.inf], _nanmin, numpy.inf),
        (
            [[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
            partial(_nanmin, axis=0),
            [1.0, 2.0, 3.0],
        ),
        (
            [[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
            partial(_nanmin, axis=1),
            [1.0, numpy.nan, 4.0],
        ),
        ([1, 2, numpy.nan], _nanmax, 2),
        ([1, 2, numpy.nan], _nanmax, 2),
        ([-numpy.inf, -numpy.inf], _nanmax, -numpy.inf),
        (
            [[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
            partial(_nanmax, axis=0),
            [4.0, 5.0, 6.0],
        ),
        (
            [[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
            partial(_nanmax, axis=1),
            [3.0, numpy.nan, 6.0],
        ),
    ],
)
def test_nan_reductions(library, X, reduction, expected):
    """Check NaN reductions like _nanmin and _nanmax"""
    xp = pytest.importorskip(library)

    with config_context(array_api_dispatch=True):
        result = reduction(xp.asarray(X))

    result = _convert_to_numpy(result, xp)
    assert_allclose(result, expected)


@pytest.mark.parametrize(
    "namespace, _device, _dtype", yield_namespace_device_dtype_combinations()
)
def test_ravel(namespace, _device, _dtype):
    xp = _array_api_for_tests(namespace, _device)

    array = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]
    array_xp = xp.asarray(array, device=_device)
    with config_context(array_api_dispatch=True):
        result = _ravel(array_xp)

    result = _convert_to_numpy(result, xp)
    expected = numpy.ravel(array, order="C")

    assert_allclose(expected, result)

    if _is_numpy_namespace(xp):
        assert numpy.asarray(result).flags["C_CONTIGUOUS"]


@skip_if_array_api_compat_not_configured
@pytest.mark.parametrize("library", ["cupy", "torch", "cupy.array_api"])
def test_convert_to_numpy_gpu(library):  # pragma: nocover
    """Check convert_to_numpy for GPU backed libraries."""
    xp = pytest.importorskip(library)

    if library == "torch":
        if not xp.backends.cuda.is_built():
            pytest.skip("test requires cuda")
        X_gpu = xp.asarray([1.0, 2.0, 3.0], device="cuda")
    else:
        X_gpu = xp.asarray([1.0, 2.0, 3.0])

    X_cpu = _convert_to_numpy(X_gpu, xp=xp)
    expected_output = numpy.asarray([1.0, 2.0, 3.0])
    assert_allclose(X_cpu, expected_output)


def test_convert_to_numpy_cpu():
    """Check convert_to_numpy for PyTorch CPU arrays."""
    torch = pytest.importorskip("torch")
    X_torch = torch.asarray([1.0, 2.0, 3.0], device="cpu")

    X_cpu = _convert_to_numpy(X_torch, xp=torch)
    expected_output = numpy.asarray([1.0, 2.0, 3.0])
    assert_allclose(X_cpu, expected_output)


class SimpleEstimator(BaseEstimator):
    def fit(self, X, y=None):
        self.X_ = X
        self.n_features_ = X.shape[0]
        return self


@skip_if_array_api_compat_not_configured
@pytest.mark.parametrize(
    "array_namespace, converter",
    [
        ("torch", lambda array: array.cpu().numpy()),
        ("array_api_strict", lambda array: numpy.asarray(array)),
        ("cupy.array_api", lambda array: array._array.get()),
    ],
)
def test_convert_estimator_to_ndarray(array_namespace, converter):
    """Convert estimator attributes to ndarray."""
    xp = pytest.importorskip(array_namespace)

    X = xp.asarray([[1.3, 4.5]])
    est = SimpleEstimator().fit(X)

    new_est = _estimator_with_converted_arrays(est, converter)
    assert isinstance(new_est.X_, numpy.ndarray)


@skip_if_array_api_compat_not_configured
def test_convert_estimator_to_array_api():
    """Convert estimator attributes to ArrayAPI arrays."""
    xp = pytest.importorskip("array_api_strict")

    X_np = numpy.asarray([[1.3, 4.5]])
    est = SimpleEstimator().fit(X_np)

    new_est = _estimator_with_converted_arrays(est, lambda array: xp.asarray(array))
    assert hasattr(new_est.X_, "__array_namespace__")


def test_reshape_behavior():
    """Check reshape behavior with copy and is strict with non-tuple shape."""
    xp = _NumPyAPIWrapper()
    X = xp.asarray([[1, 2, 3], [3, 4, 5]])

    X_no_copy = xp.reshape(X, (-1,), copy=False)
    assert X_no_copy.base is X

    X_copy = xp.reshape(X, (6, 1), copy=True)
    assert X_copy.base is not X.base

    with pytest.raises(TypeError, match="shape must be a tuple"):
        xp.reshape(X, -1)


@pytest.mark.parametrize("wrapper", [_ArrayAPIWrapper, _NumPyAPIWrapper])
def test_get_namespace_array_api_isdtype(wrapper):
    """Test isdtype implementation from _ArrayAPIWrapper and _NumPyAPIWrapper."""

    if wrapper == _ArrayAPIWrapper:
        xp_ = pytest.importorskip("array_api_strict")
        xp = _ArrayAPIWrapper(xp_)
    else:
        xp = _NumPyAPIWrapper()

    assert xp.isdtype(xp.float32, xp.float32)
    assert xp.isdtype(xp.float32, "real floating")
    assert xp.isdtype(xp.float64, "real floating")
    assert not xp.isdtype(xp.int32, "real floating")

    for dtype in supported_float_dtypes(xp):
        assert xp.isdtype(dtype, "real floating")

    assert xp.isdtype(xp.bool, "bool")
    assert not xp.isdtype(xp.float32, "bool")

    assert xp.isdtype(xp.int16, "signed integer")
    assert not xp.isdtype(xp.uint32, "signed integer")

    assert xp.isdtype(xp.uint16, "unsigned integer")
    assert not xp.isdtype(xp.int64, "unsigned integer")

    assert xp.isdtype(xp.int64, "numeric")
    assert xp.isdtype(xp.float32, "numeric")
    assert xp.isdtype(xp.uint32, "numeric")

    assert not xp.isdtype(xp.float32, "complex floating")

    if wrapper == _NumPyAPIWrapper:
        assert not xp.isdtype(xp.int8, "complex floating")
        assert xp.isdtype(xp.complex64, "complex floating")
        assert xp.isdtype(xp.complex128, "complex floating")

    with pytest.raises(ValueError, match="Unrecognized data type"):
        assert xp.isdtype(xp.int16, "unknown")


@pytest.mark.parametrize(
    "namespace, _device, _dtype", yield_namespace_device_dtype_combinations()
)
def test_indexing_dtype(namespace, _device, _dtype):
    xp = _array_api_for_tests(namespace, _device)

    if _IS_32BIT:
        assert indexing_dtype(xp) == xp.int32
    else:
        assert indexing_dtype(xp) == xp.int64


def test_get_namespace_and_device():
    # Use torch as a library with custom Device objects:
    torch = pytest.importorskip("torch")
    xp_torch = pytest.importorskip("array_api_compat.torch")
    some_torch_tensor = torch.arange(3, device="cpu")
    some_numpy_array = numpy.arange(3)

    # When dispatch is disabled, get_namespace_and_device should return the
    # default NumPy wrapper namespace and no device. Our code will handle such
    # inputs via the usual __array__ interface without attempting to dispatch
    # via the array API.
    namespace, is_array_api, device = get_namespace_and_device(some_torch_tensor)
    assert namespace is get_namespace(some_numpy_array)[0]
    assert not is_array_api
    assert device is None

    # Otherwise, expose the torch namespace and device via the array API compat
    # wrapper.
    with config_context(array_api_dispatch=True):
        namespace, is_array_api, device = get_namespace_and_device(some_torch_tensor)
        assert namespace is xp_torch
        assert is_array_api
        assert device == some_torch_tensor.device


@pytest.mark.parametrize(
    "array_namespace, device_, dtype_name", yield_namespace_device_dtype_combinations()
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
@pytest.mark.parametrize("axis", [0, 1, None, -1, -2])
@pytest.mark.parametrize("sample_weight_type", [None, "int", "float"])
def test_count_nonzero(
    array_namespace, device_, dtype_name, csr_container, axis, sample_weight_type
):
    from sklearn.utils.sparsefuncs import count_nonzero as sparse_count_nonzero

    xp = _array_api_for_tests(array_namespace, device_)
    array = numpy.array([[0, 3, 0], [2, -1, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]])
    if sample_weight_type == "int":
        sample_weight = numpy.asarray([1, 2, 2, 3, 1])
    elif sample_weight_type == "float":
        sample_weight = numpy.asarray([0.5, 1.5, 0.8, 3.2, 2.4], dtype=dtype_name)
    else:
        sample_weight = None
    expected = sparse_count_nonzero(
        csr_container(array), axis=axis, sample_weight=sample_weight
    )
    array_xp = xp.asarray(array, device=device_)

    with config_context(array_api_dispatch=True):
        result = _count_nonzero(
            array_xp, xp=xp, device=device_, axis=axis, sample_weight=sample_weight
        )

    assert_allclose(_convert_to_numpy(result, xp=xp), expected)

    if np_version < parse_version("2.0.0") or np_version >= parse_version("2.1.0"):
        # NumPy 2.0 has a problem with the device attribute of scalar arrays:
        # https://github.com/numpy/numpy/issues/26850
        assert device(array_xp) == device(result)
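All of the tests above rely on the same dispatch pattern: `get_namespace` falls back to a NumPy wrapper unless `array_api_dispatch` is enabled in the global config. A minimal usage sketch of that pattern, mirroring the tests (assumes the `array_api_compat` package is installed so dispatch can be enabled):

import numpy
from sklearn._config import config_context
from sklearn.utils._array_api import get_namespace

X = numpy.asarray([[1.0, 2.0], [3.0, 4.0]])

# Without dispatch, the fallback NumPy wrapper is returned.
xp, is_compliant = get_namespace(X)
print(type(xp).__name__, is_compliant)  # _NumPyAPIWrapper False

# With dispatch enabled, the array's own namespace is used instead.
with config_context(array_api_dispatch=True):
    xp, is_compliant = get_namespace(X)
    print(is_compliant)  # True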
@@ -0,0 +1,40 @@
import numpy as np
import pytest

from sklearn.utils._testing import assert_allclose
from sklearn.utils.arrayfuncs import _all_with_any_reduction_axis_1, min_pos


def test_min_pos():
    # Check that min_pos returns a positive value and that it's consistent
    # between float and double
    X = np.random.RandomState(0).randn(100)

    min_double = min_pos(X)
    min_float = min_pos(X.astype(np.float32))

    assert_allclose(min_double, min_float)
    assert min_double >= 0


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_min_pos_no_positive(dtype):
    # Check that the return value of min_pos is the maximum representable
    # value of the input dtype when all input elements are <= 0 (#19328)
    X = np.full(100, -1.0).astype(dtype, copy=False)

    assert min_pos(X) == np.finfo(dtype).max


@pytest.mark.parametrize(
    "dtype", [np.int16, np.int32, np.int64, np.float32, np.float64]
)
@pytest.mark.parametrize("value", [0, 1.5, -1])
def test_all_with_any_reduction_axis_1(dtype, value):
    # Check that return value is False when there is no row equal to `value`
    X = np.arange(12, dtype=dtype).reshape(3, 4)
    assert not _all_with_any_reduction_axis_1(X, value=value)

    # Make a row equal to `value`
    X[1, :] = value
    assert _all_with_any_reduction_axis_1(X, value=value)
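The two `min_pos` tests together specify its semantics: the smallest strictly positive entry, or the dtype's maximum representable value when nothing is positive. A pure-NumPy sketch of that behavior, assuming a floating-point input (the real helper is compiled Cython in `sklearn.utils.arrayfuncs`):

import numpy as np

def min_pos_sketch(X):
    # Smallest strictly-positive entry; falls back to the dtype's max
    # when no entry is positive, matching what the tests assert.
    X = np.asarray(X)
    positive = X[X > 0]
    return positive.min() if positive.size else np.finfo(X.dtype).max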
@@ -0,0 +1,32 @@
import warnings

import numpy as np
import pytest

from sklearn.utils import Bunch


def test_bunch_attribute_deprecation():
    """Check that Bunch raises a deprecation message with `__getattr__`."""
    bunch = Bunch()
    values = np.asarray([1, 2, 3])
    msg = (
        "Key: 'values', is deprecated in 1.3 and will be "
        "removed in 1.5. Please use 'grid_values' instead"
    )
    bunch._set_deprecated(
        values, new_key="grid_values", deprecated_key="values", warning_message=msg
    )

    with warnings.catch_warnings():
        # Does not warn for "grid_values"
        warnings.simplefilter("error")
        v = bunch["grid_values"]

    assert v is values

    with pytest.warns(FutureWarning, match=msg):
        # Warns for "values"
        v = bunch["values"]

    assert v is values
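For context, `Bunch` is a dict subclass whose keys are also reachable as attributes; `_set_deprecated` (a private helper) registers a renamed key that still resolves but warns. The public part of that contract in a short sketch:

from sklearn.utils import Bunch

b = Bunch(a=1)
assert b.a == b["a"]  # attribute and key access are interchangeable
b.b = 2               # attribute assignment sets the dict entry too
assert b["b"] == 2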
@@ -0,0 +1,73 @@
import warnings
from itertools import chain

import pytest

from sklearn import config_context
from sklearn.utils._chunking import gen_even_slices, get_chunk_n_rows
from sklearn.utils._testing import assert_array_equal


def test_gen_even_slices():
    # check that gen_even_slices contains all samples
    some_range = range(10)
    joined_range = list(chain(*[some_range[slice] for slice in gen_even_slices(10, 3)]))
    assert_array_equal(some_range, joined_range)


@pytest.mark.parametrize(
    ("row_bytes", "max_n_rows", "working_memory", "expected"),
    [
        (1024, None, 1, 1024),
        (1024, None, 0.99999999, 1023),
        (1023, None, 1, 1025),
        (1025, None, 1, 1023),
        (1024, None, 2, 2048),
        (1024, 7, 1, 7),
        (1024 * 1024, None, 1, 1),
    ],
)
def test_get_chunk_n_rows(row_bytes, max_n_rows, working_memory, expected):
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        actual = get_chunk_n_rows(
            row_bytes=row_bytes,
            max_n_rows=max_n_rows,
            working_memory=working_memory,
        )

    assert actual == expected
    assert type(actual) is type(expected)
    with config_context(working_memory=working_memory):
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            actual = get_chunk_n_rows(row_bytes=row_bytes, max_n_rows=max_n_rows)
        assert actual == expected
        assert type(actual) is type(expected)


def test_get_chunk_n_rows_warns():
    """Check that a warning is raised when working_memory is too low."""
    row_bytes = 1024 * 1024 + 1
    max_n_rows = None
    working_memory = 1
    expected = 1

    warn_msg = (
        "Could not adhere to working_memory config. Currently 1MiB, 2MiB required."
    )
    with pytest.warns(UserWarning, match=warn_msg):
        actual = get_chunk_n_rows(
            row_bytes=row_bytes,
            max_n_rows=max_n_rows,
            working_memory=working_memory,
        )

    assert actual == expected
    assert type(actual) is type(expected)

    with config_context(working_memory=working_memory):
        with pytest.warns(UserWarning, match=warn_msg):
            actual = get_chunk_n_rows(row_bytes=row_bytes, max_n_rows=max_n_rows)
        assert actual == expected
        assert type(actual) is type(expected)
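The parametrization above encodes the rule chunk_n_rows = min(max_n_rows, floor(working_memory * 2**20 / row_bytes)), clamped to at least 1 row (with a warning when even a single row exceeds the budget). Two of the table's cases, worked out explicitly:

from sklearn.utils._chunking import get_chunk_n_rows

# 1 MiB budget / 1024 bytes per row -> 1048576 // 1024 = 1024 rows per chunk.
assert get_chunk_n_rows(row_bytes=1024, working_memory=1) == 1024

# max_n_rows caps the result regardless of the memory budget.
assert get_chunk_n_rows(row_bytes=1024, max_n_rows=7, working_memory=1) == 7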
@@ -0,0 +1,316 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose

from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils._testing import assert_almost_equal, assert_array_almost_equal
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.utils.fixes import CSC_CONTAINERS


def test_compute_class_weight():
    # Test (and demo) compute_class_weight.
    y = np.asarray([2, 2, 2, 3, 3, 4])
    classes = np.unique(y)

    cw = compute_class_weight("balanced", classes=classes, y=y)
    # total effect of samples is preserved
    class_counts = np.bincount(y)[2:]
    assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
    assert cw[0] < cw[1] < cw[2]


@pytest.mark.parametrize(
    "y_type, class_weight, classes, err_msg",
    [
        (
            "numeric",
            "balanced",
            np.arange(4),
            "classes should have valid labels that are in y",
        ),
        # Non-regression for https://github.com/scikit-learn/scikit-learn/issues/8312
        (
            "numeric",
            {"label_not_present": 1.0},
            np.arange(4),
            r"The classes, \[0, 1, 2, 3\], are not in class_weight",
        ),
        (
            "numeric",
            "balanced",
            np.arange(2),
            "classes should include all valid labels",
        ),
        (
            "numeric",
            {0: 1.0, 1: 2.0},
            np.arange(2),
            "classes should include all valid labels",
        ),
        (
            "string",
            {"dogs": 3, "cat": 2},
            np.array(["dog", "cat"]),
            r"The classes, \['dog'\], are not in class_weight",
        ),
    ],
)
def test_compute_class_weight_not_present(y_type, class_weight, classes, err_msg):
    # Raise an error when y does not contain all class labels
    y = (
        np.asarray([0, 0, 0, 1, 1, 2])
        if y_type == "numeric"
        else np.asarray(["dog", "cat", "dog"])
    )

    print(y)
    with pytest.raises(ValueError, match=err_msg):
        compute_class_weight(class_weight, classes=classes, y=y)


def test_compute_class_weight_dict():
    classes = np.arange(3)
    class_weights = {0: 1.0, 1: 2.0, 2: 3.0}
    y = np.asarray([0, 0, 1, 2])
    cw = compute_class_weight(class_weights, classes=classes, y=y)

    # When the user specifies class weights, compute_class_weight should just
    # return them.
    assert_array_almost_equal(np.asarray([1.0, 2.0, 3.0]), cw)

    # When a class weight is specified that isn't in classes, the weight is ignored
    class_weights = {0: 1.0, 1: 2.0, 2: 3.0, 4: 1.5}
    cw = compute_class_weight(class_weights, classes=classes, y=y)
    assert_allclose([1.0, 2.0, 3.0], cw)

    class_weights = {-1: 5.0, 0: 4.0, 1: 2.0, 2: 3.0}
    cw = compute_class_weight(class_weights, classes=classes, y=y)
    assert_allclose([4.0, 2.0, 3.0], cw)


def test_compute_class_weight_invariance():
    # Test that results with class_weight="balanced" are invariant w.r.t.
    # class imbalance if the number of samples is identical.
    # The test uses a balanced two class dataset with 100 datapoints.
    # It creates three versions, one where class 1 is duplicated
    # resulting in 150 points of class 1 and 50 of class 0,
    # one where there are 50 points in class 1 and 150 in class 0,
    # and one where there are 100 points of each class (this one is balanced
    # again).
    # With balancing class weights, all three should give the same model.
    X, y = make_blobs(centers=2, random_state=0)
    # create dataset where class 1 is duplicated twice
    X_1 = np.vstack([X] + [X[y == 1]] * 2)
    y_1 = np.hstack([y] + [y[y == 1]] * 2)
    # create dataset where class 0 is duplicated twice
    X_0 = np.vstack([X] + [X[y == 0]] * 2)
    y_0 = np.hstack([y] + [y[y == 0]] * 2)
    # duplicate everything
    X_ = np.vstack([X] * 2)
    y_ = np.hstack([y] * 2)
    # results should be identical
    logreg1 = LogisticRegression(class_weight="balanced").fit(X_1, y_1)
    logreg0 = LogisticRegression(class_weight="balanced").fit(X_0, y_0)
    logreg = LogisticRegression(class_weight="balanced").fit(X_, y_)
    assert_array_almost_equal(logreg1.coef_, logreg0.coef_)
    assert_array_almost_equal(logreg.coef_, logreg0.coef_)


def test_compute_class_weight_balanced_negative():
    # Test compute_class_weight when labels are negative
    # Test with balanced class labels.
    classes = np.array([-2, -1, 0])
    y = np.asarray([-1, -1, 0, 0, -2, -2])

    cw = compute_class_weight("balanced", classes=classes, y=y)
    assert len(cw) == len(classes)
    assert_array_almost_equal(cw, np.array([1.0, 1.0, 1.0]))

    # Test with unbalanced class labels.
    y = np.asarray([-1, 0, 0, -2, -2, -2])

    cw = compute_class_weight("balanced", classes=classes, y=y)
    assert len(cw) == len(classes)
    class_counts = np.bincount(y + 2)
    assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
    assert_array_almost_equal(cw, [2.0 / 3, 2.0, 1.0])


def test_compute_class_weight_balanced_unordered():
    # Test compute_class_weight when classes are unordered
    classes = np.array([1, 0, 3])
    y = np.asarray([1, 0, 0, 3, 3, 3])

    cw = compute_class_weight("balanced", classes=classes, y=y)
    class_counts = np.bincount(y)[classes]
    assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
    assert_array_almost_equal(cw, [2.0, 1.0, 2.0 / 3])


def test_compute_class_weight_default():
    # Test for the case where no weight is given for a present class.
    # Current behaviour is to assign the unweighted classes a weight of 1.
    y = np.asarray([2, 2, 2, 3, 3, 4])
    classes = np.unique(y)
    classes_len = len(classes)

    # Test for non-specified weights
    cw = compute_class_weight(None, classes=classes, y=y)
    assert len(cw) == classes_len
    assert_array_almost_equal(cw, np.ones(3))

    # Tests for partly specified weights
    cw = compute_class_weight({2: 1.5}, classes=classes, y=y)
    assert len(cw) == classes_len
    assert_array_almost_equal(cw, [1.5, 1.0, 1.0])

    cw = compute_class_weight({2: 1.5, 4: 0.5}, classes=classes, y=y)
    assert len(cw) == classes_len
    assert_array_almost_equal(cw, [1.5, 1.0, 0.5])


def test_compute_sample_weight():
    # Test (and demo) compute_sample_weight.
    # Test with balanced classes
    y = np.asarray([1, 1, 1, 2, 2, 2])
    sample_weight = compute_sample_weight("balanced", y)
    assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

    # Test with user-defined weights
    sample_weight = compute_sample_weight({1: 2, 2: 1}, y)
    assert_array_almost_equal(sample_weight, [2.0, 2.0, 2.0, 1.0, 1.0, 1.0])

    # Test with column vector of balanced classes
    y = np.asarray([[1], [1], [1], [2], [2], [2]])
    sample_weight = compute_sample_weight("balanced", y)
    assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

    # Test with unbalanced classes
    y = np.asarray([1, 1, 1, 2, 2, 2, 3])
    sample_weight = compute_sample_weight("balanced", y)
    expected_balanced = np.array(
        [0.7777, 0.7777, 0.7777, 0.7777, 0.7777, 0.7777, 2.3333]
    )
    assert_array_almost_equal(sample_weight, expected_balanced, decimal=4)

    # Test with `None` weights
    sample_weight = compute_sample_weight(None, y)
    assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

    # Test with multi-output of balanced classes
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    sample_weight = compute_sample_weight("balanced", y)
    assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

    # Test with multi-output with user-defined weights
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    sample_weight = compute_sample_weight([{1: 2, 2: 1}, {0: 1, 1: 2}], y)
    assert_array_almost_equal(sample_weight, [2.0, 2.0, 2.0, 2.0, 2.0, 2.0])

    # Test with multi-output of unbalanced classes
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [3, -1]])
    sample_weight = compute_sample_weight("balanced", y)
    assert_array_almost_equal(sample_weight, expected_balanced**2, decimal=3)


def test_compute_sample_weight_with_subsample():
    # Test compute_sample_weight with subsamples specified.
    # Test with balanced classes and all samples present
    y = np.asarray([1, 1, 1, 2, 2, 2])
    sample_weight = compute_sample_weight("balanced", y, indices=range(6))
    assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

    # Test with column vector of balanced classes and all samples present
    y = np.asarray([[1], [1], [1], [2], [2], [2]])
    sample_weight = compute_sample_weight("balanced", y, indices=range(6))
    assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

    # Test with a subsample
    y = np.asarray([1, 1, 1, 2, 2, 2])
    sample_weight = compute_sample_weight("balanced", y, indices=range(4))
    assert_array_almost_equal(sample_weight, [2.0 / 3, 2.0 / 3, 2.0 / 3, 2.0, 2.0, 2.0])

    # Test with a bootstrap subsample
    y = np.asarray([1, 1, 1, 2, 2, 2])
    sample_weight = compute_sample_weight("balanced", y, indices=[0, 1, 1, 2, 2, 3])
    expected_balanced = np.asarray([0.6, 0.6, 0.6, 3.0, 3.0, 3.0])
    assert_array_almost_equal(sample_weight, expected_balanced)

    # Test with a bootstrap subsample for multi-output
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
    sample_weight = compute_sample_weight("balanced", y, indices=[0, 1, 1, 2, 2, 3])
    assert_array_almost_equal(sample_weight, expected_balanced**2)

    # Test with a missing class
    y = np.asarray([1, 1, 1, 2, 2, 2, 3])
    sample_weight = compute_sample_weight("balanced", y, indices=range(6))
    assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0])

    # Test with a missing class for multi-output
    y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [2, 2]])
    sample_weight = compute_sample_weight("balanced", y, indices=range(6))
    assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0])


@pytest.mark.parametrize(
    "y_type, class_weight, indices, err_msg",
    [
        (
            "single-output",
            {1: 2, 2: 1},
            range(4),
            "The only valid class_weight for subsampling is 'balanced'.",
        ),
        (
            "multi-output",
            {1: 2, 2: 1},
            None,
            "For multi-output, class_weight should be a list of dicts, or the string",
        ),
        (
            "multi-output",
            [{1: 2, 2: 1}],
            None,
            r"Got 1 element\(s\) while having 2 outputs",
        ),
    ],
)
def test_compute_sample_weight_errors(y_type, class_weight, indices, err_msg):
    # Test that compute_sample_weight raises the expected errors.
    # Invalid preset string
    y_single_output = np.asarray([1, 1, 1, 2, 2, 2])
    y_multi_output = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])

    y = y_single_output if y_type == "single-output" else y_multi_output
    with pytest.raises(ValueError, match=err_msg):
        compute_sample_weight(class_weight, y, indices=indices)


def test_compute_sample_weight_more_than_32():
    # Non-regression smoke test for #12146
    y = np.arange(50)  # more than 32 distinct classes
    indices = np.arange(50)  # use subsampling
    weight = compute_sample_weight("balanced", y, indices=indices)
    assert_array_almost_equal(weight, np.ones(y.shape[0]))


def test_class_weight_does_not_contains_more_classes():
    """Check that class_weight can contain more labels than in y.

    Non-regression test for #22413
    """
    tree = DecisionTreeClassifier(class_weight={0: 1, 1: 10, 2: 20})

    # Does not raise
    tree.fit([[0, 0, 1], [1, 0, 1], [1, 2, 0]], [0, 0, 1])


@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_compute_sample_weight_sparse(csc_container):
    """Check that we can compute weight for sparse `y`."""
    y = csc_container(np.asarray([[0], [1], [1]]))
    sample_weight = compute_sample_weight("balanced", y)
    assert_allclose(sample_weight, [1.5, 0.75, 0.75])
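The "balanced" heuristic tested throughout this file assigns each class the weight n_samples / (n_classes * count(class)), so np.dot(cw, class_counts) always recovers n_samples. The first test's case, worked out explicitly:

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y = np.asarray([2, 2, 2, 3, 3, 4])  # class counts: 3, 2, 1
classes = np.unique(y)
cw = compute_class_weight("balanced", classes=classes, y=y)
# n_samples / (n_classes * count): 6/(3*3), 6/(3*2), 6/(3*1)
np.testing.assert_allclose(cw, [2 / 3, 1.0, 2.0])
assert np.dot(cw, [3, 2, 1]) == 6  # total sample effect preserved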
@@ -0,0 +1,234 @@
import numpy as np
import pytest

from sklearn.utils._cython_blas import (
    ColMajor,
    NoTrans,
    RowMajor,
    Trans,
    _asum_memview,
    _axpy_memview,
    _copy_memview,
    _dot_memview,
    _gemm_memview,
    _gemv_memview,
    _ger_memview,
    _nrm2_memview,
    _rot_memview,
    _rotg_memview,
    _scal_memview,
)
from sklearn.utils._testing import assert_allclose


def _numpy_to_cython(dtype):
    cython = pytest.importorskip("cython")
    if dtype == np.float32:
        return cython.float
    elif dtype == np.float64:
        return cython.double


RTOL = {np.float32: 1e-6, np.float64: 1e-12}
ORDER = {RowMajor: "C", ColMajor: "F"}


def _no_op(x):
    return x


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_dot(dtype):
    dot = _dot_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = rng.random_sample(10).astype(dtype, copy=False)

    expected = x.dot(y)
    actual = dot(x, y)

    assert_allclose(actual, expected, rtol=RTOL[dtype])


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_asum(dtype):
    asum = _asum_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)

    expected = np.abs(x).sum()
    actual = asum(x)

    assert_allclose(actual, expected, rtol=RTOL[dtype])


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_axpy(dtype):
    axpy = _axpy_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = rng.random_sample(10).astype(dtype, copy=False)
    alpha = 2.5

    expected = alpha * x + y
    axpy(alpha, x, y)

    assert_allclose(y, expected, rtol=RTOL[dtype])


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_nrm2(dtype):
    nrm2 = _nrm2_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)

    expected = np.linalg.norm(x)
    actual = nrm2(x)

    assert_allclose(actual, expected, rtol=RTOL[dtype])


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_copy(dtype):
    copy = _copy_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = np.empty_like(x)

    expected = x.copy()
    copy(x, y)

    assert_allclose(y, expected, rtol=RTOL[dtype])


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_scal(dtype):
    scal = _scal_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)
    alpha = 2.5

    expected = alpha * x
    scal(alpha, x)

    assert_allclose(x, expected, rtol=RTOL[dtype])


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_rotg(dtype):
    rotg = _rotg_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    a = dtype(rng.randn())
    b = dtype(rng.randn())
    c, s = 0.0, 0.0

    def expected_rotg(a, b):
        roe = a if abs(a) > abs(b) else b
        if a == 0 and b == 0:
            c, s, r, z = (1, 0, 0, 0)
        else:
            r = np.sqrt(a**2 + b**2) * (1 if roe >= 0 else -1)
            c, s = a / r, b / r
            z = s if roe == a else (1 if c == 0 else 1 / c)
        return r, z, c, s

    expected = expected_rotg(a, b)
    actual = rotg(a, b, c, s)

    assert_allclose(actual, expected, rtol=RTOL[dtype])


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_rot(dtype):
    rot = _rot_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = rng.random_sample(10).astype(dtype, copy=False)
    c = dtype(rng.randn())
    s = dtype(rng.randn())

    expected_x = c * x + s * y
    expected_y = c * y - s * x

    rot(x, y, c, s)

    assert_allclose(x, expected_x)
    assert_allclose(y, expected_y)


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize(
    "opA, transA", [(_no_op, NoTrans), (np.transpose, Trans)], ids=["NoTrans", "Trans"]
)
@pytest.mark.parametrize("order", [RowMajor, ColMajor], ids=["RowMajor", "ColMajor"])
def test_gemv(dtype, opA, transA, order):
    gemv = _gemv_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    A = np.asarray(
        opA(rng.random_sample((20, 10)).astype(dtype, copy=False)), order=ORDER[order]
    )
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = rng.random_sample(20).astype(dtype, copy=False)
    alpha, beta = 2.5, -0.5

    expected = alpha * opA(A).dot(x) + beta * y
    gemv(transA, alpha, A, x, beta, y)

    assert_allclose(y, expected, rtol=RTOL[dtype])


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("order", [RowMajor, ColMajor], ids=["RowMajor", "ColMajor"])
def test_ger(dtype, order):
    ger = _ger_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    x = rng.random_sample(10).astype(dtype, copy=False)
    y = rng.random_sample(20).astype(dtype, copy=False)
    A = np.asarray(
        rng.random_sample((10, 20)).astype(dtype, copy=False), order=ORDER[order]
    )
    alpha = 2.5

    expected = alpha * np.outer(x, y) + A
    ger(alpha, x, y, A)

    assert_allclose(A, expected, rtol=RTOL[dtype])


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize(
    "opB, transB", [(_no_op, NoTrans), (np.transpose, Trans)], ids=["NoTrans", "Trans"]
)
@pytest.mark.parametrize(
    "opA, transA", [(_no_op, NoTrans), (np.transpose, Trans)], ids=["NoTrans", "Trans"]
)
@pytest.mark.parametrize("order", [RowMajor, ColMajor], ids=["RowMajor", "ColMajor"])
def test_gemm(dtype, opA, transA, opB, transB, order):
    gemm = _gemm_memview[_numpy_to_cython(dtype)]

    rng = np.random.RandomState(0)
    A = np.asarray(
        opA(rng.random_sample((30, 10)).astype(dtype, copy=False)), order=ORDER[order]
    )
    B = np.asarray(
        opB(rng.random_sample((10, 20)).astype(dtype, copy=False)), order=ORDER[order]
    )
    C = np.asarray(
        rng.random_sample((30, 20)).astype(dtype, copy=False), order=ORDER[order]
    )
    alpha, beta = 2.5, -0.5

    expected = alpha * opA(A).dot(opB(B)) + beta * C
    gemm(transA, transB, alpha, A, B, beta, C)

    assert_allclose(C, expected, rtol=RTOL[dtype])
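Each wrapper above is a fused-type Cython function, so indexing with `_numpy_to_cython(dtype)` selects the float32 or float64 specialization. The plain-NumPy references the tests compare against, collected in one sketch (no Cython needed; the in-place semantics noted in comments are what the BLAS routines mutate):

import numpy as np

rng = np.random.RandomState(0)
x, y = rng.random_sample(10), rng.random_sample(10)
A, B = rng.random_sample((3, 4)), rng.random_sample((4, 2))
C = rng.random_sample((3, 2))
alpha, beta = 2.5, -0.5

ref_dot = x.dot(y)                      # dot
ref_asum = np.abs(x).sum()              # asum
ref_axpy = alpha * x + y                # axpy (overwrites y in place)
ref_nrm2 = np.linalg.norm(x)            # nrm2
ref_gemm = alpha * A.dot(B) + beta * C  # gemm (overwrites C in place)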
@@ -0,0 +1,22 @@
import pathlib

import pytest

import sklearn


def test_files_generated_by_templates_are_git_ignored():
    """Check the consistency of the files generated from template files."""
    gitignore_file = pathlib.Path(sklearn.__file__).parent.parent / ".gitignore"
    if not gitignore_file.exists():
        pytest.skip("Tests are not run from the source folder")

    base_dir = pathlib.Path(sklearn.__file__).parent
    ignored_files = gitignore_file.read_text().split("\n")
    ignored_files = [pathlib.Path(line) for line in ignored_files]

    for filename in base_dir.glob("**/*.tp"):
        filename = filename.relative_to(base_dir.parent)
        # From "path/to/template.p??.tp" to "path/to/template.p??"
        filename_wo_tempita_suffix = filename.with_suffix("")
        assert filename_wo_tempita_suffix in ignored_files
@@ -0,0 +1,88 @@
# Authors: Raghav RV <rvraghav93@gmail.com>
# License: BSD 3 clause


import pickle

import pytest

from sklearn.utils.deprecation import _is_deprecated, deprecated


@deprecated("qwerty")
class MockClass1:
    pass


class MockClass2:
    @deprecated("mockclass2_method")
    def method(self):
        pass

    @deprecated("n_features_ is deprecated")  # type: ignore
    @property
    def n_features_(self):
        """Number of input features."""
        return 10


class MockClass3:
    @deprecated()
    def __init__(self):
        pass


class MockClass4:
    pass


class MockClass5(MockClass1):
    """Inherits from a deprecated class but does not call super().__init__."""

    def __init__(self, a):
        self.a = a


@deprecated("a message")
class MockClass6:
    """A deprecated class that overrides __new__."""

    def __new__(cls, *args, **kwargs):
        assert len(args) > 0
        return super().__new__(cls)


@deprecated()
def mock_function():
    return 10


def test_deprecated():
    with pytest.warns(FutureWarning, match="qwerty"):
        MockClass1()
    with pytest.warns(FutureWarning, match="mockclass2_method"):
        MockClass2().method()
    with pytest.warns(FutureWarning, match="deprecated"):
        MockClass3()
    with pytest.warns(FutureWarning, match="qwerty"):
        MockClass5(42)
    with pytest.warns(FutureWarning, match="a message"):
        MockClass6(42)
    with pytest.warns(FutureWarning, match="deprecated"):
        val = mock_function()
    assert val == 10


def test_is_deprecated():
    # Test if the _is_deprecated helper identifies wrapping via deprecated
    # NOTE it works only for class methods and functions
    assert _is_deprecated(MockClass1.__new__)
    assert _is_deprecated(MockClass2().method)
    assert _is_deprecated(MockClass3.__init__)
    assert not _is_deprecated(MockClass4.__init__)
    assert _is_deprecated(MockClass5.__new__)
    assert _is_deprecated(mock_function)


def test_pickle():
    pickle.loads(pickle.dumps(mock_function))
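As the mocks above show, `deprecated` wraps classes (via `__new__`), methods, and functions so that calling them still works but emits a FutureWarning. A short usage sketch of the function case; `old_helper` and the message are made up for illustration:

import warnings
from sklearn.utils.deprecation import deprecated

@deprecated("use new_helper instead")
def old_helper():
    return 42

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    assert old_helper() == 42                    # still callable...
    assert caught[0].category is FutureWarning   # ...but warns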
@@ -0,0 +1,274 @@
import pickle

import numpy as np
import pytest
from numpy.testing import assert_array_equal

from sklearn.utils._encode import _check_unknown, _encode, _get_counts, _unique


@pytest.mark.parametrize(
    "values, expected",
    [
        (np.array([2, 1, 3, 1, 3], dtype="int64"), np.array([1, 2, 3], dtype="int64")),
        (
            np.array([2, 1, np.nan, 1, np.nan], dtype="float32"),
            np.array([1, 2, np.nan], dtype="float32"),
        ),
        (
            np.array(["b", "a", "c", "a", "c"], dtype=object),
            np.array(["a", "b", "c"], dtype=object),
        ),
        (
            np.array(["b", "a", None, "a", None], dtype=object),
            np.array(["a", "b", None], dtype=object),
        ),
        (np.array(["b", "a", "c", "a", "c"]), np.array(["a", "b", "c"])),
    ],
    ids=["int64", "float32-nan", "object", "object-None", "str"],
)
def test_encode_util(values, expected):
    uniques = _unique(values)
    assert_array_equal(uniques, expected)

    result, encoded = _unique(values, return_inverse=True)
    assert_array_equal(result, expected)
    assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))

    encoded = _encode(values, uniques=uniques)
    assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))

    result, counts = _unique(values, return_counts=True)
    assert_array_equal(result, expected)
    assert_array_equal(counts, np.array([2, 1, 2]))

    result, encoded, counts = _unique(values, return_inverse=True, return_counts=True)
    assert_array_equal(result, expected)
    assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
    assert_array_equal(counts, np.array([2, 1, 2]))


def test_encode_with_check_unknown():
    # test for the check_unknown parameter of _encode()
    uniques = np.array([1, 2, 3])
    values = np.array([1, 2, 3, 4])

    # Default is True, raise an error
    with pytest.raises(ValueError, match="y contains previously unseen labels"):
        _encode(values, uniques=uniques, check_unknown=True)

    # don't raise an error if False
    _encode(values, uniques=uniques, check_unknown=False)

    # parameter is ignored for object dtype
    uniques = np.array(["a", "b", "c"], dtype=object)
    values = np.array(["a", "b", "c", "d"], dtype=object)
    with pytest.raises(ValueError, match="y contains previously unseen labels"):
        _encode(values, uniques=uniques, check_unknown=False)


def _assert_check_unknown(values, uniques, expected_diff, expected_mask):
    diff = _check_unknown(values, uniques)
    assert_array_equal(diff, expected_diff)

    diff, valid_mask = _check_unknown(values, uniques, return_mask=True)
    assert_array_equal(diff, expected_diff)
    assert_array_equal(valid_mask, expected_mask)


@pytest.mark.parametrize(
    "values, uniques, expected_diff, expected_mask",
    [
        (np.array([1, 2, 3, 4]), np.array([1, 2, 3]), [4], [True, True, True, False]),
        (np.array([2, 1, 4, 5]), np.array([2, 5, 1]), [4], [True, True, False, True]),
        (np.array([2, 1, np.nan]), np.array([2, 5, 1]), [np.nan], [True, True, False]),
        (
            np.array([2, 1, 4, np.nan]),
            np.array([2, 5, 1, np.nan]),
            [4],
            [True, True, False, True],
        ),
        (
            np.array([2, 1, 4, np.nan]),
            np.array([2, 5, 1]),
            [4, np.nan],
            [True, True, False, False],
        ),
        (
            np.array([2, 1, 4, 5]),
            np.array([2, 5, 1, np.nan]),
            [4],
            [True, True, False, True],
        ),
        (
            np.array(["a", "b", "c", "d"], dtype=object),
            np.array(["a", "b", "c"], dtype=object),
            np.array(["d"], dtype=object),
            [True, True, True, False],
        ),
        (
            np.array(["d", "c", "a", "b"], dtype=object),
            np.array(["a", "c", "b"], dtype=object),
            np.array(["d"], dtype=object),
            [False, True, True, True],
        ),
        (
            np.array(["a", "b", "c", "d"]),
            np.array(["a", "b", "c"]),
            np.array(["d"]),
            [True, True, True, False],
        ),
        (
            np.array(["d", "c", "a", "b"]),
            np.array(["a", "c", "b"]),
            np.array(["d"]),
            [False, True, True, True],
        ),
    ],
)
def test_check_unknown(values, uniques, expected_diff, expected_mask):
    _assert_check_unknown(values, uniques, expected_diff, expected_mask)


@pytest.mark.parametrize("missing_value", [None, np.nan, float("nan")])
@pytest.mark.parametrize("pickle_uniques", [True, False])
def test_check_unknown_missing_values(missing_value, pickle_uniques):
    # check for check_unknown with missing values with object dtypes
    values = np.array(["d", "c", "a", "b", missing_value], dtype=object)
    uniques = np.array(["c", "a", "b", missing_value], dtype=object)
    if pickle_uniques:
        uniques = pickle.loads(pickle.dumps(uniques))

    expected_diff = ["d"]
    expected_mask = [False, True, True, True, True]
    _assert_check_unknown(values, uniques, expected_diff, expected_mask)

    values = np.array(["d", "c", "a", "b", missing_value], dtype=object)
    uniques = np.array(["c", "a", "b"], dtype=object)
    if pickle_uniques:
        uniques = pickle.loads(pickle.dumps(uniques))

    expected_diff = ["d", missing_value]
    expected_mask = [False, True, True, True, False]
    _assert_check_unknown(values, uniques, expected_diff, expected_mask)

    values = np.array(["a", missing_value], dtype=object)
    uniques = np.array(["a", "b", "z"], dtype=object)
    if pickle_uniques:
        uniques = pickle.loads(pickle.dumps(uniques))

    expected_diff = [missing_value]
    expected_mask = [True, False]
    _assert_check_unknown(values, uniques, expected_diff, expected_mask)


@pytest.mark.parametrize("missing_value", [np.nan, None, float("nan")])
@pytest.mark.parametrize("pickle_uniques", [True, False])
def test_unique_util_missing_values_objects(missing_value, pickle_uniques):
    # check for _unique and _encode with missing values with object dtypes
    values = np.array(["a", "c", "c", missing_value, "b"], dtype=object)
    expected_uniques = np.array(["a", "b", "c", missing_value], dtype=object)

    uniques = _unique(values)

    if missing_value is None:
        assert_array_equal(uniques, expected_uniques)
    else:  # missing_value == np.nan
        assert_array_equal(uniques[:-1], expected_uniques[:-1])
        assert np.isnan(uniques[-1])

    if pickle_uniques:
        uniques = pickle.loads(pickle.dumps(uniques))

    encoded = _encode(values, uniques=uniques)
    assert_array_equal(encoded, np.array([0, 2, 2, 3, 1]))


def test_unique_util_missing_values_numeric():
    # Check missing values in numerical values
    values = np.array([3, 1, np.nan, 5, 3, np.nan], dtype=float)
    expected_uniques = np.array([1, 3, 5, np.nan], dtype=float)
    expected_inverse = np.array([1, 0, 3, 2, 1, 3])

    uniques = _unique(values)
    assert_array_equal(uniques, expected_uniques)

    uniques, inverse = _unique(values, return_inverse=True)
    assert_array_equal(uniques, expected_uniques)
    assert_array_equal(inverse, expected_inverse)

    encoded = _encode(values, uniques=uniques)
    assert_array_equal(encoded, expected_inverse)


def test_unique_util_with_all_missing_values():
    # test for all types of missing values for object dtype
    values = np.array([np.nan, "a", "c", "c", None, float("nan"), None], dtype=object)

    uniques = _unique(values)
    assert_array_equal(uniques[:-1], ["a", "c", None])
    # last value is nan
    assert np.isnan(uniques[-1])
|
||||
|
||||
expected_inverse = [3, 0, 1, 1, 2, 3, 2]
|
||||
_, inverse = _unique(values, return_inverse=True)
|
||||
assert_array_equal(inverse, expected_inverse)
|
||||
|
||||
|
||||
def test_check_unknown_with_both_missing_values():
|
||||
# test for both types of missing values for object dtype
|
||||
values = np.array([np.nan, "a", "c", "c", None, np.nan, None], dtype=object)
|
||||
|
||||
diff = _check_unknown(values, known_values=np.array(["a", "c"], dtype=object))
|
||||
assert diff[0] is None
|
||||
assert np.isnan(diff[1])
|
||||
|
||||
diff, valid_mask = _check_unknown(
|
||||
values, known_values=np.array(["a", "c"], dtype=object), return_mask=True
|
||||
)
|
||||
|
||||
assert diff[0] is None
|
||||
assert np.isnan(diff[1])
|
||||
assert_array_equal(valid_mask, [False, True, True, True, False, False, False])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"values, uniques, expected_counts",
|
||||
[
|
||||
(np.array([1] * 10 + [2] * 4 + [3] * 15), np.array([1, 2, 3]), [10, 4, 15]),
|
||||
(
|
||||
np.array([1] * 10 + [2] * 4 + [3] * 15),
|
||||
np.array([1, 2, 3, 5]),
|
||||
[10, 4, 15, 0],
|
||||
),
|
||||
(
|
||||
np.array([np.nan] * 10 + [2] * 4 + [3] * 15),
|
||||
np.array([2, 3, np.nan]),
|
||||
[4, 15, 10],
|
||||
),
|
||||
(
|
||||
np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
|
||||
["a", "b", "c"],
|
||||
[16, 4, 20],
|
||||
),
|
||||
(
|
||||
np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
|
||||
["c", "b", "a"],
|
||||
[20, 4, 16],
|
||||
),
|
||||
(
|
||||
np.array([np.nan] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
|
||||
["c", np.nan, "a"],
|
||||
[20, 4, 16],
|
||||
),
|
||||
(
|
||||
np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
|
||||
["a", "b", "c", "e"],
|
||||
[16, 4, 20, 0],
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_get_counts(values, uniques, expected_counts):
|
||||
counts = _get_counts(values, uniques)
|
||||
assert_array_equal(counts, expected_counts)
|
||||
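For orientation, the behaviour exercised by these tests can be approximated with plain NumPy for numeric inputs. This is a hedged sketch of the semantics only, not scikit-learn's actual _unique/_encode implementation (which additionally handles object dtype and missing values):

import numpy as np

values = np.array([2, 1, 3, 1, 3])

# np.unique returns sorted uniques; return_inverse yields the encoding
uniques, encoded = np.unique(values, return_inverse=True)
assert list(uniques) == [1, 2, 3]
assert list(encoded) == [1, 0, 2, 0, 2]

# encoding against a known, sorted uniques array is a searchsorted lookup
assert list(np.searchsorted(uniques, values)) == [1, 0, 2, 0, 2]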
File diff suppressed because it is too large
@@ -0,0 +1,566 @@
import html
import locale
import re
import types
from contextlib import closing
from io import StringIO
from unittest.mock import patch

import pytest

from sklearn import config_context
from sklearn.base import BaseEstimator
from sklearn.cluster import AgglomerativeClustering, Birch
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import StackingClassifier, StackingRegressor, VotingClassifier
from sklearn.feature_selection import SelectPercentile
from sklearn.gaussian_process.kernels import ExpSineSquared
from sklearn.impute import SimpleImputer
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multiclass import OneVsOneClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils._estimator_html_repr import (
    _get_css_style,
    _get_visual_block,
    _HTMLDocumentationLinkMixin,
    _write_label_html,
    estimator_html_repr,
)
from sklearn.utils.fixes import parse_version

@pytest.mark.parametrize("checked", [True, False])
|
||||
def test_write_label_html(checked):
|
||||
# Test checking logic and labeling
|
||||
name = "LogisticRegression"
|
||||
tool_tip = "hello-world"
|
||||
|
||||
with closing(StringIO()) as out:
|
||||
_write_label_html(out, name, tool_tip, checked=checked)
|
||||
html_label = out.getvalue()
|
||||
|
||||
p = (
|
||||
r'<label for="sk-estimator-id-[0-9]*"'
|
||||
r' class="sk-toggleable__label (fitted)? sk-toggleable__label-arrow ">'
|
||||
r"LogisticRegression"
|
||||
)
|
||||
re_compiled = re.compile(p)
|
||||
assert re_compiled.search(html_label)
|
||||
|
||||
assert html_label.startswith('<div class="sk-label-container">')
|
||||
assert "<pre>hello-world</pre>" in html_label
|
||||
if checked:
|
||||
assert "checked>" in html_label
|
||||
|
||||
|
||||
@pytest.mark.parametrize("est", ["passthrough", "drop", None])
|
||||
def test_get_visual_block_single_str_none(est):
|
||||
# Test estimators that are represented by strings
|
||||
est_html_info = _get_visual_block(est)
|
||||
assert est_html_info.kind == "single"
|
||||
assert est_html_info.estimators == est
|
||||
assert est_html_info.names == str(est)
|
||||
assert est_html_info.name_details == str(est)
|
||||
|
||||
|
||||
def test_get_visual_block_single_estimator():
|
||||
est = LogisticRegression(C=10.0)
|
||||
est_html_info = _get_visual_block(est)
|
||||
assert est_html_info.kind == "single"
|
||||
assert est_html_info.estimators == est
|
||||
assert est_html_info.names == est.__class__.__name__
|
||||
assert est_html_info.name_details == str(est)
|
||||
|
||||
|
||||
def test_get_visual_block_pipeline():
|
||||
pipe = Pipeline(
|
||||
[
|
||||
("imputer", SimpleImputer()),
|
||||
("do_nothing", "passthrough"),
|
||||
("do_nothing_more", None),
|
||||
("classifier", LogisticRegression()),
|
||||
]
|
||||
)
|
||||
est_html_info = _get_visual_block(pipe)
|
||||
assert est_html_info.kind == "serial"
|
||||
assert est_html_info.estimators == tuple(step[1] for step in pipe.steps)
|
||||
assert est_html_info.names == [
|
||||
"imputer: SimpleImputer",
|
||||
"do_nothing: passthrough",
|
||||
"do_nothing_more: passthrough",
|
||||
"classifier: LogisticRegression",
|
||||
]
|
||||
assert est_html_info.name_details == [str(est) for _, est in pipe.steps]
|
||||
|
||||
|
||||
def test_get_visual_block_feature_union():
|
||||
f_union = FeatureUnion([("pca", PCA()), ("svd", TruncatedSVD())])
|
||||
est_html_info = _get_visual_block(f_union)
|
||||
assert est_html_info.kind == "parallel"
|
||||
assert est_html_info.names == ("pca", "svd")
|
||||
assert est_html_info.estimators == tuple(
|
||||
trans[1] for trans in f_union.transformer_list
|
||||
)
|
||||
assert est_html_info.name_details == (None, None)
|
||||
|
||||
|
||||
def test_get_visual_block_voting():
|
||||
clf = VotingClassifier(
|
||||
[("log_reg", LogisticRegression()), ("mlp", MLPClassifier())]
|
||||
)
|
||||
est_html_info = _get_visual_block(clf)
|
||||
assert est_html_info.kind == "parallel"
|
||||
assert est_html_info.estimators == tuple(trans[1] for trans in clf.estimators)
|
||||
assert est_html_info.names == ("log_reg", "mlp")
|
||||
assert est_html_info.name_details == (None, None)
|
||||
|
||||
|
||||
def test_get_visual_block_column_transformer():
|
||||
ct = ColumnTransformer(
|
||||
[("pca", PCA(), ["num1", "num2"]), ("svd", TruncatedSVD, [0, 3])]
|
||||
)
|
||||
est_html_info = _get_visual_block(ct)
|
||||
assert est_html_info.kind == "parallel"
|
||||
assert est_html_info.estimators == tuple(trans[1] for trans in ct.transformers)
|
||||
assert est_html_info.names == ("pca", "svd")
|
||||
assert est_html_info.name_details == (["num1", "num2"], [0, 3])
|
||||
|
||||
|
||||
def test_estimator_html_repr_pipeline():
|
||||
num_trans = Pipeline(
|
||||
steps=[("pass", "passthrough"), ("imputer", SimpleImputer(strategy="median"))]
|
||||
)
|
||||
|
||||
cat_trans = Pipeline(
|
||||
steps=[
|
||||
("imputer", SimpleImputer(strategy="constant", missing_values="empty")),
|
||||
("one-hot", OneHotEncoder(drop="first")),
|
||||
]
|
||||
)
|
||||
|
||||
preprocess = ColumnTransformer(
|
||||
[
|
||||
("num", num_trans, ["a", "b", "c", "d", "e"]),
|
||||
("cat", cat_trans, [0, 1, 2, 3]),
|
||||
]
|
||||
)
|
||||
|
||||
feat_u = FeatureUnion(
|
||||
[
|
||||
("pca", PCA(n_components=1)),
|
||||
(
|
||||
"tsvd",
|
||||
Pipeline(
|
||||
[
|
||||
("first", TruncatedSVD(n_components=3)),
|
||||
("select", SelectPercentile()),
|
||||
]
|
||||
),
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
clf = VotingClassifier(
|
||||
[
|
||||
("lr", LogisticRegression(solver="lbfgs", random_state=1)),
|
||||
("mlp", MLPClassifier(alpha=0.001)),
|
||||
]
|
||||
)
|
||||
|
||||
pipe = Pipeline(
|
||||
[("preprocessor", preprocess), ("feat_u", feat_u), ("classifier", clf)]
|
||||
)
|
||||
html_output = estimator_html_repr(pipe)
|
||||
|
||||
# top level estimators show estimator with changes
|
||||
assert html.escape(str(pipe)) in html_output
|
||||
for _, est in pipe.steps:
|
||||
assert (
|
||||
'<div class="sk-toggleable__content "><pre>' + html.escape(str(est))
|
||||
) in html_output
|
||||
|
||||
# low level estimators do not show changes
|
||||
with config_context(print_changed_only=True):
|
||||
assert html.escape(str(num_trans["pass"])) in html_output
|
||||
assert "passthrough</label>" in html_output
|
||||
assert html.escape(str(num_trans["imputer"])) in html_output
|
||||
|
||||
for _, _, cols in preprocess.transformers:
|
||||
assert f"<pre>{html.escape(str(cols))}</pre>" in html_output
|
||||
|
||||
# feature union
|
||||
for name, _ in feat_u.transformer_list:
|
||||
assert f"<label>{html.escape(name)}</label>" in html_output
|
||||
|
||||
pca = feat_u.transformer_list[0][1]
|
||||
assert f"<pre>{html.escape(str(pca))}</pre>" in html_output
|
||||
|
||||
tsvd = feat_u.transformer_list[1][1]
|
||||
first = tsvd["first"]
|
||||
select = tsvd["select"]
|
||||
assert f"<pre>{html.escape(str(first))}</pre>" in html_output
|
||||
assert f"<pre>{html.escape(str(select))}</pre>" in html_output
|
||||
|
||||
# voting classifier
|
||||
for name, est in clf.estimators:
|
||||
assert f"<label>{html.escape(name)}</label>" in html_output
|
||||
assert f"<pre>{html.escape(str(est))}</pre>" in html_output
|
||||
|
||||
# verify that prefers-color-scheme is implemented
|
||||
assert "prefers-color-scheme" in html_output
|
||||
|
||||
|
||||
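A minimal usage sketch of the function under test: estimator_html_repr returns the full HTML document as a string (the same markup that backs the notebook repr), so it can be inspected or written to disk:

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils._estimator_html_repr import estimator_html_repr

pipe = Pipeline([("scale", StandardScaler()), ("clf", LogisticRegression())])
html_output = estimator_html_repr(pipe)

# the diagram is self-contained HTML + CSS, so it can be saved and opened in a browser
with open("pipeline.html", "w", encoding="utf-8") as f:
    f.write(html_output)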
@pytest.mark.parametrize("final_estimator", [None, LinearSVC()])
|
||||
def test_stacking_classifier(final_estimator):
|
||||
estimators = [
|
||||
("mlp", MLPClassifier(alpha=0.001)),
|
||||
("tree", DecisionTreeClassifier()),
|
||||
]
|
||||
clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator)
|
||||
|
||||
html_output = estimator_html_repr(clf)
|
||||
|
||||
assert html.escape(str(clf)) in html_output
|
||||
# If final_estimator's default changes from LogisticRegression
|
||||
# this should be updated
|
||||
if final_estimator is None:
|
||||
assert "LogisticRegression(" in html_output
|
||||
else:
|
||||
assert final_estimator.__class__.__name__ in html_output
|
||||
|
||||
|
||||
@pytest.mark.parametrize("final_estimator", [None, LinearSVR()])
|
||||
def test_stacking_regressor(final_estimator):
|
||||
reg = StackingRegressor(
|
||||
estimators=[("svr", LinearSVR())], final_estimator=final_estimator
|
||||
)
|
||||
html_output = estimator_html_repr(reg)
|
||||
|
||||
assert html.escape(str(reg.estimators[0][0])) in html_output
|
||||
p = (
|
||||
r'<label for="sk-estimator-id-[0-9]*"'
|
||||
r' class="sk-toggleable__label (fitted)? sk-toggleable__label-arrow ">'
|
||||
r" LinearSVR"
|
||||
)
|
||||
re_compiled = re.compile(p)
|
||||
assert re_compiled.search(html_output)
|
||||
|
||||
if final_estimator is None:
|
||||
p = (
|
||||
r'<label for="sk-estimator-id-[0-9]*"'
|
||||
r' class="sk-toggleable__label (fitted)? sk-toggleable__label-arrow ">'
|
||||
r" RidgeCV"
|
||||
)
|
||||
re_compiled = re.compile(p)
|
||||
assert re_compiled.search(html_output)
|
||||
else:
|
||||
assert html.escape(final_estimator.__class__.__name__) in html_output
|
||||
|
||||
|
||||
def test_birch_duck_typing_meta():
|
||||
# Test duck typing meta estimators with Birch
|
||||
birch = Birch(n_clusters=AgglomerativeClustering(n_clusters=3))
|
||||
html_output = estimator_html_repr(birch)
|
||||
|
||||
# inner estimators do not show changes
|
||||
with config_context(print_changed_only=True):
|
||||
assert f"<pre>{html.escape(str(birch.n_clusters))}" in html_output
|
||||
assert "AgglomerativeClustering</label>" in html_output
|
||||
|
||||
# outer estimator contains all changes
|
||||
assert f"<pre>{html.escape(str(birch))}" in html_output
|
||||
|
||||
|
||||
def test_ovo_classifier_duck_typing_meta():
|
||||
# Test duck typing metaestimators with OVO
|
||||
ovo = OneVsOneClassifier(LinearSVC(penalty="l1"))
|
||||
html_output = estimator_html_repr(ovo)
|
||||
|
||||
# inner estimators do not show changes
|
||||
with config_context(print_changed_only=True):
|
||||
assert f"<pre>{html.escape(str(ovo.estimator))}" in html_output
|
||||
# regex to match the start of the tag
|
||||
p = (
|
||||
r'<label for="sk-estimator-id-[0-9]*" '
|
||||
r'class="sk-toggleable__label sk-toggleable__label-arrow "> LinearSVC'
|
||||
)
|
||||
re_compiled = re.compile(p)
|
||||
assert re_compiled.search(html_output)
|
||||
|
||||
# outer estimator
|
||||
assert f"<pre>{html.escape(str(ovo))}" in html_output
|
||||
|
||||
|
||||
def test_duck_typing_nested_estimator():
|
||||
# Test duck typing metaestimators with random search
|
||||
kernel_ridge = KernelRidge(kernel=ExpSineSquared())
|
||||
param_distributions = {"alpha": [1, 2]}
|
||||
|
||||
kernel_ridge_tuned = RandomizedSearchCV(
|
||||
kernel_ridge,
|
||||
param_distributions=param_distributions,
|
||||
)
|
||||
html_output = estimator_html_repr(kernel_ridge_tuned)
|
||||
assert "estimator: KernelRidge</label>" in html_output
|
||||
|
||||
|
||||
@pytest.mark.parametrize("print_changed_only", [True, False])
|
||||
def test_one_estimator_print_change_only(print_changed_only):
|
||||
pca = PCA(n_components=10)
|
||||
|
||||
with config_context(print_changed_only=print_changed_only):
|
||||
pca_repr = html.escape(str(pca))
|
||||
html_output = estimator_html_repr(pca)
|
||||
assert pca_repr in html_output
|
||||
|
||||
|
||||
def test_fallback_exists():
|
||||
"""Check that repr fallback is in the HTML."""
|
||||
pca = PCA(n_components=10)
|
||||
html_output = estimator_html_repr(pca)
|
||||
|
||||
assert (
|
||||
f'<div class="sk-text-repr-fallback"><pre>{html.escape(str(pca))}'
|
||||
in html_output
|
||||
)
|
||||
|
||||
|
||||
def test_show_arrow_pipeline():
|
||||
"""Show arrow in pipeline for top level in pipeline"""
|
||||
pipe = Pipeline([("scale", StandardScaler()), ("log_Reg", LogisticRegression())])
|
||||
|
||||
html_output = estimator_html_repr(pipe)
|
||||
assert (
|
||||
'class="sk-toggleable__label sk-toggleable__label-arrow "> Pipeline'
|
||||
in html_output
|
||||
)
|
||||
|
||||
|
||||
def test_invalid_parameters_in_stacking():
|
||||
"""Invalidate stacking configuration uses default repr.
|
||||
|
||||
Non-regression test for #24009.
|
||||
"""
|
||||
stacker = StackingClassifier(estimators=[])
|
||||
|
||||
html_output = estimator_html_repr(stacker)
|
||||
assert html.escape(str(stacker)) in html_output
|
||||
|
||||
|
||||
def test_estimator_get_params_return_cls():
|
||||
"""Check HTML repr works where a value in get_params is a class."""
|
||||
|
||||
class MyEstimator:
|
||||
def get_params(self, deep=False):
|
||||
return {"inner_cls": LogisticRegression}
|
||||
|
||||
est = MyEstimator()
|
||||
assert "MyEstimator" in estimator_html_repr(est)
|
||||
|
||||
|
||||
def test_estimator_html_repr_unfitted_vs_fitted():
|
||||
"""Check that we have the information that the estimator is fitted or not in the
|
||||
HTML representation.
|
||||
"""
|
||||
|
||||
class MyEstimator(BaseEstimator):
|
||||
def fit(self, X, y):
|
||||
self.fitted_ = True
|
||||
return self
|
||||
|
||||
X, y = load_iris(return_X_y=True)
|
||||
estimator = MyEstimator()
|
||||
assert "<span>Not fitted</span>" in estimator_html_repr(estimator)
|
||||
estimator.fit(X, y)
|
||||
assert "<span>Fitted</span>" in estimator_html_repr(estimator)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "estimator",
    [
        LogisticRegression(),
        make_pipeline(StandardScaler(), LogisticRegression()),
        make_pipeline(
            make_column_transformer((StandardScaler(), slice(0, 3))),
            LogisticRegression(),
        ),
    ],
)
def test_estimator_html_repr_fitted_icon(estimator):
    """Check that we are showing the fitted status icon only once."""
    pattern = '<span class="sk-estimator-doc-link ">i<span>Not fitted</span></span>'
    assert estimator_html_repr(estimator).count(pattern) == 1
    X, y = load_iris(return_X_y=True)
    estimator.fit(X, y)
    pattern = '<span class="sk-estimator-doc-link fitted">i<span>Fitted</span></span>'
    assert estimator_html_repr(estimator).count(pattern) == 1


@pytest.mark.parametrize("mock_version", ["1.3.0.dev0", "1.3.0"])
def test_html_documentation_link_mixin_sklearn(mock_version):
    """Check the behaviour of the `_HTMLDocumentationLinkMixin` class for the
    scikit-learn default.
    """

    # mock the `__version__` where the mixin is located
    with patch("sklearn.utils._estimator_html_repr.__version__", mock_version):
        mixin = _HTMLDocumentationLinkMixin()

        assert mixin._doc_link_module == "sklearn"
        sklearn_version = parse_version(mock_version)
        # we need to parse the version manually to be sure that this test is passing in
        # other branches than `main` (that is "dev").
        if sklearn_version.dev is None:
            version = f"{sklearn_version.major}.{sklearn_version.minor}"
        else:
            version = "dev"
        assert (
            mixin._doc_link_template
            == f"https://scikit-learn.org/{version}/modules/generated/"
            "{estimator_module}.{estimator_name}.html"
        )
        assert (
            mixin._get_doc_link()
            == f"https://scikit-learn.org/{version}/modules/generated/"
            "sklearn.utils._HTMLDocumentationLinkMixin.html"
        )


@pytest.mark.parametrize(
    "module_path,expected_module",
    [
        ("prefix.mymodule", "prefix.mymodule"),
        ("prefix._mymodule", "prefix"),
        ("prefix.mypackage._mymodule", "prefix.mypackage"),
        ("prefix.mypackage._mymodule.submodule", "prefix.mypackage"),
        ("prefix.mypackage.mymodule.submodule", "prefix.mypackage.mymodule.submodule"),
    ],
)
def test_html_documentation_link_mixin_get_doc_link_instance(
    module_path, expected_module
):
    """Check the behaviour of `_get_doc_link` with various parameters."""

    class FooBar(_HTMLDocumentationLinkMixin):
        pass

    FooBar.__module__ = module_path
    est = FooBar()
    # setting `_doc_link_module` and `_doc_link_template` lets the mixin infer
    # a module and name for the estimator
    est._doc_link_module = "prefix"
    est._doc_link_template = (
        "https://website.com/{estimator_module}.{estimator_name}.html"
    )
    assert est._get_doc_link() == f"https://website.com/{expected_module}.FooBar.html"


@pytest.mark.parametrize(
    "module_path,expected_module",
    [
        ("prefix.mymodule", "prefix.mymodule"),
        ("prefix._mymodule", "prefix"),
        ("prefix.mypackage._mymodule", "prefix.mypackage"),
        ("prefix.mypackage._mymodule.submodule", "prefix.mypackage"),
        ("prefix.mypackage.mymodule.submodule", "prefix.mypackage.mymodule.submodule"),
    ],
)
def test_html_documentation_link_mixin_get_doc_link_class(module_path, expected_module):
    """Check the behaviour of `_get_doc_link` when `_doc_link_module` and
    `_doc_link_template` are defined at the class level and not at the instance
    level."""

    class FooBar(_HTMLDocumentationLinkMixin):
        _doc_link_module = "prefix"
        _doc_link_template = (
            "https://website.com/{estimator_module}.{estimator_name}.html"
        )

    FooBar.__module__ = module_path
    est = FooBar()
    assert est._get_doc_link() == f"https://website.com/{expected_module}.FooBar.html"


def test_html_documentation_link_mixin_get_doc_link_out_of_library():
    """Check the behaviour of `_get_doc_link` for an estimator that is not part
    of the library."""
    mixin = _HTMLDocumentationLinkMixin()

    # if the `_doc_link_module` does not refer to the root module of the estimator
    # (here the mixin), then we should return an empty string.
    mixin._doc_link_module = "xxx"
    assert mixin._get_doc_link() == ""


def test_html_documentation_link_mixin_doc_link_url_param_generator_instance():
    mixin = _HTMLDocumentationLinkMixin()
    # we can bypass the generation by providing our own callable
    mixin._doc_link_template = (
        "https://website.com/{my_own_variable}.{another_variable}.html"
    )

    def url_param_generator(estimator):
        return {
            "my_own_variable": "value_1",
            "another_variable": "value_2",
        }

    mixin._doc_link_url_param_generator = types.MethodType(url_param_generator, mixin)

    assert mixin._get_doc_link() == "https://website.com/value_1.value_2.html"


def test_html_documentation_link_mixin_doc_link_url_param_generator_class():
    # we can bypass the generation by providing our own callable

    def url_param_generator(estimator):
        return {
            "my_own_variable": "value_1",
            "another_variable": "value_2",
        }

    class FooBar(_HTMLDocumentationLinkMixin):
        _doc_link_template = (
            "https://website.com/{my_own_variable}.{another_variable}.html"
        )
        _doc_link_url_param_generator = url_param_generator

    estimator = FooBar()
    assert estimator._get_doc_link() == "https://website.com/value_1.value_2.html"


@pytest.fixture
def set_non_utf8_locale():
    """Pytest fixture to set a non-UTF-8 locale during the test.

    The locale is reset to the original one after the test has run.
    """
    try:
        locale.setlocale(locale.LC_CTYPE, "C")
    except locale.Error:
        pytest.skip("'C' locale is not available on this OS")

    yield

    # Reset the locale to the original one. Python calls setlocale(LC_CTYPE, "")
    # at startup according to
    # https://docs.python.org/3/library/locale.html#background-details-hints-tips-and-caveats.
    # This assumes that no other locale changes have been made. For some reason,
    # on some platforms, trying to restore the locale with something like
    # locale.setlocale(locale.LC_CTYPE, locale.getlocale()) raises a
    # locale.Error: unsupported locale setting
    locale.setlocale(locale.LC_CTYPE, "")


def test_non_utf8_locale(set_non_utf8_locale):
    """Check that UTF-8 encoding is used when reading the CSS file.

    Non-regression test for https://github.com/scikit-learn/scikit-learn/issues/27725
    """
    _get_css_style()
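Beyond the scikit-learn defaults tested above, the mixin lets third-party estimators point the HTML repr at their own documentation by overriding the class attributes. A minimal sketch, with hypothetical mylib names:

from sklearn.base import BaseEstimator
from sklearn.utils._estimator_html_repr import _HTMLDocumentationLinkMixin


class MyEstimator(_HTMLDocumentationLinkMixin, BaseEstimator):
    # both attributes are read by _get_doc_link()
    _doc_link_module = "mylib"  # hypothetical root package
    _doc_link_template = (
        "https://mylib.example.org/{estimator_module}.{estimator_name}.html"
    )


MyEstimator.__module__ = "mylib.estimators"  # pretend the class lives in mylib
print(MyEstimator()._get_doc_link())
# https://mylib.example.org/mylib.estimators.MyEstimator.html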
File diff suppressed because it is too large
@@ -0,0 +1,47 @@
"""Test fast_dict."""

import numpy as np
from numpy.testing import assert_allclose, assert_array_equal

from sklearn.utils._fast_dict import IntFloatDict, argmin


def test_int_float_dict():
    rng = np.random.RandomState(0)
    keys = np.unique(rng.randint(100, size=10).astype(np.intp))
    values = rng.rand(len(keys))

    d = IntFloatDict(keys, values)
    for key, value in zip(keys, values):
        assert d[key] == value
    assert len(d) == len(keys)

    d.append(120, 3.0)
    assert d[120] == 3.0
    assert len(d) == len(keys) + 1
    for i in range(2000):
        d.append(i + 1000, 4.0)
    assert d[1100] == 4.0


def test_int_float_dict_argmin():
    # Test the argmin implementation on the IntFloatDict
    keys = np.arange(100, dtype=np.intp)
    values = np.arange(100, dtype=np.float64)
    d = IntFloatDict(keys, values)
    assert argmin(d) == (0, 0)


def test_to_arrays():
    # Test that an IntFloatDict is converted into arrays
    # of keys and values correctly
    keys_in = np.array([1, 2, 3], dtype=np.intp)
    values_in = np.array([4, 5, 6], dtype=np.float64)

    d = IntFloatDict(keys_in, values_in)
    keys_out, values_out = d.to_arrays()

    assert keys_out.dtype == keys_in.dtype
    assert values_in.dtype == values_out.dtype
    assert_array_equal(keys_out, keys_in)
    assert_allclose(values_out, values_in)
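For reference, the tests pin down argmin as returning the (key, value) pair of the smallest value in the dict; a hedged pure-Python equivalent of that behaviour:

import numpy as np
from sklearn.utils._fast_dict import IntFloatDict, argmin

keys = np.array([10, 20, 30], dtype=np.intp)
values = np.array([0.5, 0.1, 0.9], dtype=np.float64)
d = IntFloatDict(keys, values)

# argmin scans the dict and returns the key holding the minimal value
assert argmin(d) == (20, 0.1)

# the same result with a plain dict
k, v = min(dict(zip(keys, values)).items(), key=lambda kv: kv[1])
assert (k, v) == (20, 0.1)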
@@ -0,0 +1,162 @@
# Authors: Gael Varoquaux <gael.varoquaux@normalesup.org>
#          Justin Vincent
#          Lars Buitinck
# License: BSD 3 clause

import numpy as np
import pytest

from sklearn.utils._testing import assert_array_equal
from sklearn.utils.fixes import _object_dtype_isnan, _smallest_admissible_index_dtype


@pytest.mark.parametrize("dtype, val", ([object, 1], [object, "a"], [float, 1]))
def test_object_dtype_isnan(dtype, val):
    X = np.array([[val, np.nan], [np.nan, val]], dtype=dtype)

    expected_mask = np.array([[False, True], [True, False]])

    mask = _object_dtype_isnan(X)

    assert_array_equal(mask, expected_mask)


@pytest.mark.parametrize(
    "params, expected_dtype",
    [
        ({}, np.int32),  # default behaviour
        ({"maxval": np.iinfo(np.int32).max}, np.int32),
        ({"maxval": np.iinfo(np.int32).max + 1}, np.int64),
    ],
)
def test_smallest_admissible_index_dtype_max_val(params, expected_dtype):
    """Check the behaviour of `_smallest_admissible_index_dtype` depending only
    on the `maxval` parameter.
    """
    assert _smallest_admissible_index_dtype(**params) == expected_dtype


@pytest.mark.parametrize(
    "params, expected_dtype",
    [
        # The arrays' dtype is int64 and thus should not be downcast to int32
        # without checking the contents or providing maxval.
        ({"arrays": np.array([1, 2], dtype=np.int64)}, np.int64),
        # One of the arrays is int64 and should not be downcast to int32
        # for the same reasons.
        (
            {
                "arrays": (
                    np.array([1, 2], dtype=np.int32),
                    np.array([1, 2], dtype=np.int64),
                )
            },
            np.int64,
        ),
        # Both arrays are already int32: we can just keep this dtype.
        (
            {
                "arrays": (
                    np.array([1, 2], dtype=np.int32),
                    np.array([1, 2], dtype=np.int32),
                )
            },
            np.int32,
        ),
        # Arrays should be upcast to at least int32 precision.
        ({"arrays": np.array([1, 2], dtype=np.int8)}, np.int32),
        # Check that `maxval` takes precedence over the arrays and thus upcasts
        # to int64.
        (
            {
                "arrays": np.array([1, 2], dtype=np.int32),
                "maxval": np.iinfo(np.int32).max + 1,
            },
            np.int64,
        ),
    ],
)
def test_smallest_admissible_index_dtype_without_checking_contents(
    params, expected_dtype
):
    """Check the behaviour of `_smallest_admissible_index_dtype` using the dtypes
    of the passed arrays but without checking their contents.
    """
    assert _smallest_admissible_index_dtype(**params) == expected_dtype


@pytest.mark.parametrize(
    "params, expected_dtype",
    [
        # empty arrays should always be converted to int32 indices
        (
            {
                "arrays": (np.array([], dtype=np.int64), np.array([], dtype=np.int64)),
                "check_contents": True,
            },
            np.int32,
        ),
        # arrays respecting np.iinfo(np.int32).min < x < np.iinfo(np.int32).max should
        # be converted to int32,
        (
            {"arrays": np.array([1], dtype=np.int64), "check_contents": True},
            np.int32,
        ),
        # otherwise, it should be converted to int64. We need to create a uint32
        # array to accommodate a value > np.iinfo(np.int32).max
        (
            {
                "arrays": np.array([np.iinfo(np.int32).max + 1], dtype=np.uint32),
                "check_contents": True,
            },
            np.int64,
        ),
        # maxval should take precedence over the arrays' contents and thus upcast
        # to int64.
        (
            {
                "arrays": np.array([1], dtype=np.int32),
                "check_contents": True,
                "maxval": np.iinfo(np.int32).max + 1,
            },
            np.int64,
        ),
        # when maxval is small, but check_contents is True and the contents
        # require np.int64, we still require np.int64 indexing in the end.
        (
            {
                "arrays": np.array([np.iinfo(np.int32).max + 1], dtype=np.uint32),
                "check_contents": True,
                "maxval": 1,
            },
            np.int64,
        ),
    ],
)
def test_smallest_admissible_index_dtype_by_checking_contents(params, expected_dtype):
    """Check the behaviour of `_smallest_admissible_index_dtype` using the dtypes
    of the arrays as well as their contents.
    """
    assert _smallest_admissible_index_dtype(**params) == expected_dtype


@pytest.mark.parametrize(
    "params, err_type, err_msg",
    [
        (
            {"maxval": np.iinfo(np.int64).max + 1},
            ValueError,
            "is to large to be represented as np.int64",
        ),
        (
            {"arrays": np.array([1, 2], dtype=np.float64)},
            ValueError,
            "Array dtype float64 is not supported",
        ),
        ({"arrays": [1, 2]}, TypeError, "Arrays should be of type np.ndarray"),
    ],
)
def test_smallest_admissible_index_dtype_error(params, err_type, err_msg):
    """Check that we raise the proper error message."""
    with pytest.raises(err_type, match=err_msg):
        _smallest_admissible_index_dtype(**params)
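The parametrizations above reduce to one rule: prefer int32 unless maxval, an input dtype, or (when check_contents=True) the actual values force int64. A hedged sketch of that decision rule, not the library's exact code (it notably only checks magnitudes, not negative bounds):

import numpy as np


def smallest_index_dtype_sketch(arrays=(), maxval=None, check_contents=False):
    # Approximate the rule exercised by the tests above (magnitude only).
    int32max = np.iinfo(np.int32).max
    if maxval is not None and maxval > int32max:
        return np.int64
    for a in arrays:
        if check_contents:
            # look at the values themselves; empty arrays stay int32
            if a.size and abs(a).max() > int32max:
                return np.int64
        elif a.dtype == np.int64:
            # without looking at the contents, int64 stays int64
            return np.int64
    return np.int32


assert smallest_index_dtype_sketch(maxval=np.iinfo(np.int32).max + 1) == np.int64
assert smallest_index_dtype_sketch((np.array([1, 2], dtype=np.int64),)) == np.int64
assert (
    smallest_index_dtype_sketch((np.array([1], dtype=np.int64),), check_contents=True)
    == np.int32
)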
@@ -0,0 +1,80 @@
import numpy as np
import pytest
from scipy.sparse.csgraph import connected_components

from sklearn.metrics.pairwise import pairwise_distances
from sklearn.neighbors import kneighbors_graph
from sklearn.utils.graph import _fix_connected_components


def test_fix_connected_components():
    # Test that _fix_connected_components reduces the number of components to 1.
    X = np.array([0, 1, 2, 5, 6, 7])[:, None]
    graph = kneighbors_graph(X, n_neighbors=2, mode="distance")

    n_connected_components, labels = connected_components(graph)
    assert n_connected_components > 1

    graph = _fix_connected_components(X, graph, n_connected_components, labels)

    n_connected_components, labels = connected_components(graph)
    assert n_connected_components == 1


def test_fix_connected_components_precomputed():
    # Test that _fix_connected_components accepts a precomputed distance matrix.
    X = np.array([0, 1, 2, 5, 6, 7])[:, None]
    graph = kneighbors_graph(X, n_neighbors=2, mode="distance")

    n_connected_components, labels = connected_components(graph)
    assert n_connected_components > 1

    distances = pairwise_distances(X)
    graph = _fix_connected_components(
        distances, graph, n_connected_components, labels, metric="precomputed"
    )

    n_connected_components, labels = connected_components(graph)
    assert n_connected_components == 1

    # but it does not work with a precomputed neighbors graph
    with pytest.raises(RuntimeError, match="does not work with a sparse"):
        _fix_connected_components(
            graph, graph, n_connected_components, labels, metric="precomputed"
        )


def test_fix_connected_components_wrong_mode():
    # Test that an error is raised if the mode string is incorrect.
    X = np.array([0, 1, 2, 5, 6, 7])[:, None]
    graph = kneighbors_graph(X, n_neighbors=2, mode="distance")
    n_connected_components, labels = connected_components(graph)

    with pytest.raises(ValueError, match="Unknown mode"):
        graph = _fix_connected_components(
            X, graph, n_connected_components, labels, mode="foo"
        )


def test_fix_connected_components_connectivity_mode():
    # Test that the connectivity mode fills new connections with ones.
    X = np.array([0, 1, 6, 7])[:, None]
    graph = kneighbors_graph(X, n_neighbors=1, mode="connectivity")
    n_connected_components, labels = connected_components(graph)
    graph = _fix_connected_components(
        X, graph, n_connected_components, labels, mode="connectivity"
    )
    assert np.all(graph.data == 1)


def test_fix_connected_components_distance_mode():
    # Test that the distance mode does not fill new connections with ones.
    X = np.array([0, 1, 6, 7])[:, None]
    graph = kneighbors_graph(X, n_neighbors=1, mode="distance")
    assert np.all(graph.data == 1)

    n_connected_components, labels = connected_components(graph)
    graph = _fix_connected_components(
        X, graph, n_connected_components, labels, mode="distance"
    )
    assert not np.all(graph.data == 1)
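To see why the toy input above yields more than one component: the points form two well-separated groups, so with n_neighbors=2 every point's nearest neighbours stay inside its own group and the kNN graph splits in two. A standalone illustration of that starting state:

import numpy as np
from scipy.sparse.csgraph import connected_components
from sklearn.neighbors import kneighbors_graph

X = np.array([0, 1, 2, 5, 6, 7])[:, None]
graph = kneighbors_graph(X, n_neighbors=2, mode="distance")

n_components, _ = connected_components(graph)
print(n_components)  # 2: {0, 1, 2} and {5, 6, 7} are never linked to each other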
@@ -0,0 +1,594 @@
import warnings
from copy import copy
from unittest import SkipTest

import numpy as np
import pytest

import sklearn
from sklearn.externals._packaging.version import parse as parse_version
from sklearn.utils import _safe_indexing, resample, shuffle
from sklearn.utils._array_api import yield_namespace_device_dtype_combinations
from sklearn.utils._indexing import (
    _determine_key_type,
    _get_column_indices,
    _safe_assign,
)
from sklearn.utils._mocking import MockDataFrame
from sklearn.utils._testing import (
    _array_api_for_tests,
    _convert_container,
    assert_allclose_dense_sparse,
    assert_array_equal,
    skip_if_array_api_compat_not_configured,
)
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS

# toy array
X_toy = np.arange(9).reshape((3, 3))


def test_polars_indexing():
    """Check that _safe_indexing works for polars as expected."""
    pl = pytest.importorskip("polars", minversion="0.18.2")
    df = pl.DataFrame(
        {"a": [1, 2, 3, 4], "b": [4, 5, 6, 8], "c": [1, 4, 1, 10]}, orient="row"
    )

    from polars.testing import assert_frame_equal

    str_keys = [["b"], ["a", "b"], ["b", "a", "c"], ["c"], ["a"]]

    for key in str_keys:
        out = _safe_indexing(df, key, axis=1)
        assert_frame_equal(df[key], out)

    bool_keys = [([True, False, True], ["a", "c"]), ([False, False, True], ["c"])]

    for bool_key, str_key in bool_keys:
        out = _safe_indexing(df, bool_key, axis=1)
        assert_frame_equal(df[:, str_key], out)

    int_keys = [([0, 1], ["a", "b"]), ([2], ["c"])]

    for int_key, str_key in int_keys:
        out = _safe_indexing(df, int_key, axis=1)
        assert_frame_equal(df[:, str_key], out)

    axis_0_keys = [[0, 1], [1, 3], [3, 2]]
    for key in axis_0_keys:
        out = _safe_indexing(df, key, axis=0)
        assert_frame_equal(df[key], out)


@pytest.mark.parametrize(
    "key, dtype",
    [
        (0, "int"),
        ("0", "str"),
        (True, "bool"),
        (np.bool_(True), "bool"),
        ([0, 1, 2], "int"),
        (["0", "1", "2"], "str"),
        ((0, 1, 2), "int"),
        (("0", "1", "2"), "str"),
        (slice(None, None), None),
        (slice(0, 2), "int"),
        (np.array([0, 1, 2], dtype=np.int32), "int"),
        (np.array([0, 1, 2], dtype=np.int64), "int"),
        (np.array([0, 1, 2], dtype=np.uint8), "int"),
        ([True, False], "bool"),
        ((True, False), "bool"),
        (np.array([True, False]), "bool"),
        ("col_0", "str"),
        (["col_0", "col_1", "col_2"], "str"),
        (("col_0", "col_1", "col_2"), "str"),
        (slice("begin", "end"), "str"),
        (np.array(["col_0", "col_1", "col_2"]), "str"),
        (np.array(["col_0", "col_1", "col_2"], dtype=object), "str"),
    ],
)
def test_determine_key_type(key, dtype):
    assert _determine_key_type(key) == dtype


def test_determine_key_type_error():
    with pytest.raises(ValueError, match="No valid specification of the"):
        _determine_key_type(1.0)


def test_determine_key_type_slice_error():
    with pytest.raises(TypeError, match="Only array-like or scalar are"):
        _determine_key_type(slice(0, 2, 1), accept_slice=False)


@skip_if_array_api_compat_not_configured
@pytest.mark.parametrize(
    "array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations()
)
def test_determine_key_type_array_api(array_namespace, device, dtype_name):
    xp = _array_api_for_tests(array_namespace, device)

    with sklearn.config_context(array_api_dispatch=True):
        int_array_key = xp.asarray([1, 2, 3])
        assert _determine_key_type(int_array_key) == "int"

        bool_array_key = xp.asarray([True, False, True])
        assert _determine_key_type(bool_array_key) == "bool"

        try:
            complex_array_key = xp.asarray([1 + 1j, 2 + 2j, 3 + 3j])
        except TypeError:
            # Complex numbers are not supported by all Array API libraries.
            complex_array_key = None

        if complex_array_key is not None:
            with pytest.raises(ValueError, match="No valid specification of the"):
                _determine_key_type(complex_array_key)
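Read together, these tests pin down a small contract: the inferred key type ("int", "str", "bool", or None for an unrestricted slice) is what later decides how a key is applied to a container. A short illustration, taken directly from the cases above:

from sklearn.utils._indexing import _determine_key_type

assert _determine_key_type(0) == "int"
assert _determine_key_type("col_0") == "str"
assert _determine_key_type([True, False]) == "bool"
assert _determine_key_type(slice(None, None)) is None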
@pytest.mark.parametrize(
    "array_type", ["list", "array", "sparse", "dataframe", "polars"]
)
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series", "slice"])
def test_safe_indexing_2d_container_axis_0(array_type, indices_type):
    indices = [1, 2]
    if indices_type == "slice" and isinstance(indices[1], int):
        indices[1] += 1
    array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
    indices = _convert_container(indices, indices_type)
    subset = _safe_indexing(array, indices, axis=0)
    assert_allclose_dense_sparse(
        subset, _convert_container([[4, 5, 6], [7, 8, 9]], array_type)
    )


@pytest.mark.parametrize("array_type", ["list", "array", "series", "polars_series"])
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series", "slice"])
def test_safe_indexing_1d_container(array_type, indices_type):
    indices = [1, 2]
    if indices_type == "slice" and isinstance(indices[1], int):
        indices[1] += 1
    array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
    indices = _convert_container(indices, indices_type)
    subset = _safe_indexing(array, indices, axis=0)
    assert_allclose_dense_sparse(subset, _convert_container([2, 3], array_type))


@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe", "polars"])
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series", "slice"])
@pytest.mark.parametrize("indices", [[1, 2], ["col_1", "col_2"]])
def test_safe_indexing_2d_container_axis_1(array_type, indices_type, indices):
    # validation of the indices
    # we make a copy because indices is mutable and shared between tests
    indices_converted = copy(indices)
    if indices_type == "slice" and isinstance(indices[1], int):
        indices_converted[1] += 1

    columns_name = ["col_0", "col_1", "col_2"]
    array = _convert_container(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
    )
    indices_converted = _convert_container(indices_converted, indices_type)

    if isinstance(indices[0], str) and array_type not in ("dataframe", "polars"):
        err_msg = (
            "Specifying the columns using strings is only supported for dataframes"
        )
        with pytest.raises(ValueError, match=err_msg):
            _safe_indexing(array, indices_converted, axis=1)
    else:
        subset = _safe_indexing(array, indices_converted, axis=1)
        assert_allclose_dense_sparse(
            subset, _convert_container([[2, 3], [5, 6], [8, 9]], array_type)
        )


@pytest.mark.parametrize("array_read_only", [True, False])
@pytest.mark.parametrize("indices_read_only", [True, False])
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe", "polars"])
@pytest.mark.parametrize("indices_type", ["array", "series"])
@pytest.mark.parametrize(
    "axis, expected_array", [(0, [[4, 5, 6], [7, 8, 9]]), (1, [[2, 3], [5, 6], [8, 9]])]
)
def test_safe_indexing_2d_read_only_axis_1(
    array_read_only, indices_read_only, array_type, indices_type, axis, expected_array
):
    array = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    if array_read_only:
        array.setflags(write=False)
    array = _convert_container(array, array_type)
    indices = np.array([1, 2])
    if indices_read_only:
        indices.setflags(write=False)
    indices = _convert_container(indices, indices_type)
    subset = _safe_indexing(array, indices, axis=axis)
    assert_allclose_dense_sparse(subset, _convert_container(expected_array, array_type))


@pytest.mark.parametrize("array_type", ["list", "array", "series", "polars_series"])
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"])
def test_safe_indexing_1d_container_mask(array_type, indices_type):
    indices = [False] + [True] * 2 + [False] * 6
    array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
    indices = _convert_container(indices, indices_type)
    subset = _safe_indexing(array, indices, axis=0)
    assert_allclose_dense_sparse(subset, _convert_container([2, 3], array_type))


@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe", "polars"])
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"])
@pytest.mark.parametrize(
    "axis, expected_subset",
    [(0, [[4, 5, 6], [7, 8, 9]]), (1, [[2, 3], [5, 6], [8, 9]])],
)
def test_safe_indexing_2d_mask(array_type, indices_type, axis, expected_subset):
    columns_name = ["col_0", "col_1", "col_2"]
    array = _convert_container(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
    )
    indices = [False, True, True]
    indices = _convert_container(indices, indices_type)

    subset = _safe_indexing(array, indices, axis=axis)
    assert_allclose_dense_sparse(
        subset, _convert_container(expected_subset, array_type)
    )


@pytest.mark.parametrize(
    "array_type, expected_output_type",
    [
        ("list", "list"),
        ("array", "array"),
        ("sparse", "sparse"),
        ("dataframe", "series"),
        ("polars", "polars_series"),
    ],
)
def test_safe_indexing_2d_scalar_axis_0(array_type, expected_output_type):
    array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
    indices = 2
    subset = _safe_indexing(array, indices, axis=0)
    expected_array = _convert_container([7, 8, 9], expected_output_type)
    assert_allclose_dense_sparse(subset, expected_array)


@pytest.mark.parametrize("array_type", ["list", "array", "series", "polars_series"])
def test_safe_indexing_1d_scalar(array_type):
    array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
    indices = 2
    subset = _safe_indexing(array, indices, axis=0)
    assert subset == 3


@pytest.mark.parametrize(
    "array_type, expected_output_type",
    [
        ("array", "array"),
        ("sparse", "sparse"),
        ("dataframe", "series"),
        ("polars", "polars_series"),
    ],
)
@pytest.mark.parametrize("indices", [2, "col_2"])
def test_safe_indexing_2d_scalar_axis_1(array_type, expected_output_type, indices):
    columns_name = ["col_0", "col_1", "col_2"]
    array = _convert_container(
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
    )

    if isinstance(indices, str) and array_type not in ("dataframe", "polars"):
        err_msg = (
            "Specifying the columns using strings is only supported for dataframes"
        )
        with pytest.raises(ValueError, match=err_msg):
            _safe_indexing(array, indices, axis=1)
    else:
        subset = _safe_indexing(array, indices, axis=1)
        expected_output = [3, 6, 9]
        if expected_output_type == "sparse":
            # sparse matrices keep the 2D shape
            expected_output = [[3], [6], [9]]
        expected_array = _convert_container(expected_output, expected_output_type)
        assert_allclose_dense_sparse(subset, expected_array)
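The common thread of the tests above is that _safe_indexing selects along one axis while preserving the container type of its input; a hedged sketch of the expected behaviour with NumPy and pandas:

import numpy as np
import pandas as pd
from sklearn.utils import _safe_indexing

X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

# axis=0 selects rows, axis=1 selects columns; arrays stay arrays
print(_safe_indexing(X, [1, 2], axis=0))  # [[4 5 6] [7 8 9]]
print(_safe_indexing(X, [1, 2], axis=1))  # [[2 3] [5 6] [8 9]]

df = pd.DataFrame(X, columns=["col_0", "col_1", "col_2"])
# string keys are only valid for dataframes; a scalar key yields a Series
print(_safe_indexing(df, "col_2", axis=1))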
@pytest.mark.parametrize("array_type", ["list", "array", "sparse"])
|
||||
def test_safe_indexing_None_axis_0(array_type):
|
||||
X = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
|
||||
X_subset = _safe_indexing(X, None, axis=0)
|
||||
assert_allclose_dense_sparse(X_subset, X)
|
||||
|
||||
|
||||
def test_safe_indexing_pandas_no_matching_cols_error():
|
||||
pd = pytest.importorskip("pandas")
|
||||
err_msg = "No valid specification of the columns."
|
||||
X = pd.DataFrame(X_toy)
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_safe_indexing(X, [1.0], axis=1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("axis", [None, 3])
|
||||
def test_safe_indexing_error_axis(axis):
|
||||
with pytest.raises(ValueError, match="'axis' should be either 0"):
|
||||
_safe_indexing(X_toy, [0, 1], axis=axis)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("X_constructor", ["array", "series", "polars_series"])
|
||||
def test_safe_indexing_1d_array_error(X_constructor):
|
||||
# check that we are raising an error if the array-like passed is 1D and
|
||||
# we try to index on the 2nd dimension
|
||||
X = list(range(5))
|
||||
if X_constructor == "array":
|
||||
X_constructor = np.asarray(X)
|
||||
elif X_constructor == "series":
|
||||
pd = pytest.importorskip("pandas")
|
||||
X_constructor = pd.Series(X)
|
||||
elif X_constructor == "polars_series":
|
||||
pl = pytest.importorskip("polars")
|
||||
X_constructor = pl.Series(values=X)
|
||||
|
||||
err_msg = "'X' should be a 2D NumPy array, 2D sparse matrix or dataframe"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_safe_indexing(X_constructor, [0, 1], axis=1)
|
||||
|
||||
|
||||
def test_safe_indexing_container_axis_0_unsupported_type():
|
||||
indices = ["col_1", "col_2"]
|
||||
array = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
|
||||
err_msg = "String indexing is not supported with 'axis=0'"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_safe_indexing(array, indices, axis=0)
|
||||
|
||||
|
||||
def test_safe_indexing_pandas_no_settingwithcopy_warning():
|
||||
# Using safe_indexing with an array-like indexer gives a copy of the
|
||||
# DataFrame -> ensure it doesn't raise a warning if modified
|
||||
pd = pytest.importorskip("pandas")
|
||||
|
||||
pd_version = parse_version(pd.__version__)
|
||||
pd_base_version = parse_version(pd_version.base_version)
|
||||
|
||||
if pd_base_version >= parse_version("3"):
|
||||
raise SkipTest("SettingWithCopyWarning has been removed in pandas 3.0.0.dev")
|
||||
|
||||
X = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
|
||||
subset = _safe_indexing(X, [0, 1], axis=0)
|
||||
if hasattr(pd.errors, "SettingWithCopyWarning"):
|
||||
SettingWithCopyWarning = pd.errors.SettingWithCopyWarning
|
||||
else:
|
||||
# backward compatibility for pandas < 1.5
|
||||
SettingWithCopyWarning = pd.core.common.SettingWithCopyWarning
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("error", SettingWithCopyWarning)
|
||||
subset.iloc[0, 0] = 10
|
||||
# The original dataframe is unaffected by the assignment on the subset:
|
||||
assert X.iloc[0, 0] == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("indices", [0, [0, 1], slice(0, 2), np.array([0, 1])])
|
||||
def test_safe_indexing_list_axis_1_unsupported(indices):
|
||||
"""Check that we raise a ValueError when axis=1 with input as list."""
|
||||
X = [[1, 2], [4, 5], [7, 8]]
|
||||
err_msg = "axis=1 is not supported for lists"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_safe_indexing(X, indices, axis=1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"])
|
||||
def test_safe_assign(array_type):
|
||||
"""Check that `_safe_assign` works as expected."""
|
||||
rng = np.random.RandomState(0)
|
||||
X_array = rng.randn(10, 5)
|
||||
|
||||
row_indexer = [1, 2]
|
||||
values = rng.randn(len(row_indexer), X_array.shape[1])
|
||||
X = _convert_container(X_array, array_type)
|
||||
_safe_assign(X, values, row_indexer=row_indexer)
|
||||
|
||||
assigned_portion = _safe_indexing(X, row_indexer, axis=0)
|
||||
assert_allclose_dense_sparse(
|
||||
assigned_portion, _convert_container(values, array_type)
|
||||
)
|
||||
|
||||
column_indexer = [1, 2]
|
||||
values = rng.randn(X_array.shape[0], len(column_indexer))
|
||||
X = _convert_container(X_array, array_type)
|
||||
_safe_assign(X, values, column_indexer=column_indexer)
|
||||
|
||||
assigned_portion = _safe_indexing(X, column_indexer, axis=1)
|
||||
assert_allclose_dense_sparse(
|
||||
assigned_portion, _convert_container(values, array_type)
|
||||
)
|
||||
|
||||
row_indexer, column_indexer = None, None
|
||||
values = rng.randn(*X.shape)
|
||||
X = _convert_container(X_array, array_type)
|
||||
_safe_assign(X, values, column_indexer=column_indexer)
|
||||
|
||||
assert_allclose_dense_sparse(X, _convert_container(values, array_type))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"key, err_msg",
|
||||
[
|
||||
(10, r"all features must be in \[0, 2\]"),
|
||||
("whatever", "A given column is not a column of the dataframe"),
|
||||
(object(), "No valid specification of the columns"),
|
||||
],
|
||||
)
|
||||
def test_get_column_indices_error(key, err_msg):
|
||||
pd = pytest.importorskip("pandas")
|
||||
X_df = pd.DataFrame(X_toy, columns=["col_0", "col_1", "col_2"])
|
||||
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_get_column_indices(X_df, key)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"key", [["col1"], ["col2"], ["col1", "col2"], ["col1", "col3"], ["col2", "col3"]]
|
||||
)
|
||||
def test_get_column_indices_pandas_nonunique_columns_error(key):
|
||||
pd = pytest.importorskip("pandas")
|
||||
toy = np.zeros((1, 5), dtype=int)
|
||||
columns = ["col1", "col1", "col2", "col3", "col2"]
|
||||
X = pd.DataFrame(toy, columns=columns)
|
||||
|
||||
err_msg = "Selected columns, {}, are not unique in dataframe".format(key)
|
||||
with pytest.raises(ValueError) as exc_info:
|
||||
_get_column_indices(X, key)
|
||||
assert str(exc_info.value) == err_msg
|
||||
|
||||
|
||||
def test_get_column_indices_interchange():
|
||||
"""Check _get_column_indices for edge cases with the interchange"""
|
||||
pd = pytest.importorskip("pandas", minversion="1.5")
|
||||
|
||||
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
|
||||
|
||||
# Hide the fact that this is a pandas dataframe to trigger the dataframe protocol
|
||||
# code path.
|
||||
class MockDataFrame:
|
||||
def __init__(self, df):
|
||||
self._df = df
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._df, name)
|
||||
|
||||
df_mocked = MockDataFrame(df)
|
||||
|
||||
key_results = [
|
||||
(slice(1, None), [1, 2]),
|
||||
(slice(None, 2), [0, 1]),
|
||||
(slice(1, 2), [1]),
|
||||
(["b", "c"], [1, 2]),
|
||||
(slice("a", "b"), [0, 1]),
|
||||
(slice("a", None), [0, 1, 2]),
|
||||
(slice(None, "a"), [0]),
|
||||
(["c", "a"], [2, 0]),
|
||||
([], []),
|
||||
]
|
||||
for key, result in key_results:
|
||||
assert _get_column_indices(df_mocked, key) == result
|
||||
|
||||
msg = "A given column is not a column of the dataframe"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
_get_column_indices(df_mocked, ["not_a_column"])
|
||||
|
||||
msg = "key.step must be 1 or None"
|
||||
with pytest.raises(NotImplementedError, match=msg):
|
||||
_get_column_indices(df_mocked, slice("a", None, 2))
|
||||
|
||||
|
||||
def test_resample():
|
||||
# Border case not worth mentioning in doctests
|
||||
assert resample() is None
|
||||
|
||||
# Check that invalid arguments yield ValueError
|
||||
with pytest.raises(ValueError):
|
||||
resample([0], [0, 1])
|
||||
with pytest.raises(ValueError):
|
||||
resample([0, 1], [0, 1], replace=False, n_samples=3)
|
||||
|
||||
# Issue:6581, n_samples can be more when replace is True (default).
|
||||
assert len(resample([1, 2], n_samples=5)) == 5
|
||||
|
||||
|
||||
def test_resample_stratified():
|
||||
# Make sure resample can stratify
|
||||
rng = np.random.RandomState(0)
|
||||
n_samples = 100
|
||||
p = 0.9
|
||||
X = rng.normal(size=(n_samples, 1))
|
||||
y = rng.binomial(1, p, size=n_samples)
|
||||
|
||||
_, y_not_stratified = resample(X, y, n_samples=10, random_state=0, stratify=None)
|
||||
assert np.all(y_not_stratified == 1)
|
||||
|
||||
_, y_stratified = resample(X, y, n_samples=10, random_state=0, stratify=y)
|
||||
assert not np.all(y_stratified == 1)
|
||||
assert np.sum(y_stratified) == 9 # all 1s, one 0
|
||||
|
||||
|
||||
def test_resample_stratified_replace():
|
||||
# Make sure stratified resampling supports the replace parameter
|
||||
rng = np.random.RandomState(0)
|
||||
n_samples = 100
|
||||
X = rng.normal(size=(n_samples, 1))
|
||||
y = rng.randint(0, 2, size=n_samples)
|
||||
|
||||
X_replace, _ = resample(
|
||||
X, y, replace=True, n_samples=50, random_state=rng, stratify=y
|
||||
)
|
||||
X_no_replace, _ = resample(
|
||||
X, y, replace=False, n_samples=50, random_state=rng, stratify=y
|
||||
)
|
||||
assert np.unique(X_replace).shape[0] < 50
|
||||
assert np.unique(X_no_replace).shape[0] == 50
|
||||
|
||||
# make sure n_samples can be greater than X.shape[0] if we sample with
|
||||
# replacement
|
||||
X_replace, _ = resample(
|
||||
X, y, replace=True, n_samples=1000, random_state=rng, stratify=y
|
||||
)
|
||||
assert X_replace.shape[0] == 1000
|
||||
assert np.unique(X_replace).shape[0] == 100
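

# A minimal illustration (not part of the original test file) of the property
# the stratified tests above rely on: with `stratify=y`, resample draws the
# same class proportions as in `y`, so a 90/10 class balance survives a small
# subsample instead of (likely) collapsing to all ones.
def _stratified_resample_sketch():  # hypothetical helper, for illustration only
    rng = np.random.RandomState(0)
    y = rng.binomial(1, 0.9, size=100)  # ~90% ones
    y_sub = resample(y, n_samples=10, random_state=0, stratify=y)
    # the subsample keeps roughly the 9:1 ratio, so both classes appear
    assert 0 < np.sum(y_sub) < 10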


def test_resample_stratify_2dy():
    # Make sure y can be 2d when stratifying
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.normal(size=(n_samples, 1))
    y = rng.randint(0, 2, size=(n_samples, 2))
    X, y = resample(X, y, n_samples=50, random_state=rng, stratify=y)
    assert y.ndim == 2


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_resample_stratify_sparse_error(csr_container):
    # the stratify argument must be an ndarray, not a sparse container
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.normal(size=(n_samples, 2))
    y = rng.randint(0, 2, size=n_samples)
    stratify = csr_container(y.reshape(-1, 1))
    with pytest.raises(TypeError, match="Sparse data was passed"):
        X, y = resample(X, y, n_samples=50, random_state=rng, stratify=stratify)


def test_shuffle_on_ndim_equals_three():
    def to_tuple(A):  # to make the inner arrays hashable
        return tuple(tuple(tuple(C) for C in B) for B in A)

    A = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])  # A.shape = (2, 2, 2)
    S = set(to_tuple(A))
    shuffle(A)  # shouldn't raise a ValueError for dim = 3
    assert set(to_tuple(A)) == S


@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_shuffle_dont_convert_to_array(csc_container):
    # Check that shuffle does not try to convert to numpy arrays with float
    # dtypes and lets any indexable datastructure pass through.
    a = ["a", "b", "c"]
    b = np.array(["a", "b", "c"], dtype=object)
    c = [1, 2, 3]
    d = MockDataFrame(np.array([["a", 0], ["b", 1], ["c", 2]], dtype=object))
    e = csc_container(np.arange(6).reshape(3, 2))
    a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0)

    assert a_s == ["c", "b", "a"]
    assert type(a_s) == list  # noqa: E721

    assert_array_equal(b_s, ["c", "b", "a"])
    assert b_s.dtype == object

    assert c_s == [3, 2, 1]
    assert type(c_s) == list  # noqa: E721

    assert_array_equal(d_s, np.array([["c", 2], ["b", 1], ["a", 0]], dtype=object))
    assert type(d_s) == MockDataFrame  # noqa: E721

    assert_array_equal(e_s.toarray(), np.array([[4, 5], [2, 3], [0, 1]]))
@@ -0,0 +1,19 @@
import pytest

from sklearn.utils._mask import safe_mask
from sklearn.utils.fixes import CSR_CONTAINERS
from sklearn.utils.validation import check_random_state


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_safe_mask(csr_container):
    random_state = check_random_state(0)
    X = random_state.rand(5, 4)
    X_csr = csr_container(X)
    mask = [False, False, True, True, True]

    mask = safe_mask(X, mask)
    assert X[mask].shape[0] == 3

    mask = safe_mask(X_csr, mask)
    assert X_csr[mask].shape[0] == 3
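

# A short sketch (illustrative, not from the original file): safe_mask exists
# because scipy sparse matrices cannot be row-indexed with a boolean mask, so
# for sparse inputs it converts the mask into integer row indices.
def _safe_mask_sketch():  # hypothetical helper, for illustration only
    import numpy as np
    from scipy.sparse import csr_matrix

    X = np.arange(20).reshape(5, 4)
    mask = np.array([False, False, True, True, True])
    # dense input: the boolean mask is returned as-is
    assert safe_mask(X, mask).dtype == bool
    # sparse input: the mask is converted to the integer indices [2, 3, 4]
    assert list(safe_mask(csr_matrix(X), mask)) == [2, 3, 4]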
@@ -0,0 +1,63 @@
import pickle

import pytest

from sklearn.utils.metaestimators import available_if


class AvailableParameterEstimator:
    """This estimator's `available` parameter toggles the presence of a method"""

    def __init__(self, available=True, return_value=1):
        self.available = available
        self.return_value = return_value

    @available_if(lambda est: est.available)
    def available_func(self):
        """This is a mock available_if function"""
        return self.return_value


def test_available_if_docstring():
    assert "This is a mock available_if function" in str(
        AvailableParameterEstimator.__dict__["available_func"].__doc__
    )
    assert "This is a mock available_if function" in str(
        AvailableParameterEstimator.available_func.__doc__
    )
    assert "This is a mock available_if function" in str(
        AvailableParameterEstimator().available_func.__doc__
    )


def test_available_if():
    assert hasattr(AvailableParameterEstimator(), "available_func")
    assert not hasattr(AvailableParameterEstimator(available=False), "available_func")


def test_available_if_unbound_method():
    # This is a non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/20614
    # to make sure that decorated functions can be used as an unbound method,
    # for instance when monkeypatching.
    est = AvailableParameterEstimator()
    AvailableParameterEstimator.available_func(est)

    est = AvailableParameterEstimator(available=False)
    with pytest.raises(
        AttributeError,
        match="This 'AvailableParameterEstimator' has no attribute 'available_func'",
    ):
        AvailableParameterEstimator.available_func(est)


def test_available_if_methods_can_be_pickled():
    """Check that available_if methods can be pickled.

    Non-regression test for #21344.
    """
    return_value = 10
    est = AvailableParameterEstimator(available=True, return_value=return_value)
    pickled_bytes = pickle.dumps(est.available_func)
    unpickled_func = pickle.loads(pickled_bytes)
    assert unpickled_func() == return_value
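

# A brief sketch (not from the original file) of the pattern exercised above:
# available_if makes `hasattr` reflect a runtime condition, which is how
# meta-estimators expose e.g. predict_proba only when the wrapped estimator
# supports it.
class _MaybeProba:  # hypothetical class, for illustration only
    def __init__(self, inner):
        self.inner = inner

    @available_if(lambda self: hasattr(self.inner, "predict_proba"))
    def predict_proba(self, X):
        # only reachable (and only visible to hasattr) when inner supports it
        return self.inner.predict_proba(X)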
@@ -0,0 +1,27 @@
import numpy as np
import pytest

from sklearn.utils._missing import is_scalar_nan


@pytest.mark.parametrize(
    "value, result",
    [
        (float("nan"), True),
        (np.nan, True),
        (float(np.nan), True),
        (np.float32(np.nan), True),
        (np.float64(np.nan), True),
        (0, False),
        (0.0, False),
        (None, False),
        ("", False),
        ("nan", False),
        ([np.nan], False),
        (9867966753463435747313673, False),  # Python int too large for any C integer type
    ],
)
def test_is_scalar_nan(value, result):
    assert is_scalar_nan(value) is result
    # make sure that we are returning a Python bool
    assert isinstance(is_scalar_nan(value), bool)
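

# Illustrative note (not in the original file): the `is result` comparison
# above only works because is_scalar_nan returns the Python singletons
# True/False rather than numpy.bool_, and `nan != nan` is the property that
# distinguishes NaN from every other float.
def _is_scalar_nan_sketch():  # hypothetical helper, for illustration only
    x = float("nan")
    assert x != x  # the defining property of NaN
    assert is_scalar_nan(x) is True
    assert is_scalar_nan("nan") is False  # strings are not scalar NaN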
@@ -0,0 +1,205 @@
import numpy as np
import pytest
from numpy.testing import assert_array_equal
from scipy import sparse

from sklearn.datasets import load_iris
from sklearn.utils import _safe_indexing, check_array
from sklearn.utils._mocking import (
    CheckingClassifier,
    _MockEstimatorOnOffPrediction,
)
from sklearn.utils._testing import _convert_container
from sklearn.utils.fixes import CSR_CONTAINERS


@pytest.fixture
def iris():
    return load_iris(return_X_y=True)


def _success(x):
    return True


def _fail(x):
    return False


@pytest.mark.parametrize(
    "kwargs",
    [
        {},
        {"check_X": _success},
        {"check_y": _success},
        {"check_X": _success, "check_y": _success},
    ],
)
def test_check_on_fit_success(iris, kwargs):
    X, y = iris
    CheckingClassifier(**kwargs).fit(X, y)


@pytest.mark.parametrize(
    "kwargs",
    [
        {"check_X": _fail},
        {"check_y": _fail},
        {"check_X": _success, "check_y": _fail},
        {"check_X": _fail, "check_y": _success},
        {"check_X": _fail, "check_y": _fail},
    ],
)
def test_check_on_fit_fail(iris, kwargs):
    X, y = iris
    clf = CheckingClassifier(**kwargs)
    with pytest.raises(AssertionError):
        clf.fit(X, y)


@pytest.mark.parametrize(
    "pred_func", ["predict", "predict_proba", "decision_function", "score"]
)
def test_check_X_on_predict_success(iris, pred_func):
    X, y = iris
    clf = CheckingClassifier(check_X=_success).fit(X, y)
    getattr(clf, pred_func)(X)


@pytest.mark.parametrize(
    "pred_func", ["predict", "predict_proba", "decision_function", "score"]
)
def test_check_X_on_predict_fail(iris, pred_func):
    X, y = iris
    clf = CheckingClassifier(check_X=_success).fit(X, y)
    clf.set_params(check_X=_fail)
    with pytest.raises(AssertionError):
        getattr(clf, pred_func)(X)


@pytest.mark.parametrize("input_type", ["list", "array", "sparse", "dataframe"])
def test_checking_classifier(iris, input_type):
    # Check that the CheckingClassifier outputs what we expect
    X, y = iris
    X = _convert_container(X, input_type)
    clf = CheckingClassifier()
    clf.fit(X, y)

    assert_array_equal(clf.classes_, np.unique(y))
    assert len(clf.classes_) == 3
    assert clf.n_features_in_ == 4

    y_pred = clf.predict(X)
    assert all(pred in clf.classes_ for pred in y_pred)

    assert clf.score(X) == pytest.approx(0)
    clf.set_params(foo_param=10)
    assert clf.fit(X, y).score(X) == pytest.approx(1)

    y_proba = clf.predict_proba(X)
    assert y_proba.shape == (150, 3)
    assert np.logical_and(y_proba >= 0, y_proba <= 1).all()

    y_decision = clf.decision_function(X)
    assert y_decision.shape == (150, 3)

    # check the shape in case of binary classification
    first_2_classes = np.logical_or(y == 0, y == 1)
    X = _safe_indexing(X, first_2_classes)
    y = _safe_indexing(y, first_2_classes)
    clf.fit(X, y)

    y_proba = clf.predict_proba(X)
    assert y_proba.shape == (100, 2)
    assert np.logical_and(y_proba >= 0, y_proba <= 1).all()

    y_decision = clf.decision_function(X)
    assert y_decision.shape == (100,)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_checking_classifier_with_params(iris, csr_container):
    X, y = iris
    X_sparse = csr_container(X)

    clf = CheckingClassifier(check_X=sparse.issparse)
    with pytest.raises(AssertionError):
        clf.fit(X, y)
    clf.fit(X_sparse, y)

    clf = CheckingClassifier(
        check_X=check_array, check_X_params={"accept_sparse": False}
    )
    clf.fit(X, y)
    with pytest.raises(TypeError, match="Sparse data was passed"):
        clf.fit(X_sparse, y)


def test_checking_classifier_fit_params(iris):
    # check the error raised when the number of samples is not the one expected
    X, y = iris
    clf = CheckingClassifier(expected_sample_weight=True)
    sample_weight = np.ones(len(X) // 2)

    msg = f"sample_weight.shape == ({len(X) // 2},), expected ({len(X)},)!"
    with pytest.raises(ValueError) as exc:
        clf.fit(X, y, sample_weight=sample_weight)
    assert exc.value.args[0] == msg


def test_checking_classifier_missing_fit_params(iris):
    X, y = iris
    clf = CheckingClassifier(expected_sample_weight=True)
    err_msg = "Expected sample_weight to be passed"
    with pytest.raises(AssertionError, match=err_msg):
        clf.fit(X, y)


@pytest.mark.parametrize(
    "methods_to_check",
    [["predict"], ["predict", "predict_proba"]],
)
@pytest.mark.parametrize(
    "predict_method", ["predict", "predict_proba", "decision_function", "score"]
)
def test_checking_classifier_methods_to_check(iris, methods_to_check, predict_method):
    # check that methods_to_check allows bypassing the checks
    X, y = iris

    clf = CheckingClassifier(
        check_X=sparse.issparse,
        methods_to_check=methods_to_check,
    )

    clf.fit(X, y)
    if predict_method in methods_to_check:
        with pytest.raises(AssertionError):
            getattr(clf, predict_method)(X)
    else:
        getattr(clf, predict_method)(X)


@pytest.mark.parametrize(
    "response_methods",
    [
        ["predict"],
        ["predict", "predict_proba"],
        ["predict", "decision_function"],
        ["predict", "predict_proba", "decision_function"],
    ],
)
def test_mock_estimator_on_off_prediction(iris, response_methods):
    X, y = iris
    estimator = _MockEstimatorOnOffPrediction(response_methods=response_methods)

    estimator.fit(X, y)
    assert hasattr(estimator, "classes_")
    assert_array_equal(estimator.classes_, np.unique(y))

    possible_responses = ["predict", "predict_proba", "decision_function"]
    for response in possible_responses:
        if response in response_methods:
            assert hasattr(estimator, response)
            assert getattr(estimator, response)(X) == response
        else:
            assert not hasattr(estimator, response)
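

# A compact sketch (not part of the original file) of how CheckingClassifier
# is typically used elsewhere in the test suite: it runs an arbitrary
# assertion on the data it receives, which makes it handy for verifying what
# a meta-estimator actually passes down to its sub-estimator.
def _checking_classifier_sketch():  # hypothetical helper, for illustration only
    X, y = load_iris(return_X_y=True)
    clf = CheckingClassifier(check_X=lambda X: X.shape[1] == 4)
    clf.fit(X, y)  # passes: iris has 4 features, so check_X returns True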
@@ -0,0 +1,613 @@
from itertools import product

import numpy as np
import pytest
from scipy.sparse import issparse

from sklearn import config_context, datasets
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVC
from sklearn.utils._array_api import yield_namespace_device_dtype_combinations
from sklearn.utils._testing import (
    _array_api_for_tests,
    _convert_container,
    assert_allclose,
    assert_array_almost_equal,
    assert_array_equal,
)
from sklearn.utils.estimator_checks import _NotAnArray
from sklearn.utils.fixes import (
    COO_CONTAINERS,
    CSC_CONTAINERS,
    CSR_CONTAINERS,
    DOK_CONTAINERS,
    LIL_CONTAINERS,
)
from sklearn.utils.metaestimators import _safe_split
from sklearn.utils.multiclass import (
    _ovr_decision_function,
    check_classification_targets,
    class_distribution,
    is_multilabel,
    type_of_target,
    unique_labels,
)

multilabel_explicit_zero = np.array([[0, 1], [1, 0]])
multilabel_explicit_zero[:, 0] = 0


def _generate_sparse(
    data,
    sparse_containers=tuple(
        COO_CONTAINERS
        + CSC_CONTAINERS
        + CSR_CONTAINERS
        + DOK_CONTAINERS
        + LIL_CONTAINERS
    ),
    dtypes=(bool, int, np.int8, np.uint8, float, np.float32),
):
    return [
        sparse_container(data, dtype=dtype)
        for sparse_container in sparse_containers
        for dtype in dtypes
    ]


EXAMPLES = {
    "multilabel-indicator": [
        # valid whether the data is formatted as sparse or dense; the sparse
        # case is identified by its CSR format when the testing takes place
        *_generate_sparse(
            np.random.RandomState(42).randint(2, size=(10, 10)),
            sparse_containers=CSR_CONTAINERS,
            dtypes=(int,),
        ),
        [[0, 1], [1, 0]],
        [[0, 1]],
        *_generate_sparse(
            multilabel_explicit_zero, sparse_containers=CSC_CONTAINERS, dtypes=(int,)
        ),
        *_generate_sparse([[0, 1], [1, 0]]),
        *_generate_sparse([[0, 0], [0, 0]]),
        *_generate_sparse([[0, 1]]),
        # Only valid when data is dense
        [[-1, 1], [1, -1]],
        np.array([[-1, 1], [1, -1]]),
        np.array([[-3, 3], [3, -3]]),
        _NotAnArray(np.array([[-3, 3], [3, -3]])),
    ],
    "multiclass": [
        [1, 0, 2, 2, 1, 4, 2, 4, 4, 4],
        np.array([1, 0, 2]),
        np.array([1, 0, 2], dtype=np.int8),
        np.array([1, 0, 2], dtype=np.uint8),
        np.array([1, 0, 2], dtype=float),
        np.array([1, 0, 2], dtype=np.float32),
        np.array([[1], [0], [2]]),
        _NotAnArray(np.array([1, 0, 2])),
        [0, 1, 2],
        ["a", "b", "c"],
        np.array(["a", "b", "c"]),
        np.array(["a", "b", "c"], dtype=object),
        np.array(["a", "b", "c"], dtype=object),
    ],
    "multiclass-multioutput": [
        [[1, 0, 2, 2], [1, 4, 2, 4]],
        [["a", "b"], ["c", "d"]],
        np.array([[1, 0, 2, 2], [1, 4, 2, 4]]),
        np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8),
        np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8),
        np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float),
        np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32),
        *_generate_sparse(
            [[1, 0, 2, 2], [1, 4, 2, 4]],
            sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS,
            dtypes=(int, np.int8, np.uint8, float, np.float32),
        ),
        np.array([["a", "b"], ["c", "d"]]),
        np.array([["a", "b"], ["c", "d"]]),
        np.array([["a", "b"], ["c", "d"]], dtype=object),
        np.array([[1, 0, 2]]),
        _NotAnArray(np.array([[1, 0, 2]])),
    ],
    "binary": [
        [0, 1],
        [1, 1],
        [],
        [0],
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]),
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=bool),
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8),
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8),
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=float),
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32),
        np.array([[0], [1]]),
        _NotAnArray(np.array([[0], [1]])),
        [1, -1],
        [3, 5],
        ["a"],
        ["a", "b"],
        ["abc", "def"],
        np.array(["abc", "def"]),
        ["a", "b"],
        np.array(["abc", "def"], dtype=object),
    ],
    "continuous": [
        [1e-5],
        [0, 0.5],
        np.array([[0], [0.5]]),
        np.array([[0], [0.5]], dtype=np.float32),
    ],
    "continuous-multioutput": [
        np.array([[0, 0.5], [0.5, 0]]),
        np.array([[0, 0.5], [0.5, 0]], dtype=np.float32),
        np.array([[0, 0.5]]),
        *_generate_sparse(
            [[0, 0.5], [0.5, 0]],
            sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS,
            dtypes=(float, np.float32),
        ),
        *_generate_sparse(
            [[0, 0.5]],
            sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS,
            dtypes=(float, np.float32),
        ),
    ],
    "unknown": [
        [[]],
        np.array([[]], dtype=object),
        [()],
        # sequence of sequences that weren't supported even before deprecation
        np.array([np.array([]), np.array([1, 2, 3])], dtype=object),
        [np.array([]), np.array([1, 2, 3])],
        [{1, 2, 3}, {1, 2}],
        [frozenset([1, 2, 3]), frozenset([1, 2])],
        # and also confusable as sequences of sequences
        [{0: "a", 1: "b"}, {0: "a"}],
        # ndim 0
        np.array(0),
        # empty second dimension
        np.array([[], []]),
        # 3d
        np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]),
    ],
}

ARRAY_API_EXAMPLES = {
    "multilabel-indicator": [
        np.random.RandomState(42).randint(2, size=(10, 10)),
        [[0, 1], [1, 0]],
        [[0, 1]],
        multilabel_explicit_zero,
        [[0, 0], [0, 0]],
        [[-1, 1], [1, -1]],
        np.array([[-1, 1], [1, -1]]),
        np.array([[-3, 3], [3, -3]]),
        _NotAnArray(np.array([[-3, 3], [3, -3]])),
    ],
    "multiclass": [
        [1, 0, 2, 2, 1, 4, 2, 4, 4, 4],
        np.array([1, 0, 2]),
        np.array([1, 0, 2], dtype=np.int8),
        np.array([1, 0, 2], dtype=np.uint8),
        np.array([1, 0, 2], dtype=float),
        np.array([1, 0, 2], dtype=np.float32),
        np.array([[1], [0], [2]]),
        _NotAnArray(np.array([1, 0, 2])),
        [0, 1, 2],
    ],
    "multiclass-multioutput": [
        [[1, 0, 2, 2], [1, 4, 2, 4]],
        np.array([[1, 0, 2, 2], [1, 4, 2, 4]]),
        np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8),
        np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8),
        np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float),
        np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32),
        np.array([[1, 0, 2]]),
        _NotAnArray(np.array([[1, 0, 2]])),
    ],
    "binary": [
        [0, 1],
        [1, 1],
        [],
        [0],
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]),
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=bool),
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8),
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8),
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=float),
        np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32),
        np.array([[0], [1]]),
        _NotAnArray(np.array([[0], [1]])),
        [1, -1],
        [3, 5],
    ],
    "continuous": [
        [1e-5],
        [0, 0.5],
        np.array([[0], [0.5]]),
        np.array([[0], [0.5]], dtype=np.float32),
    ],
    "continuous-multioutput": [
        np.array([[0, 0.5], [0.5, 0]]),
        np.array([[0, 0.5], [0.5, 0]], dtype=np.float32),
        np.array([[0, 0.5]]),
    ],
    "unknown": [
        [[]],
        [()],
        np.array(0),
        np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]),
    ],
}


NON_ARRAY_LIKE_EXAMPLES = [
    {1, 2, 3},
    {0: "a", 1: "b"},
    {0: [5], 1: [5]},
    "abc",
    frozenset([1, 2, 3]),
    None,
]

MULTILABEL_SEQUENCES = [
    [[1], [2], [0, 1]],
    [(), (2), (0, 1)],
    np.array([[], [1, 2]], dtype="object"),
    _NotAnArray(np.array([[], [1, 2]], dtype="object")),
]


def test_unique_labels():
    # Empty iterable
    with pytest.raises(ValueError):
        unique_labels()

    # Multiclass problem
    assert_array_equal(unique_labels(range(10)), np.arange(10))
    assert_array_equal(unique_labels(np.arange(10)), np.arange(10))
    assert_array_equal(unique_labels([4, 0, 2]), np.array([0, 2, 4]))

    # Multilabel indicator
    assert_array_equal(
        unique_labels(np.array([[0, 0, 1], [1, 0, 1], [0, 0, 0]])), np.arange(3)
    )

    assert_array_equal(unique_labels(np.array([[0, 0, 1], [0, 0, 0]])), np.arange(3))

    # Several arrays passed
    assert_array_equal(unique_labels([4, 0, 2], range(5)), np.arange(5))
    assert_array_equal(unique_labels((0, 1, 2), (0,), (2, 1)), np.arange(3))

    # Borderline case with binary indicator matrix
    with pytest.raises(ValueError):
        unique_labels([4, 0, 2], np.ones((5, 5)))
    with pytest.raises(ValueError):
        unique_labels(np.ones((5, 4)), np.ones((5, 5)))

    assert_array_equal(unique_labels(np.ones((4, 5)), np.ones((5, 5))), np.arange(5))


def test_unique_labels_non_specific():
    # Test unique_labels with a variety of collected examples

    # Smoke test for all supported formats
    for format in ["binary", "multiclass", "multilabel-indicator"]:
        for y in EXAMPLES[format]:
            unique_labels(y)

    # We don't support those formats at the moment
    for example in NON_ARRAY_LIKE_EXAMPLES:
        with pytest.raises(ValueError):
            unique_labels(example)

    for y_type in [
        "unknown",
        "continuous",
        "continuous-multioutput",
        "multiclass-multioutput",
    ]:
        for example in EXAMPLES[y_type]:
            with pytest.raises(ValueError):
                unique_labels(example)


def test_unique_labels_mixed_types():
    # Mix with binary or multiclass and multilabel
    mix_clf_format = product(
        EXAMPLES["multilabel-indicator"], EXAMPLES["multiclass"] + EXAMPLES["binary"]
    )

    for y_multilabel, y_multiclass in mix_clf_format:
        with pytest.raises(ValueError):
            unique_labels(y_multiclass, y_multilabel)
        with pytest.raises(ValueError):
            unique_labels(y_multilabel, y_multiclass)

    with pytest.raises(ValueError):
        unique_labels([[1, 2]], [["a", "d"]])

    with pytest.raises(ValueError):
        unique_labels(["1", 2])

    with pytest.raises(ValueError):
        unique_labels([["1", 2], [1, 3]])

    with pytest.raises(ValueError):
        unique_labels([["1", "2"], [2, 3]])


def test_is_multilabel():
    for group, group_examples in EXAMPLES.items():
        dense_exp = group == "multilabel-indicator"

        for example in group_examples:
            # Only mark explicitly defined sparse examples as valid sparse
            # multilabel-indicators
            sparse_exp = dense_exp and issparse(example)

            if issparse(example) or (
                hasattr(example, "__array__")
                and np.asarray(example).ndim == 2
                and np.asarray(example).dtype.kind in "biuf"
                and np.asarray(example).shape[1] > 0
            ):
                examples_sparse = [
                    sparse_container(example)
                    for sparse_container in (
                        COO_CONTAINERS
                        + CSC_CONTAINERS
                        + CSR_CONTAINERS
                        + DOK_CONTAINERS
                        + LIL_CONTAINERS
                    )
                ]
                for exmpl_sparse in examples_sparse:
                    assert sparse_exp == is_multilabel(
                        exmpl_sparse
                    ), f"is_multilabel({exmpl_sparse!r}) should be {sparse_exp}"

            # Densify sparse examples before testing
            if issparse(example):
                example = example.toarray()

            assert dense_exp == is_multilabel(
                example
            ), f"is_multilabel({example!r}) should be {dense_exp}"


@pytest.mark.parametrize(
    "array_namespace, device, dtype_name",
    yield_namespace_device_dtype_combinations(),
)
def test_is_multilabel_array_api_compliance(array_namespace, device, dtype_name):
    xp = _array_api_for_tests(array_namespace, device)

    for group, group_examples in ARRAY_API_EXAMPLES.items():
        dense_exp = group == "multilabel-indicator"
        for example in group_examples:
            if np.asarray(example).dtype.kind == "f":
                example = np.asarray(example, dtype=dtype_name)
            else:
                example = np.asarray(example)
            example = xp.asarray(example, device=device)

            with config_context(array_api_dispatch=True):
                assert dense_exp == is_multilabel(
                    example
                ), f"is_multilabel({example!r}) should be {dense_exp}"


def test_check_classification_targets():
    for y_type in EXAMPLES.keys():
        if y_type in ["unknown", "continuous", "continuous-multioutput"]:
            for example in EXAMPLES[y_type]:
                msg = "Unknown label type: "
                with pytest.raises(ValueError, match=msg):
                    check_classification_targets(example)
        else:
            for example in EXAMPLES[y_type]:
                check_classification_targets(example)


def test_type_of_target():
    for group, group_examples in EXAMPLES.items():
        for example in group_examples:
            assert (
                type_of_target(example) == group
            ), "type_of_target(%r) should be %r, got %r" % (
                example,
                group,
                type_of_target(example),
            )

    for example in NON_ARRAY_LIKE_EXAMPLES:
        msg_regex = r"Expected array-like \(array or non-string sequence\).*"
        with pytest.raises(ValueError, match=msg_regex):
            type_of_target(example)

    for example in MULTILABEL_SEQUENCES:
        msg = (
            "You appear to be using a legacy multi-label data "
            "representation. Sequence of sequences are no longer supported;"
            " use a binary array or sparse matrix instead."
        )
        with pytest.raises(ValueError, match=msg):
            type_of_target(example)


def test_type_of_target_pandas_sparse():
    pd = pytest.importorskip("pandas")

    y = pd.arrays.SparseArray([1, np.nan, np.nan, 1, np.nan])
    msg = "y cannot be class 'SparseSeries' or 'SparseArray'"
    with pytest.raises(ValueError, match=msg):
        type_of_target(y)


def test_type_of_target_pandas_nullable():
    """Check that type_of_target works with pandas nullable dtypes."""
    pd = pytest.importorskip("pandas")

    for dtype in ["Int32", "Float32"]:
        y_true = pd.Series([1, 0, 2, 3, 4], dtype=dtype)
        assert type_of_target(y_true) == "multiclass"

        y_true = pd.Series([1, 0, 1, 0], dtype=dtype)
        assert type_of_target(y_true) == "binary"

    y_true = pd.DataFrame([[1.4, 3.1], [3.1, 1.4]], dtype="Float32")
    assert type_of_target(y_true) == "continuous-multioutput"

    y_true = pd.DataFrame([[0, 1], [1, 1]], dtype="Int32")
    assert type_of_target(y_true) == "multilabel-indicator"

    y_true = pd.DataFrame([[1, 2], [3, 1]], dtype="Int32")
    assert type_of_target(y_true) == "multiclass-multioutput"


@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
def test_unique_labels_pandas_nullable(dtype):
    """Checks that unique_labels works with pandas nullable dtypes.

    Non-regression test for gh-25634.
    """
    pd = pytest.importorskip("pandas")

    y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype)
    y_predicted = pd.Series([0, 0, 1, 1, 0, 1, 1, 1, 1], dtype="int64")

    labels = unique_labels(y_true, y_predicted)
    assert_array_equal(labels, [0, 1])


@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_class_distribution(csc_container):
    y = np.array(
        [
            [1, 0, 0, 1],
            [2, 2, 0, 1],
            [1, 3, 0, 1],
            [4, 2, 0, 1],
            [2, 0, 0, 1],
            [1, 3, 0, 1],
        ]
    )
    # Define the sparse matrix with a mix of implicit and explicit zeros
    data = np.array([1, 2, 1, 4, 2, 1, 0, 2, 3, 2, 3, 1, 1, 1, 1, 1, 1])
    indices = np.array([0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 5, 0, 1, 2, 3, 4, 5])
    indptr = np.array([0, 6, 11, 11, 17])
    y_sp = csc_container((data, indices, indptr), shape=(6, 4))

    classes, n_classes, class_prior = class_distribution(y)
    classes_sp, n_classes_sp, class_prior_sp = class_distribution(y_sp)
    classes_expected = [[1, 2, 4], [0, 2, 3], [0], [1]]
    n_classes_expected = [3, 3, 1, 1]
    class_prior_expected = [[3 / 6, 2 / 6, 1 / 6], [1 / 3, 1 / 3, 1 / 3], [1.0], [1.0]]

    for k in range(y.shape[1]):
        assert_array_almost_equal(classes[k], classes_expected[k])
        assert_array_almost_equal(n_classes[k], n_classes_expected[k])
        assert_array_almost_equal(class_prior[k], class_prior_expected[k])

        assert_array_almost_equal(classes_sp[k], classes_expected[k])
        assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k])
        assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k])

    # Test again with explicit sample weights
    (classes, n_classes, class_prior) = class_distribution(
        y, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0]
    )
    (classes_sp, n_classes_sp, class_prior_sp) = class_distribution(
        y, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0]
    )
    class_prior_expected = [[4 / 9, 3 / 9, 2 / 9], [2 / 9, 4 / 9, 3 / 9], [1.0], [1.0]]

    for k in range(y.shape[1]):
        assert_array_almost_equal(classes[k], classes_expected[k])
        assert_array_almost_equal(n_classes[k], n_classes_expected[k])
        assert_array_almost_equal(class_prior[k], class_prior_expected[k])

        assert_array_almost_equal(classes_sp[k], classes_expected[k])
        assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k])
        assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k])


def test_safe_split_with_precomputed_kernel():
    clf = SVC()
    clfp = SVC(kernel="precomputed")

    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    K = np.dot(X, X.T)

    cv = ShuffleSplit(test_size=0.25, random_state=0)
    train, test = list(cv.split(X))[0]

    X_train, y_train = _safe_split(clf, X, y, train)
    K_train, y_train2 = _safe_split(clfp, K, y, train)
    assert_array_almost_equal(K_train, np.dot(X_train, X_train.T))
    assert_array_almost_equal(y_train, y_train2)

    X_test, y_test = _safe_split(clf, X, y, test, train)
    K_test, y_test2 = _safe_split(clfp, K, y, test, train)
    assert_array_almost_equal(K_test, np.dot(X_test, X_train.T))
    assert_array_almost_equal(y_test, y_test2)


def test_ovr_decision_function():
    # test properties of the ovr decision function

    predictions = np.array([[0, 1, 1], [0, 1, 0], [0, 1, 1], [0, 1, 1]])

    confidences = np.array(
        [[-1e16, 0, -1e16], [1.0, 2.0, -3.0], [-5.0, 2.0, 5.0], [-0.5, 0.2, 0.5]]
    )

    n_classes = 3

    dec_values = _ovr_decision_function(predictions, confidences, n_classes)

    # check that the decision values are within 0.5 of the votes
    votes = np.array([[1, 0, 2], [1, 1, 1], [1, 0, 2], [1, 0, 2]])

    assert_allclose(votes, dec_values, atol=0.5)

    # check that the predictions are what we expect:
    # highest vote, or highest confidence if there is a tie.
    # for the second sample we have a tie (should be won by class 1)
    expected_prediction = np.array([2, 1, 2, 2])
    assert_array_equal(np.argmax(dec_values, axis=1), expected_prediction)

    # the third and fourth samples have the same vote, but the third sample
    # has higher confidence; this should be reflected in the decision values
    assert dec_values[2, 2] > dec_values[3, 2]

    # assert subset invariance.
    dec_values_one = [
        _ovr_decision_function(
            np.array([predictions[i]]), np.array([confidences[i]]), n_classes
        )[0]
        for i in range(4)
    ]

    assert_allclose(dec_values, dec_values_one, atol=1e-6)


# TODO(1.7): Change to ValueError when byte labels are deprecated.
@pytest.mark.parametrize("input_type", ["list", "array"])
def test_labels_in_bytes_format(input_type):
    # check that we raise an error with bytes encoded labels
    # non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/16980
    target = _convert_container([b"a", b"b"], input_type)
    err_msg = (
        "Support for labels represented as bytes is deprecated in v1.5 and will"
        " error in v1.7. Convert the labels to a string or integer format."
    )
    with pytest.warns(FutureWarning, match=err_msg):
        type_of_target(target)
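

# A small usage sketch (not from the original file): type_of_target is the
# dispatcher that the EXAMPLES groups above are validating, mapping a target
# array to one of the group names used as keys in those dicts.
def _type_of_target_sketch():  # hypothetical helper, for illustration only
    assert type_of_target([0, 1, 1, 0]) == "binary"
    assert type_of_target([1, 0, 2]) == "multiclass"
    assert type_of_target([0.2, 1.7]) == "continuous"
    assert type_of_target([[0, 1], [1, 1]]) == "multilabel-indicator"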
@@ -0,0 +1,74 @@
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#
# License: BSD 3 clause

import numpy as np
from numpy.testing import assert_array_almost_equal, assert_array_equal

from sklearn.utils.murmurhash import murmurhash3_32


def test_mmhash3_int():
    assert murmurhash3_32(3) == 847579505
    assert murmurhash3_32(3, seed=0) == 847579505
    assert murmurhash3_32(3, seed=42) == -1823081949

    assert murmurhash3_32(3, positive=False) == 847579505
    assert murmurhash3_32(3, seed=0, positive=False) == 847579505
    assert murmurhash3_32(3, seed=42, positive=False) == -1823081949

    assert murmurhash3_32(3, positive=True) == 847579505
    assert murmurhash3_32(3, seed=0, positive=True) == 847579505
    assert murmurhash3_32(3, seed=42, positive=True) == 2471885347


def test_mmhash3_int_array():
    rng = np.random.RandomState(42)
    keys = rng.randint(-5342534, 345345, size=3 * 2 * 1).astype(np.int32)
    keys = keys.reshape((3, 2, 1))

    for seed in [0, 42]:
        expected = np.array([murmurhash3_32(int(k), seed) for k in keys.flat])
        expected = expected.reshape(keys.shape)
        assert_array_equal(murmurhash3_32(keys, seed), expected)

    for seed in [0, 42]:
        expected = np.array([murmurhash3_32(k, seed, positive=True) for k in keys.flat])
        expected = expected.reshape(keys.shape)
        assert_array_equal(murmurhash3_32(keys, seed, positive=True), expected)


def test_mmhash3_bytes():
    assert murmurhash3_32(b"foo", 0) == -156908512
    assert murmurhash3_32(b"foo", 42) == -1322301282

    assert murmurhash3_32(b"foo", 0, positive=True) == 4138058784
    assert murmurhash3_32(b"foo", 42, positive=True) == 2972666014


def test_mmhash3_unicode():
    assert murmurhash3_32("foo", 0) == -156908512
    assert murmurhash3_32("foo", 42) == -1322301282

    assert murmurhash3_32("foo", 0, positive=True) == 4138058784
    assert murmurhash3_32("foo", 42, positive=True) == 2972666014


def test_no_collision_on_byte_range():
    previous_hashes = set()
    for i in range(100):
        h = murmurhash3_32(" " * i, 0)
        assert h not in previous_hashes, "Found collision on growing empty string"
        previous_hashes.add(h)  # record the hash so later iterations compare to it


def test_uniform_distribution():
    n_bins, n_samples = 10, 100000
    bins = np.zeros(n_bins, dtype=np.float64)

    for i in range(n_samples):
        bins[murmurhash3_32(i, positive=True) % n_bins] += 1

    means = bins / n_samples
    expected = np.full(n_bins, 1.0 / n_bins)

    assert_array_almost_equal(means / expected, np.ones(n_bins), 2)
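

# An illustrative sketch (not in the original file) of the property the
# uniformity test above relies on: stable, well-distributed 32-bit hashes
# give reproducible "hashing trick" feature indices across processes.
def _hashing_trick_sketch():  # hypothetical helper, for illustration only
    n_features = 2**10
    token = "some_feature=value"
    index = murmurhash3_32(token, seed=0, positive=True) % n_features
    assert 0 <= index < n_features
    # the same token always lands in the same bucket
    assert index == murmurhash3_32(token, seed=0, positive=True) % n_features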
@@ -0,0 +1,158 @@
import numpy as np
import pytest
from scipy.optimize import fmin_ncg

from sklearn.exceptions import ConvergenceWarning
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils.optimize import _newton_cg


def test_newton_cg():
    # Test that newton_cg gives the same result as scipy's fmin_ncg

    rng = np.random.RandomState(0)
    A = rng.normal(size=(10, 10))
    x0 = np.ones(10)

    def func(x):
        Ax = A.dot(x)
        return 0.5 * (Ax).dot(Ax)

    def grad(x):
        return A.T.dot(A.dot(x))

    def hess(x, p):
        # returns p @ (A.T @ A), which equals (A.T @ A) @ p because A.T @ A
        # is symmetric
        return p.dot(A.T.dot(A.dot(x.all())))

    def grad_hess(x):
        return grad(x), lambda x: A.T.dot(A.dot(x))

    assert_array_almost_equal(
        _newton_cg(grad_hess, func, grad, x0, tol=1e-10)[0],
        fmin_ncg(f=func, x0=x0, fprime=grad, fhess_p=hess),
    )


@pytest.mark.parametrize("verbose", [0, 1, 2])
def test_newton_cg_verbosity(capsys, verbose):
    """Test the std output of verbose newton_cg solver."""
    A = np.eye(2)
    b = np.array([1, 2], dtype=float)

    _newton_cg(
        grad_hess=lambda x: (A @ x - b, lambda z: A @ z),
        func=lambda x: 0.5 * x @ A @ x - b @ x,
        grad=lambda x: A @ x - b,
        x0=np.zeros(A.shape[0]),
        verbose=verbose,
    )  # returns array([1., 2.])
    captured = capsys.readouterr()

    if verbose == 0:
        assert captured.out == ""
    else:
        msg = [
            "Newton-CG iter = 1",
            "Check Convergence",
            "max |gradient|",
            "Solver did converge at loss = ",
        ]
        for m in msg:
            assert m in captured.out

    if verbose >= 2:
        msg = [
            "Inner CG solver iteration 1 stopped with",
            "sum(|residuals|) <= tol",
            "Line Search",
            "try line search wolfe1",
            "wolfe1 line search was successful",
        ]
        for m in msg:
            assert m in captured.out

    if verbose >= 2:
        # Set up a badly scaled singular Hessian with a completely wrong
        # starting position. This should trigger the second line-search check.
        A = np.array([[1.0, 2], [2, 4]]) * 1e30  # collinear columns
        b = np.array([1.0, 2.0])
        # Note that scipy.optimize._linesearch LineSearchWarning inherits from
        # RuntimeWarning, but we do not want to import from non-public APIs.
        with pytest.warns(RuntimeWarning):
            _newton_cg(
                grad_hess=lambda x: (A @ x - b, lambda z: A @ z),
                func=lambda x: 0.5 * x @ A @ x - b @ x,
                grad=lambda x: A @ x - b,
                x0=np.array([-2.0, 1]),  # null space of the Hessian
                verbose=verbose,
            )
        captured = capsys.readouterr()
        msg = [
            "wolfe1 line search was not successful",
            "check loss |improvement| <= eps * |loss_old|:",
            "check sum(|gradient|) < sum(|gradient_old|):",
            "last resort: try line search wolfe2",
        ]
        for m in msg:
            assert m in captured.out

        # Set up a badly conditioned Hessian that leads to tiny curvature.
        # X.T @ X has singular values array([1.00000400e+01, 1.00008192e-11])
        A = np.array([[1.0, 2], [1, 2 + 1e-15]])
        b = np.array([-2.0, 1])
        with pytest.warns(ConvergenceWarning):
            _newton_cg(
                grad_hess=lambda x: (A @ x - b, lambda z: A @ z),
                func=lambda x: 0.5 * x @ A @ x - b @ x,
                grad=lambda x: A @ x - b,
                x0=b,
                verbose=verbose,
                maxiter=2,
            )
        captured = capsys.readouterr()
        msg = [
            "tiny_|p| = eps * ||p||^2",
        ]
        for m in msg:
            assert m in captured.out

        # Test for a case with a negative Hessian. We do not trigger the
        # "Inner CG solver iteration {i} stopped with negative curvature"
        # message, as that is very hard to trigger.
        A = np.eye(2)
        b = np.array([-2.0, 1])
        with pytest.warns(RuntimeWarning):
            _newton_cg(
                # Note the wrong sign in the hessian product.
                grad_hess=lambda x: (A @ x - b, lambda z: -A @ z),
                func=lambda x: 0.5 * x @ A @ x - b @ x,
                grad=lambda x: A @ x - b,
                x0=np.array([1.0, 1.0]),
                verbose=verbose,
                maxiter=3,
            )
        captured = capsys.readouterr()
        msg = [
            "Inner CG solver iteration 0 fell back to steepest descent",
        ]
        for m in msg:
            assert m in captured.out

        A = np.diag([1e-3, 1, 1e3])
        b = np.array([-2.0, 1, 2.0])
        with pytest.warns(ConvergenceWarning):
            _newton_cg(
                grad_hess=lambda x: (A @ x - b, lambda z: A @ z),
                func=lambda x: 0.5 * x @ A @ x - b @ x,
                grad=lambda x: A @ x - b,
                x0=np.ones_like(b),
                verbose=verbose,
                maxiter=2,
                maxinner=1,
            )
        captured = capsys.readouterr()
        msg = [
            "Inner CG solver stopped reaching maxiter=1",
        ]
        for m in msg:
            assert m in captured.out
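

# A minimal sketch (not from the original file) of the calling convention the
# tests above exercise: _newton_cg minimizes f(x) = 0.5 x'Ax - b'x given a
# callable that returns (gradient, hessian-vector-product closure). The
# two-element unpacking of the return value mirrors the `[0]` indexing used
# in test_newton_cg above.
def _newton_cg_sketch():  # hypothetical helper, for illustration only
    A = np.diag([1.0, 2.0])
    b = np.array([1.0, 1.0])
    xmin, _ = _newton_cg(
        grad_hess=lambda x: (A @ x - b, lambda z: A @ z),
        func=lambda x: 0.5 * x @ A @ x - b @ x,
        grad=lambda x: A @ x - b,
        x0=np.zeros(2),
    )
    # the unique minimizer solves A x = b, i.e. x = [1.0, 0.5]
    assert_array_almost_equal(xmin, np.linalg.solve(A, b))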
@@ -0,0 +1,100 @@
import time

import joblib
import numpy as np
import pytest
from numpy.testing import assert_array_equal

from sklearn import config_context, get_config
from sklearn.compose import make_column_transformer
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils.parallel import Parallel, delayed


def get_working_memory():
    return get_config()["working_memory"]


@pytest.mark.parametrize("n_jobs", [1, 2])
@pytest.mark.parametrize("backend", ["loky", "threading", "multiprocessing"])
def test_configuration_passes_through_to_joblib(n_jobs, backend):
    # Test that the global configuration is passed to the joblib jobs

    with config_context(working_memory=123):
        results = Parallel(n_jobs=n_jobs, backend=backend)(
            delayed(get_working_memory)() for _ in range(2)
        )

    assert_array_equal(results, [123] * 2)


def test_parallel_delayed_warnings():
    """Informative warnings should be raised when mixing sklearn and joblib API"""
    # We should issue a warning when one wants to use sklearn.utils.parallel.Parallel
    # with joblib.delayed. The config will not be propagated to the workers.
    warn_msg = "`sklearn.utils.parallel.Parallel` needs to be used in conjunction"
    with pytest.warns(UserWarning, match=warn_msg) as records:
        Parallel()(joblib.delayed(time.sleep)(0) for _ in range(10))
    assert len(records) == 10

    # We should issue a warning if one wants to use sklearn.utils.parallel.delayed
    # with joblib.Parallel
    warn_msg = (
        "`sklearn.utils.parallel.delayed` should be used with "
        "`sklearn.utils.parallel.Parallel` to make it possible to propagate"
    )
    with pytest.warns(UserWarning, match=warn_msg) as records:
        joblib.Parallel()(delayed(time.sleep)(0) for _ in range(10))
    assert len(records) == 10


@pytest.mark.parametrize("n_jobs", [1, 2])
def test_dispatch_config_parallel(n_jobs):
    """Check that we properly dispatch the configuration in parallel processing.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/25239
    """
    pd = pytest.importorskip("pandas")
    iris = load_iris(as_frame=True)

    class TransformerRequiredDataFrame(StandardScaler):
        def fit(self, X, y=None):
            assert isinstance(X, pd.DataFrame), "X should be a DataFrame"
            return super().fit(X, y)

        def transform(self, X, y=None):
            assert isinstance(X, pd.DataFrame), "X should be a DataFrame"
            return super().transform(X, y)

    dropper = make_column_transformer(
        ("drop", [0]),
        remainder="passthrough",
        n_jobs=n_jobs,
    )
    param_grid = {"randomforestclassifier__max_depth": [1, 2, 3]}
    search_cv = GridSearchCV(
        make_pipeline(
            dropper,
            TransformerRequiredDataFrame(),
            RandomForestClassifier(n_estimators=5, n_jobs=n_jobs),
        ),
        param_grid,
        cv=5,
        n_jobs=n_jobs,
        error_score="raise",  # this search should not fail
    )

    # make sure that `fit` would fail in case we don't request a dataframe
    with pytest.raises(AssertionError, match="X should be a DataFrame"):
        search_cv.fit(iris.data, iris.target)

    with config_context(transform_output="pandas"):
        # we expect each intermediate step to output a DataFrame
        search_cv.fit(iris.data, iris.target)

    assert not np.isnan(search_cv.cv_results_["mean_test_score"]).any()
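

# A short sketch (not part of the original file) of why sklearn wraps joblib:
# sklearn.utils.parallel.delayed captures the active sklearn config at
# dispatch time so that workers see it, which plain joblib.delayed does not.
def _config_propagation_sketch():  # hypothetical helper, for illustration only
    with config_context(working_memory=77):
        results = Parallel(n_jobs=2)(delayed(get_working_memory)() for _ in range(2))
    # each worker observed the caller's configuration
    assert results == [77, 77]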
@@ -0,0 +1,785 @@
from numbers import Integral, Real

import numpy as np
import pytest
from scipy.sparse import csr_matrix

from sklearn._config import config_context, get_config
from sklearn.base import BaseEstimator, _fit_context
from sklearn.model_selection import LeaveOneOut
from sklearn.utils import deprecated
from sklearn.utils._param_validation import (
    HasMethods,
    Hidden,
    Interval,
    InvalidParameterError,
    MissingValues,
    Options,
    RealNotInt,
    StrOptions,
    _ArrayLikes,
    _Booleans,
    _Callables,
    _CVObjects,
    _InstancesOf,
    _IterablesNotString,
    _NanConstraint,
    _NoneConstraint,
    _PandasNAConstraint,
    _RandomStates,
    _SparseMatrices,
    _VerboseHelper,
    generate_invalid_param_val,
    generate_valid_param,
    make_constraint,
    validate_params,
)
from sklearn.utils.fixes import CSR_CONTAINERS


# Some helpers for the tests
@validate_params(
    {"a": [Real], "b": [Real], "c": [Real], "d": [Real]},
    prefer_skip_nested_validation=True,
)
def _func(a, b=0, *args, c, d=0, **kwargs):
    """A function to test the validation of functions."""


class _Class:
    """A class to test the _InstancesOf constraint and the validation of methods."""

    @validate_params({"a": [Real]}, prefer_skip_nested_validation=True)
    def _method(self, a):
        """A validated method"""

    @deprecated()
    @validate_params({"a": [Real]}, prefer_skip_nested_validation=True)
    def _deprecated_method(self, a):
        """A deprecated validated method"""


class _Estimator(BaseEstimator):
    """An estimator to test the validation of estimator parameters."""

    _parameter_constraints: dict = {"a": [Real]}

    def __init__(self, a):
        self.a = a

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X=None, y=None):
        pass


@pytest.mark.parametrize("interval_type", [Integral, Real])
def test_interval_range(interval_type):
    """Check the range of values depending on closed."""
    interval = Interval(interval_type, -2, 2, closed="left")
    assert -2 in interval
    assert 2 not in interval

    interval = Interval(interval_type, -2, 2, closed="right")
    assert -2 not in interval
    assert 2 in interval

    interval = Interval(interval_type, -2, 2, closed="both")
    assert -2 in interval
    assert 2 in interval

    interval = Interval(interval_type, -2, 2, closed="neither")
    assert -2 not in interval
    assert 2 not in interval
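

# A compact sketch (not in the original file) of the Interval API checked
# above: membership testing with `in` is how validate_params decides whether
# a passed value satisfies a numeric range constraint.
def _interval_sketch():  # hypothetical helper, for illustration only
    c = Interval(Real, 0, 1, closed="left")  # the half-open interval [0, 1)
    assert 0 in c and 0.5 in c
    assert 1 not in c and -0.1 not in c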


@pytest.mark.parametrize("interval_type", [Integral, Real])
def test_interval_large_integers(interval_type):
    """Check that the Interval constraint works with large integers.

    Non-regression test for #26648.
    """
    interval = Interval(interval_type, 0, 2, closed="neither")
    assert 2**65 not in interval
    assert 2**128 not in interval
    assert float(2**65) not in interval
    assert float(2**128) not in interval

    interval = Interval(interval_type, 0, 2**128, closed="neither")
    assert 2**65 in interval
    assert 2**128 not in interval
    assert float(2**65) in interval
    assert float(2**128) not in interval

    assert 2**1024 not in interval


def test_interval_inf_in_bounds():
    """Check that inf is included iff a bound is closed and set to None.

    Only valid for real intervals.
    """
    interval = Interval(Real, 0, None, closed="right")
    assert np.inf in interval

    interval = Interval(Real, None, 0, closed="left")
    assert -np.inf in interval

    interval = Interval(Real, None, None, closed="neither")
    assert np.inf not in interval
    assert -np.inf not in interval


@pytest.mark.parametrize(
    "interval",
    [Interval(Real, 0, 1, closed="left"), Interval(Real, None, None, closed="both")],
)
def test_nan_not_in_interval(interval):
    """Check that np.nan is not in any interval."""
    assert np.nan not in interval


@pytest.mark.parametrize(
    "params, error, match",
    [
        (
            {"type": Integral, "left": 1.0, "right": 2, "closed": "both"},
            TypeError,
            r"Expecting left to be an int for an interval over the integers",
        ),
        (
            {"type": Integral, "left": 1, "right": 2.0, "closed": "neither"},
            TypeError,
            "Expecting right to be an int for an interval over the integers",
        ),
        (
            {"type": Integral, "left": None, "right": 0, "closed": "left"},
            ValueError,
            r"left can't be None when closed == left",
        ),
        (
            {"type": Integral, "left": 0, "right": None, "closed": "right"},
            ValueError,
            r"right can't be None when closed == right",
        ),
        (
            {"type": Integral, "left": 1, "right": -1, "closed": "both"},
            ValueError,
            r"right can't be less than left",
        ),
    ],
)
def test_interval_errors(params, error, match):
    """Check that informative errors are raised for invalid combinations of parameters"""
    with pytest.raises(error, match=match):
        Interval(**params)


def test_stroptions():
    """Sanity check for the StrOptions constraint"""
    options = StrOptions({"a", "b", "c"}, deprecated={"c"})
    assert options.is_satisfied_by("a")
    assert options.is_satisfied_by("c")
    assert not options.is_satisfied_by("d")

    assert "'c' (deprecated)" in str(options)


def test_options():
    """Sanity check for the Options constraint"""
    options = Options(Real, {-0.5, 0.5, np.inf}, deprecated={-0.5})
    assert options.is_satisfied_by(-0.5)
    assert options.is_satisfied_by(np.inf)
    assert not options.is_satisfied_by(1.23)

    assert "-0.5 (deprecated)" in str(options)


@pytest.mark.parametrize(
    "type, expected_type_name",
    [
        (int, "int"),
        (Integral, "int"),
        (Real, "float"),
        (np.ndarray, "numpy.ndarray"),
    ],
)
def test_instances_of_type_human_readable(type, expected_type_name):
    """Check the string representation of the _InstancesOf constraint."""
    constraint = _InstancesOf(type)
    assert str(constraint) == f"an instance of '{expected_type_name}'"


def test_hasmethods():
    """Check the HasMethods constraint."""
    constraint = HasMethods(["a", "b"])

    class _Good:
        def a(self):
            pass  # pragma: no cover

        def b(self):
            pass  # pragma: no cover

    class _Bad:
        def a(self):
            pass  # pragma: no cover

    assert constraint.is_satisfied_by(_Good())
    assert not constraint.is_satisfied_by(_Bad())
    assert str(constraint) == "an object implementing 'a' and 'b'"
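

# A brief sketch (not from the original file) tying the constraints to their
# consumer: validate_params raises InvalidParameterError when an argument
# fails every declared constraint. The function below is hypothetical and
# exists only to illustrate the decorator.
@validate_params(
    {"x": [Interval(Real, 0, None, closed="left")]},
    prefer_skip_nested_validation=True,
)
def _nonnegative_sqrt(x):  # hypothetical function, for illustration only
    return x**0.5


def _validate_params_sketch():
    assert _nonnegative_sqrt(x=4.0) == 2.0
    with pytest.raises(InvalidParameterError):
        _nonnegative_sqrt(x=-1.0)  # fails the [0, inf) interval constraint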


@pytest.mark.parametrize(
    "constraint",
    [
        Interval(Real, None, 0, closed="left"),
        Interval(Real, 0, None, closed="left"),
        Interval(Real, None, None, closed="neither"),
        StrOptions({"a", "b", "c"}),
        MissingValues(),
        MissingValues(numeric_only=True),
        _VerboseHelper(),
        HasMethods("fit"),
        _IterablesNotString(),
        _CVObjects(),
    ],
)
def test_generate_invalid_param_val(constraint):
    """Check that the value generated does not satisfy the constraint"""
    bad_value = generate_invalid_param_val(constraint)
    assert not constraint.is_satisfied_by(bad_value)


@pytest.mark.parametrize(
    "integer_interval, real_interval",
    [
        (
            Interval(Integral, None, 3, closed="right"),
            Interval(RealNotInt, -5, 5, closed="both"),
        ),
        (
            Interval(Integral, None, 3, closed="right"),
            Interval(RealNotInt, -5, 5, closed="neither"),
        ),
        (
            Interval(Integral, None, 3, closed="right"),
            Interval(RealNotInt, 4, 5, closed="both"),
        ),
        (
            Interval(Integral, None, 3, closed="right"),
            Interval(RealNotInt, 5, None, closed="left"),
        ),
        (
            Interval(Integral, None, 3, closed="right"),
            Interval(RealNotInt, 4, None, closed="neither"),
        ),
        (
            Interval(Integral, 3, None, closed="left"),
            Interval(RealNotInt, -5, 5, closed="both"),
        ),
        (
            Interval(Integral, 3, None, closed="left"),
            Interval(RealNotInt, -5, 5, closed="neither"),
        ),
        (
            Interval(Integral, 3, None, closed="left"),
            Interval(RealNotInt, 1, 2, closed="both"),
        ),
        (
            Interval(Integral, 3, None, closed="left"),
            Interval(RealNotInt, None, -5, closed="left"),
        ),
        (
            Interval(Integral, 3, None, closed="left"),
            Interval(RealNotInt, None, -4, closed="neither"),
        ),
        (
            Interval(Integral, -5, 5, closed="both"),
            Interval(RealNotInt, None, 1, closed="right"),
        ),
        (
            Interval(Integral, -5, 5, closed="both"),
            Interval(RealNotInt, 1, None, closed="left"),
        ),
        (
            Interval(Integral, -5, 5, closed="both"),
            Interval(RealNotInt, -10, -4, closed="neither"),
        ),
        (
            Interval(Integral, -5, 5, closed="both"),
            Interval(RealNotInt, -10, -4, closed="right"),
        ),
        (
            Interval(Integral, -5, 5, closed="neither"),
            Interval(RealNotInt, 6, 10, closed="neither"),
        ),
        (
            Interval(Integral, -5, 5, closed="neither"),
            Interval(RealNotInt, 6, 10, closed="left"),
        ),
        (
            Interval(Integral, 2, None, closed="left"),
            Interval(RealNotInt, 0, 1, closed="both"),
        ),
        (
            Interval(Integral, 1, None, closed="left"),
            Interval(RealNotInt, 0, 1, closed="both"),
        ),
    ],
)
def test_generate_invalid_param_val_2_intervals(integer_interval, real_interval):
    """Check that the value generated for an interval constraint does not satisfy
    any of the interval constraints.
    """
    bad_value = generate_invalid_param_val(constraint=real_interval)
    assert not real_interval.is_satisfied_by(bad_value)
    assert not integer_interval.is_satisfied_by(bad_value)

    bad_value = generate_invalid_param_val(constraint=integer_interval)
    assert not real_interval.is_satisfied_by(bad_value)
    assert not integer_interval.is_satisfied_by(bad_value)


@pytest.mark.parametrize(
    "constraint",
    [
        _ArrayLikes(),
        _InstancesOf(list),
        _Callables(),
        _NoneConstraint(),
        _RandomStates(),
        _SparseMatrices(),
        _Booleans(),
        Interval(Integral, None, None, closed="neither"),
    ],
)
def test_generate_invalid_param_val_all_valid(constraint):
    """Check that the function raises NotImplementedError when there's no invalid
    value for the constraint.
    """
    with pytest.raises(NotImplementedError):
        generate_invalid_param_val(constraint)


@pytest.mark.parametrize(
    "constraint",
    [
        _ArrayLikes(),
        _Callables(),
        _InstancesOf(list),
        _NoneConstraint(),
        _RandomStates(),
        _SparseMatrices(),
        _Booleans(),
        _VerboseHelper(),
        MissingValues(),
        MissingValues(numeric_only=True),
        StrOptions({"a", "b", "c"}),
        Options(Integral, {1, 2, 3}),
        Interval(Integral, None, None, closed="neither"),
        Interval(Integral, 0, 10, closed="neither"),
        Interval(Integral, 0, None, closed="neither"),
        Interval(Integral, None, 0, closed="neither"),
        Interval(Real, 0, 1, closed="neither"),
        Interval(Real, 0, None, closed="both"),
        Interval(Real, None, 0, closed="right"),
        HasMethods("fit"),
        _IterablesNotString(),
        _CVObjects(),
    ],
)
def test_generate_valid_param(constraint):
    """Check that the value generated does satisfy the constraint."""
    value = generate_valid_param(constraint)
|
||||
assert constraint.is_satisfied_by(value)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"constraint_declaration, value",
|
||||
[
|
||||
(Interval(Real, 0, 1, closed="both"), 0.42),
|
||||
(Interval(Integral, 0, None, closed="neither"), 42),
|
||||
(StrOptions({"a", "b", "c"}), "b"),
|
||||
(Options(type, {np.float32, np.float64}), np.float64),
|
||||
(callable, lambda x: x + 1),
|
||||
(None, None),
|
||||
("array-like", [[1, 2], [3, 4]]),
|
||||
("array-like", np.array([[1, 2], [3, 4]])),
|
||||
("sparse matrix", csr_matrix([[1, 2], [3, 4]])),
|
||||
*[
|
||||
("sparse matrix", container([[1, 2], [3, 4]]))
|
||||
for container in CSR_CONTAINERS
|
||||
],
|
||||
("random_state", 0),
|
||||
("random_state", np.random.RandomState(0)),
|
||||
("random_state", None),
|
||||
(_Class, _Class()),
|
||||
(int, 1),
|
||||
(Real, 0.5),
|
||||
("boolean", False),
|
||||
("verbose", 1),
|
||||
("nan", np.nan),
|
||||
(MissingValues(), -1),
|
||||
(MissingValues(), -1.0),
|
||||
(MissingValues(), 2**1028),
|
||||
(MissingValues(), None),
|
||||
(MissingValues(), float("nan")),
|
||||
(MissingValues(), np.nan),
|
||||
(MissingValues(), "missing"),
|
||||
(HasMethods("fit"), _Estimator(a=0)),
|
||||
("cv_object", 5),
|
||||
],
|
||||
)
|
||||
def test_is_satisfied_by(constraint_declaration, value):
|
||||
"""Sanity check for the is_satisfied_by method"""
|
||||
constraint = make_constraint(constraint_declaration)
|
||||
assert constraint.is_satisfied_by(value)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"constraint_declaration, expected_constraint_class",
|
||||
[
|
||||
(Interval(Real, 0, 1, closed="both"), Interval),
|
||||
(StrOptions({"option1", "option2"}), StrOptions),
|
||||
(Options(Real, {0.42, 1.23}), Options),
|
||||
("array-like", _ArrayLikes),
|
||||
("sparse matrix", _SparseMatrices),
|
||||
("random_state", _RandomStates),
|
||||
(None, _NoneConstraint),
|
||||
(callable, _Callables),
|
||||
(int, _InstancesOf),
|
||||
("boolean", _Booleans),
|
||||
("verbose", _VerboseHelper),
|
||||
(MissingValues(numeric_only=True), MissingValues),
|
||||
(HasMethods("fit"), HasMethods),
|
||||
("cv_object", _CVObjects),
|
||||
("nan", _NanConstraint),
|
||||
],
|
||||
)
|
||||
def test_make_constraint(constraint_declaration, expected_constraint_class):
|
||||
"""Check that make_constraint dispatches to the appropriate constraint class"""
|
||||
constraint = make_constraint(constraint_declaration)
|
||||
assert constraint.__class__ is expected_constraint_class
|
||||
|
||||
|
||||
def test_make_constraint_unknown():
|
||||
"""Check that an informative error is raised when an unknown constraint is passed"""
|
||||
with pytest.raises(ValueError, match="Unknown constraint"):
|
||||
make_constraint("not a valid constraint")
|
||||
|
||||
|
||||
def test_validate_params():
|
||||
"""Check that validate_params works no matter how the arguments are passed"""
|
||||
with pytest.raises(
|
||||
InvalidParameterError, match="The 'a' parameter of _func must be"
|
||||
):
|
||||
_func("wrong", c=1)
|
||||
|
||||
with pytest.raises(
|
||||
InvalidParameterError, match="The 'b' parameter of _func must be"
|
||||
):
|
||||
_func(*[1, "wrong"], c=1)
|
||||
|
||||
with pytest.raises(
|
||||
InvalidParameterError, match="The 'c' parameter of _func must be"
|
||||
):
|
||||
_func(1, **{"c": "wrong"})
|
||||
|
||||
with pytest.raises(
|
||||
InvalidParameterError, match="The 'd' parameter of _func must be"
|
||||
):
|
||||
_func(1, c=1, d="wrong")
|
||||
|
||||
# check in the presence of extra positional and keyword args
|
||||
with pytest.raises(
|
||||
InvalidParameterError, match="The 'b' parameter of _func must be"
|
||||
):
|
||||
_func(0, *["wrong", 2, 3], c=4, **{"e": 5})
|
||||
|
||||
with pytest.raises(
|
||||
InvalidParameterError, match="The 'c' parameter of _func must be"
|
||||
):
|
||||
_func(0, *[1, 2, 3], c="four", **{"e": 5})
|
||||
|
||||
|
||||
def test_validate_params_missing_params():
|
||||
"""Check that no error is raised when there are parameters without
|
||||
constraints
|
||||
"""
|
||||
|
||||
@validate_params({"a": [int]}, prefer_skip_nested_validation=True)
|
||||
def func(a, b):
|
||||
pass
|
||||
|
||||
func(1, 2)
|
||||
|
||||
|
||||
def test_decorate_validated_function():
|
||||
"""Check that validate_params functions can be decorated"""
|
||||
decorated_function = deprecated()(_func)
|
||||
|
||||
with pytest.warns(FutureWarning, match="Function _func is deprecated"):
|
||||
decorated_function(1, 2, c=3)
|
||||
|
||||
# outer decorator does not interfere with validation
|
||||
with pytest.warns(FutureWarning, match="Function _func is deprecated"):
|
||||
with pytest.raises(
|
||||
InvalidParameterError, match=r"The 'c' parameter of _func must be"
|
||||
):
|
||||
decorated_function(1, 2, c="wrong")
|
||||
|
||||
|
||||
def test_validate_params_method():
|
||||
"""Check that validate_params works with methods"""
|
||||
with pytest.raises(
|
||||
InvalidParameterError, match="The 'a' parameter of _Class._method must be"
|
||||
):
|
||||
_Class()._method("wrong")
|
||||
|
||||
# validated method can be decorated
|
||||
with pytest.warns(FutureWarning, match="Function _deprecated_method is deprecated"):
|
||||
with pytest.raises(
|
||||
InvalidParameterError,
|
||||
match="The 'a' parameter of _Class._deprecated_method must be",
|
||||
):
|
||||
_Class()._deprecated_method("wrong")
|
||||
|
||||
|
||||
def test_validate_params_estimator():
|
||||
"""Check that validate_params works with Estimator instances"""
|
||||
# no validation in init
|
||||
est = _Estimator("wrong")
|
||||
|
||||
with pytest.raises(
|
||||
InvalidParameterError, match="The 'a' parameter of _Estimator must be"
|
||||
):
|
||||
est.fit()
|
||||
|
||||
|
||||
def test_stroptions_deprecated_subset():
|
||||
"""Check that the deprecated parameter must be a subset of options."""
|
||||
with pytest.raises(ValueError, match="deprecated options must be a subset"):
|
||||
StrOptions({"a", "b", "c"}, deprecated={"a", "d"})
|
||||
|
||||
|
||||
def test_hidden_constraint():
|
||||
"""Check that internal constraints are not exposed in the error message."""
|
||||
|
||||
@validate_params(
|
||||
{"param": [Hidden(list), dict]}, prefer_skip_nested_validation=True
|
||||
)
|
||||
def f(param):
|
||||
pass
|
||||
|
||||
# list and dict are valid params
|
||||
f({"a": 1, "b": 2, "c": 3})
|
||||
f([1, 2, 3])
|
||||
|
||||
with pytest.raises(
|
||||
InvalidParameterError, match="The 'param' parameter"
|
||||
) as exc_info:
|
||||
f(param="bad")
|
||||
|
||||
# the list option is not exposed in the error message
|
||||
err_msg = str(exc_info.value)
|
||||
assert "an instance of 'dict'" in err_msg
|
||||
assert "an instance of 'list'" not in err_msg
|
||||
|
||||
|
||||
def test_hidden_stroptions():
|
||||
"""Check that we can have 2 StrOptions constraints, one being hidden."""
|
||||
|
||||
@validate_params(
|
||||
{"param": [StrOptions({"auto"}), Hidden(StrOptions({"warn"}))]},
|
||||
prefer_skip_nested_validation=True,
|
||||
)
|
||||
def f(param):
|
||||
pass
|
||||
|
||||
# "auto" and "warn" are valid params
|
||||
f("auto")
|
||||
f("warn")
|
||||
|
||||
with pytest.raises(
|
||||
InvalidParameterError, match="The 'param' parameter"
|
||||
) as exc_info:
|
||||
f(param="bad")
|
||||
|
||||
# the "warn" option is not exposed in the error message
|
||||
err_msg = str(exc_info.value)
|
||||
assert "auto" in err_msg
|
||||
assert "warn" not in err_msg
|
||||
|
||||
|
||||
def test_validate_params_set_param_constraints_attribute():
|
||||
"""Check that the validate_params decorator properly sets the parameter constraints
|
||||
as attribute of the decorated function/method.
|
||||
"""
|
||||
assert hasattr(_func, "_skl_parameter_constraints")
|
||||
assert hasattr(_Class()._method, "_skl_parameter_constraints")
|
||||
|
||||
|
||||
def test_boolean_constraint_deprecated_int():
|
||||
"""Check that validate_params raise a deprecation message but still passes
|
||||
validation when using an int for a parameter accepting a boolean.
|
||||
"""
|
||||
|
||||
@validate_params({"param": ["boolean"]}, prefer_skip_nested_validation=True)
|
||||
def f(param):
|
||||
pass
|
||||
|
||||
# True/False and np.bool_(True/False) are valid params
|
||||
f(True)
|
||||
f(np.bool_(False))
|
||||
|
||||
|
||||
def test_no_validation():
|
||||
"""Check that validation can be skipped for a parameter."""
|
||||
|
||||
@validate_params(
|
||||
{"param1": [int, None], "param2": "no_validation"},
|
||||
prefer_skip_nested_validation=True,
|
||||
)
|
||||
def f(param1=None, param2=None):
|
||||
pass
|
||||
|
||||
# param1 is validated
|
||||
with pytest.raises(InvalidParameterError, match="The 'param1' parameter"):
|
||||
f(param1="wrong")
|
||||
|
||||
# param2 is not validated: any type is valid.
|
||||
class SomeType:
|
||||
pass
|
||||
|
||||
f(param2=SomeType)
|
||||
f(param2=SomeType())
|
||||
|
||||
|
||||
def test_pandas_na_constraint_with_pd_na():
|
||||
"""Add a specific test for checking support for `pandas.NA`."""
|
||||
pd = pytest.importorskip("pandas")
|
||||
|
||||
na_constraint = _PandasNAConstraint()
|
||||
assert na_constraint.is_satisfied_by(pd.NA)
|
||||
assert not na_constraint.is_satisfied_by(np.array([1, 2, 3]))
|
||||
|
||||
|
||||
def test_iterable_not_string():
|
||||
"""Check that a string does not satisfy the _IterableNotString constraint."""
|
||||
constraint = _IterablesNotString()
|
||||
assert constraint.is_satisfied_by([1, 2, 3])
|
||||
assert constraint.is_satisfied_by(range(10))
|
||||
assert not constraint.is_satisfied_by("some string")
|
||||
|
||||
|
||||
def test_cv_objects():
|
||||
"""Check that the _CVObjects constraint accepts all current ways
|
||||
to pass cv objects."""
|
||||
constraint = _CVObjects()
|
||||
assert constraint.is_satisfied_by(5)
|
||||
assert constraint.is_satisfied_by(LeaveOneOut())
|
||||
assert constraint.is_satisfied_by([([1, 2], [3, 4]), ([3, 4], [1, 2])])
|
||||
assert constraint.is_satisfied_by(None)
|
||||
assert not constraint.is_satisfied_by("not a CV object")
|
||||
|
||||
|
||||
def test_third_party_estimator():
|
||||
"""Check that the validation from a scikit-learn estimator inherited by a third
|
||||
party estimator does not impose a match between the dict of constraints and the
|
||||
parameters of the estimator.
|
||||
"""
|
||||
|
||||
class ThirdPartyEstimator(_Estimator):
|
||||
def __init__(self, b):
|
||||
self.b = b
|
||||
super().__init__(a=0)
|
||||
|
||||
def fit(self, X=None, y=None):
|
||||
super().fit(X, y)
|
||||
|
||||
# does not raise, even though "b" is not in the constraints dict and "a" is not
|
||||
# a parameter of the estimator.
|
||||
ThirdPartyEstimator(b=0).fit()
|
||||
|
||||
|
||||
def test_interval_real_not_int():
|
||||
"""Check for the type RealNotInt in the Interval constraint."""
|
||||
constraint = Interval(RealNotInt, 0, 1, closed="both")
|
||||
assert constraint.is_satisfied_by(1.0)
|
||||
assert not constraint.is_satisfied_by(1)
|
||||
|
||||
|
||||
def test_real_not_int():
|
||||
"""Check for the RealNotInt type."""
|
||||
assert isinstance(1.0, RealNotInt)
|
||||
assert not isinstance(1, RealNotInt)
|
||||
assert isinstance(np.float64(1), RealNotInt)
|
||||
assert not isinstance(np.int64(1), RealNotInt)
|
||||
|
||||
|
||||
def test_skip_param_validation():
|
||||
"""Check that param validation can be skipped using config_context."""
|
||||
|
||||
@validate_params({"a": [int]}, prefer_skip_nested_validation=True)
|
||||
def f(a):
|
||||
pass
|
||||
|
||||
with pytest.raises(InvalidParameterError, match="The 'a' parameter"):
|
||||
f(a="1")
|
||||
|
||||
# does not raise
|
||||
with config_context(skip_parameter_validation=True):
|
||||
f(a="1")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("prefer_skip_nested_validation", [True, False])
|
||||
def test_skip_nested_validation(prefer_skip_nested_validation):
|
||||
"""Check that nested validation can be skipped."""
|
||||
|
||||
@validate_params({"a": [int]}, prefer_skip_nested_validation=True)
|
||||
def f(a):
|
||||
pass
|
||||
|
||||
@validate_params(
|
||||
{"b": [int]},
|
||||
prefer_skip_nested_validation=prefer_skip_nested_validation,
|
||||
)
|
||||
def g(b):
|
||||
# calls f with a bad parameter type
|
||||
return f(a="invalid_param_value")
|
||||
|
||||
# Validation for g is never skipped.
|
||||
with pytest.raises(InvalidParameterError, match="The 'b' parameter"):
|
||||
g(b="invalid_param_value")
|
||||
|
||||
if prefer_skip_nested_validation:
|
||||
g(b=1) # does not raise because inner f is not validated
|
||||
else:
|
||||
with pytest.raises(InvalidParameterError, match="The 'a' parameter"):
|
||||
g(b=1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"skip_parameter_validation, prefer_skip_nested_validation, expected_skipped",
|
||||
[
|
||||
(True, True, True),
|
||||
(True, False, True),
|
||||
(False, True, True),
|
||||
(False, False, False),
|
||||
],
|
||||
)
|
||||
def test_skip_nested_validation_and_config_context(
|
||||
skip_parameter_validation, prefer_skip_nested_validation, expected_skipped
|
||||
):
|
||||
"""Check interaction between global skip and local skip."""
|
||||
|
||||
@validate_params(
|
||||
{"a": [int]}, prefer_skip_nested_validation=prefer_skip_nested_validation
|
||||
)
|
||||
def g(a):
|
||||
return get_config()["skip_parameter_validation"]
|
||||
|
||||
with config_context(skip_parameter_validation=skip_parameter_validation):
|
||||
actual_skipped = g(1)
|
||||
|
||||
assert actual_skipped == expected_skipped
|
||||
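For readers of this diff, a minimal sketch of how the constraint machinery exercised above is typically wired onto a function. The import path is the private module these tests target, and `resample_signal` is a hypothetical function used only for illustration:

# Illustrative only, not part of this commit.
from numbers import Integral, Real

from sklearn.utils._param_validation import Interval, StrOptions, validate_params


@validate_params(
    {
        "n_samples": [Interval(Integral, 1, None, closed="left")],
        "alpha": [Interval(Real, 0, 1, closed="both")],
        "mode": [StrOptions({"fast", "exact"})],
    },
    prefer_skip_nested_validation=True,
)
def resample_signal(n_samples, alpha=0.5, mode="fast"):
    ...  # calling e.g. resample_signal(10, mode="wrong") raises InvalidParameterError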
@@ -0,0 +1,63 @@
import numpy as np
import pytest

from sklearn.utils._plotting import _interval_max_min_ratio, _validate_score_name


def metric():
    pass  # pragma: no cover


def neg_metric():
    pass  # pragma: no cover


@pytest.mark.parametrize(
    "score_name, scoring, negate_score, expected_score_name",
    [
        ("accuracy", None, False, "accuracy"),  # do not transform the name
        (None, "accuracy", False, "Accuracy"),  # capitalize the name
        (None, "accuracy", True, "Negative accuracy"),  # add "Negative"
        (None, "neg_mean_absolute_error", False, "Negative mean absolute error"),
        (None, "neg_mean_absolute_error", True, "Mean absolute error"),  # remove "neg_"
        ("MAE", "neg_mean_absolute_error", True, "MAE"),  # keep score_name
        (None, None, False, "Score"),  # default name
        (None, None, True, "Negative score"),  # default name but negated
        ("Some metric", metric, False, "Some metric"),  # do not transform the name
        ("Some metric", metric, True, "Some metric"),  # do not transform the name
        (None, metric, False, "Metric"),  # default name
        (None, metric, True, "Negative metric"),  # default name but negated
        ("Some metric", neg_metric, False, "Some metric"),  # do not transform the name
        ("Some metric", neg_metric, True, "Some metric"),  # do not transform the name
        (None, neg_metric, False, "Negative metric"),  # default name
        (None, neg_metric, True, "Metric"),  # default name but negated
    ],
)
def test_validate_score_name(score_name, scoring, negate_score, expected_score_name):
    """Check that we return the right score name."""
    assert (
        _validate_score_name(score_name, scoring, negate_score) == expected_score_name
    )


# In the following test, we check the value of the max to min ratio
# for parameter value intervals to check that using a decision threshold
# of 5. is a good heuristic to decide between linear and log scales on
# common ranges of parameter values.
@pytest.mark.parametrize(
    "data, lower_bound, upper_bound",
    [
        # Such a range could be clearly displayed with either log scale or linear
        # scale.
        (np.geomspace(0.1, 1, 5), 5, 6),
        # Check that the ratio is still positive on a negative log scale.
        (-np.geomspace(0.1, 1, 10), 7, 8),
        # Evenly spaced parameter values lead to a ratio of 1.
        (np.linspace(0, 1, 5), 0.9, 1.1),
        # This is not exactly spaced on a log scale but we will benefit from treating
        # it as such for visualization.
        ([1, 2, 5, 10, 20, 50], 20, 40),
    ],
)
def test_interval_max_min_ratio(data, lower_bound, upper_bound):
    assert lower_bound < _interval_max_min_ratio(data) < upper_bound
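A hedged sketch of the heuristic the comment above describes; the threshold of 5 comes from that comment, while the surrounding code is illustrative only:

# Illustrative only, not part of this commit.
import numpy as np

from sklearn.utils._plotting import _interval_max_min_ratio

param_range = np.geomspace(1e-3, 1e3, num=7)
# A large max-to-min ratio of consecutive interval widths suggests a log axis.
scale = "log" if _interval_max_min_ratio(param_range) > 5 else "linear"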
@@ -0,0 +1,680 @@
import re
from pprint import PrettyPrinter

import numpy as np

from sklearn.utils._pprint import _EstimatorPrettyPrinter
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import config_context


# Ignore flake8 (lots of line too long issues)
# ruff: noqa


# Constructors excerpted to test pprinting
class LogisticRegression(BaseEstimator):
    def __init__(
        self,
        penalty="l2",
        dual=False,
        tol=1e-4,
        C=1.0,
        fit_intercept=True,
        intercept_scaling=1,
        class_weight=None,
        random_state=None,
        solver="warn",
        max_iter=100,
        multi_class="warn",
        verbose=0,
        warm_start=False,
        n_jobs=None,
        l1_ratio=None,
    ):
        self.penalty = penalty
        self.dual = dual
        self.tol = tol
        self.C = C
        self.fit_intercept = fit_intercept
        self.intercept_scaling = intercept_scaling
        self.class_weight = class_weight
        self.random_state = random_state
        self.solver = solver
        self.max_iter = max_iter
        self.multi_class = multi_class
        self.verbose = verbose
        self.warm_start = warm_start
        self.n_jobs = n_jobs
        self.l1_ratio = l1_ratio

    def fit(self, X, y):
        return self


class StandardScaler(TransformerMixin, BaseEstimator):
    def __init__(self, copy=True, with_mean=True, with_std=True):
        self.with_mean = with_mean
        self.with_std = with_std
        self.copy = copy

    def transform(self, X, copy=None):
        return self


class RFE(BaseEstimator):
    def __init__(self, estimator, n_features_to_select=None, step=1, verbose=0):
        self.estimator = estimator
        self.n_features_to_select = n_features_to_select
        self.step = step
        self.verbose = verbose


class GridSearchCV(BaseEstimator):
    def __init__(
        self,
        estimator,
        param_grid,
        scoring=None,
        n_jobs=None,
        iid="warn",
        refit=True,
        cv="warn",
        verbose=0,
        pre_dispatch="2*n_jobs",
        error_score="raise-deprecating",
        return_train_score=False,
    ):
        self.estimator = estimator
        self.param_grid = param_grid
        self.scoring = scoring
        self.n_jobs = n_jobs
        self.iid = iid
        self.refit = refit
        self.cv = cv
        self.verbose = verbose
        self.pre_dispatch = pre_dispatch
        self.error_score = error_score
        self.return_train_score = return_train_score


class CountVectorizer(BaseEstimator):
    def __init__(
        self,
        input="content",
        encoding="utf-8",
        decode_error="strict",
        strip_accents=None,
        lowercase=True,
        preprocessor=None,
        tokenizer=None,
        stop_words=None,
        token_pattern=r"(?u)\b\w\w+\b",
        ngram_range=(1, 1),
        analyzer="word",
        max_df=1.0,
        min_df=1,
        max_features=None,
        vocabulary=None,
        binary=False,
        dtype=np.int64,
    ):
        self.input = input
        self.encoding = encoding
        self.decode_error = decode_error
        self.strip_accents = strip_accents
        self.preprocessor = preprocessor
        self.tokenizer = tokenizer
        self.analyzer = analyzer
        self.lowercase = lowercase
        self.token_pattern = token_pattern
        self.stop_words = stop_words
        self.max_df = max_df
        self.min_df = min_df
        self.max_features = max_features
        self.ngram_range = ngram_range
        self.vocabulary = vocabulary
        self.binary = binary
        self.dtype = dtype


class Pipeline(BaseEstimator):
    def __init__(self, steps, memory=None):
        self.steps = steps
        self.memory = memory


class SVC(BaseEstimator):
    def __init__(
        self,
        C=1.0,
        kernel="rbf",
        degree=3,
        gamma="auto_deprecated",
        coef0=0.0,
        shrinking=True,
        probability=False,
        tol=1e-3,
        cache_size=200,
        class_weight=None,
        verbose=False,
        max_iter=-1,
        decision_function_shape="ovr",
        random_state=None,
    ):
        self.kernel = kernel
        self.degree = degree
        self.gamma = gamma
        self.coef0 = coef0
        self.tol = tol
        self.C = C
        self.shrinking = shrinking
        self.probability = probability
        self.cache_size = cache_size
        self.class_weight = class_weight
        self.verbose = verbose
        self.max_iter = max_iter
        self.decision_function_shape = decision_function_shape
        self.random_state = random_state


class PCA(BaseEstimator):
    def __init__(
        self,
        n_components=None,
        copy=True,
        whiten=False,
        svd_solver="auto",
        tol=0.0,
        iterated_power="auto",
        random_state=None,
    ):
        self.n_components = n_components
        self.copy = copy
        self.whiten = whiten
        self.svd_solver = svd_solver
        self.tol = tol
        self.iterated_power = iterated_power
        self.random_state = random_state


class NMF(BaseEstimator):
    def __init__(
        self,
        n_components=None,
        init=None,
        solver="cd",
        beta_loss="frobenius",
        tol=1e-4,
        max_iter=200,
        random_state=None,
        alpha=0.0,
        l1_ratio=0.0,
        verbose=0,
        shuffle=False,
    ):
        self.n_components = n_components
        self.init = init
        self.solver = solver
        self.beta_loss = beta_loss
        self.tol = tol
        self.max_iter = max_iter
        self.random_state = random_state
        self.alpha = alpha
        self.l1_ratio = l1_ratio
        self.verbose = verbose
        self.shuffle = shuffle


class SimpleImputer(BaseEstimator):
    def __init__(
        self,
        missing_values=np.nan,
        strategy="mean",
        fill_value=None,
        verbose=0,
        copy=True,
    ):
        self.missing_values = missing_values
        self.strategy = strategy
        self.fill_value = fill_value
        self.verbose = verbose
        self.copy = copy


def test_basic(print_changed_only_false):
    # Basic pprint test
    lr = LogisticRegression()
    expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)"""

    expected = expected[1:]  # remove first \n
    assert lr.__repr__() == expected


def test_changed_only():
    # Make sure the changed_only param is correctly used when True (default)
    lr = LogisticRegression(C=99)
    expected = """LogisticRegression(C=99)"""
    assert lr.__repr__() == expected

    # Check with a repr that doesn't fit on a single line
    lr = LogisticRegression(
        C=99, class_weight=0.4, fit_intercept=False, tol=1234, verbose=True
    )
    expected = """
LogisticRegression(C=99, class_weight=0.4, fit_intercept=False, tol=1234,
                   verbose=True)"""
    expected = expected[1:]  # remove first \n
    assert lr.__repr__() == expected

    imputer = SimpleImputer(missing_values=0)
    expected = """SimpleImputer(missing_values=0)"""
    assert imputer.__repr__() == expected

    # Defaults to np.nan, trying with float('NaN')
    imputer = SimpleImputer(missing_values=float("NaN"))
    expected = """SimpleImputer()"""
    assert imputer.__repr__() == expected

    # make sure array parameters don't throw error (see #13583)
    repr(LogisticRegressionCV(Cs=np.array([0.1, 1])))


def test_pipeline(print_changed_only_false):
    # Render a pipeline object
    pipeline = make_pipeline(StandardScaler(), LogisticRegression(C=999))
    expected = """
Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregression',
                 LogisticRegression(C=999, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)"""

    expected = expected[1:]  # remove first \n
    assert pipeline.__repr__() == expected


def test_deeply_nested(print_changed_only_false):
    # Render a deeply nested estimator
    rfe = RFE(RFE(RFE(RFE(RFE(RFE(RFE(LogisticRegression())))))))
    expected = """
RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=LogisticRegression(C=1.0,
                                                                                                                      class_weight=None,
                                                                                                                      dual=False,
                                                                                                                      fit_intercept=True,
                                                                                                                      intercept_scaling=1,
                                                                                                                      l1_ratio=None,
                                                                                                                      max_iter=100,
                                                                                                                      multi_class='warn',
                                                                                                                      n_jobs=None,
                                                                                                                      penalty='l2',
                                                                                                                      random_state=None,
                                                                                                                      solver='warn',
                                                                                                                      tol=0.0001,
                                                                                                                      verbose=0,
                                                                                                                      warm_start=False),
                                                                                        n_features_to_select=None,
                                                                                        step=1,
                                                                                        verbose=0),
                                                                          n_features_to_select=None,
                                                                          step=1,
                                                                          verbose=0),
                                                            n_features_to_select=None,
                                                            step=1, verbose=0),
                                              n_features_to_select=None, step=1,
                                              verbose=0),
                                n_features_to_select=None, step=1, verbose=0),
                  n_features_to_select=None, step=1, verbose=0),
    n_features_to_select=None, step=1, verbose=0)"""

    expected = expected[1:]  # remove first \n
    assert rfe.__repr__() == expected


def test_gridsearch(print_changed_only_false):
    # render a gridsearch
    param_grid = [
        {"kernel": ["rbf"], "gamma": [1e-3, 1e-4], "C": [1, 10, 100, 1000]},
        {"kernel": ["linear"], "C": [1, 10, 100, 1000]},
    ]
    gs = GridSearchCV(SVC(), param_grid, cv=5)

    expected = """
GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']},
                         {'C': [1, 10, 100, 1000], 'kernel': ['linear']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)"""

    expected = expected[1:]  # remove first \n
    assert gs.__repr__() == expected


def test_gridsearch_pipeline(print_changed_only_false):
    # render a pipeline inside a gridsearch
    pp = _EstimatorPrettyPrinter(compact=True, indent=1, indent_at_name=True)

    pipeline = Pipeline([("reduce_dim", PCA()), ("classify", SVC())])
    N_FEATURES_OPTIONS = [2, 4, 8]
    C_OPTIONS = [1, 10, 100, 1000]
    param_grid = [
        {
            "reduce_dim": [PCA(iterated_power=7), NMF()],
            "reduce_dim__n_components": N_FEATURES_OPTIONS,
            "classify__C": C_OPTIONS,
        },
        {
            "reduce_dim": [SelectKBest(chi2)],
            "reduce_dim__k": N_FEATURES_OPTIONS,
            "classify__C": C_OPTIONS,
        },
    ]
    gs_pipeline = GridSearchCV(pipeline, cv=3, n_jobs=1, param_grid=param_grid)
    expected = """
GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('reduce_dim',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('classify',
                                        SVC(C=1.0, cache_size=200,
                                            class_weight=None, coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='auto_deprecated',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_state=None, shrinking=True,
                                            tol=0.001, verbose=False))]),
             iid='warn', n_jobs=1,
             param_grid=[{'classify__C': [1, 10, 100, 1000],
                          'reduce_dim': [PCA(copy=True, iterated_power=7,
                                             n_components=None,
                                             random_state=None,
                                             svd_solver='auto', tol=0.0,
                                             whiten=False),
                                         NMF(alpha=0.0, beta_loss='frobenius',
                                             init=None, l1_ratio=0.0,
                                             max_iter=200, n_components=None,
                                             random_state=None, shuffle=False,
                                             solver='cd', tol=0.0001,
                                             verbose=0)],
                          'reduce_dim__n_components': [2, 4, 8]},
                         {'classify__C': [1, 10, 100, 1000],
                          'reduce_dim': [SelectKBest(k=10,
                                                     score_func=<function chi2 at some_address>)],
                          'reduce_dim__k': [2, 4, 8]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)"""

    expected = expected[1:]  # remove first \n
    repr_ = pp.pformat(gs_pipeline)
    # Remove address of '<function chi2 at 0x.....>' for reproducibility
    repr_ = re.sub("function chi2 at 0x.*>", "function chi2 at some_address>", repr_)
    assert repr_ == expected


def test_n_max_elements_to_show(print_changed_only_false):
    n_max_elements_to_show = 30
    pp = _EstimatorPrettyPrinter(
        compact=True,
        indent=1,
        indent_at_name=True,
        n_max_elements_to_show=n_max_elements_to_show,
    )

    # No ellipsis
    vocabulary = {i: i for i in range(n_max_elements_to_show)}
    vectorizer = CountVectorizer(vocabulary=vocabulary)

    expected = r"""
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None,
                vocabulary={0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7,
                            8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14,
                            15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20,
                            21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26,
                            27: 27, 28: 28, 29: 29})"""

    expected = expected[1:]  # remove first \n
    assert pp.pformat(vectorizer) == expected

    # Now with ellipsis
    vocabulary = {i: i for i in range(n_max_elements_to_show + 1)}
    vectorizer = CountVectorizer(vocabulary=vocabulary)

    expected = r"""
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None,
                vocabulary={0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7,
                            8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14,
                            15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20,
                            21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26,
                            27: 27, 28: 28, 29: 29, ...})"""

    expected = expected[1:]  # remove first \n
    assert pp.pformat(vectorizer) == expected

    # Also test with lists
    param_grid = {"C": list(range(n_max_elements_to_show))}
    gs = GridSearchCV(SVC(), param_grid)
    expected = """
GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                               15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
                               27, 28, 29]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)"""

    expected = expected[1:]  # remove first \n
    assert pp.pformat(gs) == expected

    # Now with ellipsis
    param_grid = {"C": list(range(n_max_elements_to_show + 1))}
    gs = GridSearchCV(SVC(), param_grid)
    expected = """
GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                               15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
                               27, 28, 29, ...]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)"""

    expected = expected[1:]  # remove first \n
    assert pp.pformat(gs) == expected


def test_bruteforce_ellipsis(print_changed_only_false):
    # Check that the bruteforce ellipsis (used when the number of non-blank
    # characters exceeds N_CHAR_MAX) renders correctly.

    lr = LogisticRegression()

    # test when the left and right side of the ellipsis aren't on the same
    # line.
    expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   in...
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)"""

    expected = expected[1:]  # remove first \n
    assert expected == lr.__repr__(N_CHAR_MAX=150)

    # test with very small N_CHAR_MAX
    # Note that N_CHAR_MAX is not strictly enforced, but it's normal: to avoid
    # weird reprs we still keep the whole line of the right part (after the
    # ellipsis).
    expected = """
Lo...
                   warm_start=False)"""

    expected = expected[1:]  # remove first \n
    assert expected == lr.__repr__(N_CHAR_MAX=4)

    # test with N_CHAR_MAX == number of non-blank characters: In this case we
    # don't want ellipsis
    full_repr = lr.__repr__(N_CHAR_MAX=float("inf"))
    n_nonblank = len("".join(full_repr.split()))
    assert lr.__repr__(N_CHAR_MAX=n_nonblank) == full_repr
    assert "..." not in full_repr

    # test with N_CHAR_MAX == number of non-blank characters - 10: the left and
    # right side of the ellipsis are on different lines. In this case we
    # want to expand the whole line of the right side
    expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_i...
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)"""
    expected = expected[1:]  # remove first \n
    assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 10)

    # test with N_CHAR_MAX == number of non-blank characters - 4: the left and
    # right side of the ellipsis are on the same line. In this case we don't
    # want to expand the whole line of the right side, just add the ellipsis
    # between the 2 sides.
    expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter...,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)"""
    expected = expected[1:]  # remove first \n
    assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 4)

    # test with N_CHAR_MAX == number of non-blank characters - 2: the left and
    # right side of the ellipsis are on the same line, but adding the ellipsis
    # would actually make the repr longer. So we don't add the ellipsis.
    expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)"""
    expected = expected[1:]  # remove first \n
    assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 2)


def test_builtin_prettyprinter():
    # Non-regression test that ensures we can still use the builtin
    # PrettyPrinter class for estimators (as done e.g. by joblib).
    # This used to be a bug

    PrettyPrinter().pprint(LogisticRegression())


def test_kwargs_in_init():
    # Make sure the changed_only=True mode is OK when an argument is passed as
    # kwargs.
    # Non-regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/17206

    class WithKWargs(BaseEstimator):
        # Estimator with a kwargs argument. These need to hack around
        # set_params and get_params. Here we mimic what LightGBM does.
        def __init__(self, a="willchange", b="unchanged", **kwargs):
            self.a = a
            self.b = b
            self._other_params = {}
            self.set_params(**kwargs)

        def get_params(self, deep=True):
            params = super().get_params(deep=deep)
            params.update(self._other_params)
            return params

        def set_params(self, **params):
            for key, value in params.items():
                setattr(self, key, value)
                self._other_params[key] = value
            return self

    est = WithKWargs(a="something", c="abcd", d=None)

    expected = "WithKWargs(a='something', c='abcd', d=None)"
    assert expected == est.__repr__()

    with config_context(print_changed_only=False):
        expected = "WithKWargs(a='something', b='unchanged', c='abcd', d=None)"
        assert expected == est.__repr__()


def test_complexity_print_changed_only():
    # Make sure `__repr__` is called the same number of times
    # whether `print_changed_only` is True or False
    # Non-regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/18490

    class DummyEstimator(TransformerMixin, BaseEstimator):
        nb_times_repr_called = 0

        def __init__(self, estimator=None):
            self.estimator = estimator

        def __repr__(self):
            DummyEstimator.nb_times_repr_called += 1
            return super().__repr__()

        def transform(self, X, copy=None):  # pragma: no cover
            return X

    estimator = DummyEstimator(
        make_pipeline(DummyEstimator(DummyEstimator()), DummyEstimator(), "passthrough")
    )
    with config_context(print_changed_only=False):
        repr(estimator)
        nb_repr_print_changed_only_false = DummyEstimator.nb_times_repr_called

    DummyEstimator.nb_times_repr_called = 0
    with config_context(print_changed_only=True):
        repr(estimator)
        nb_repr_print_changed_only_true = DummyEstimator.nb_times_repr_called

    assert nb_repr_print_changed_only_false == nb_repr_print_changed_only_true
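A small usage sketch of the behaviour the repr tests above exercise; the repr mode is toggled globally through scikit-learn's config (illustrative only, not part of this commit):

# Illustrative only, not part of this commit.
from sklearn import config_context
from sklearn.linear_model import LogisticRegression

est = LogisticRegression(C=99)
print(repr(est))  # "LogisticRegression(C=99)": only changed parameters are shown
with config_context(print_changed_only=False):
    print(repr(est))  # full repr listing every constructor parameter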
@@ -0,0 +1,192 @@
import numpy as np
import pytest
import scipy.sparse as sp
from numpy.testing import assert_array_almost_equal
from scipy.special import comb

from sklearn.utils._random import _our_rand_r_py
from sklearn.utils.random import _random_choice_csc, sample_without_replacement


###############################################################################
# test custom sampling without replacement algorithm
###############################################################################
def test_invalid_sample_without_replacement_algorithm():
    with pytest.raises(ValueError):
        sample_without_replacement(5, 4, "unknown")


def test_sample_without_replacement_algorithms():
    methods = ("auto", "tracking_selection", "reservoir_sampling", "pool")

    for m in methods:

        def sample_without_replacement_method(
            n_population, n_samples, random_state=None
        ):
            return sample_without_replacement(
                n_population, n_samples, method=m, random_state=random_state
            )

        check_edge_case_of_sample_int(sample_without_replacement_method)
        check_sample_int(sample_without_replacement_method)
        check_sample_int_distribution(sample_without_replacement_method)


def check_edge_case_of_sample_int(sample_without_replacement):
    # n_population < n_samples
    with pytest.raises(ValueError):
        sample_without_replacement(0, 1)
    with pytest.raises(ValueError):
        sample_without_replacement(1, 2)

    # n_population == n_samples
    assert sample_without_replacement(0, 0).shape == (0,)

    assert sample_without_replacement(1, 1).shape == (1,)

    # n_population >= n_samples
    assert sample_without_replacement(5, 0).shape == (0,)
    assert sample_without_replacement(5, 1).shape == (1,)

    # n_population < 0 or n_samples < 0
    with pytest.raises(ValueError):
        sample_without_replacement(-1, 5)
    with pytest.raises(ValueError):
        sample_without_replacement(5, -1)


def check_sample_int(sample_without_replacement):
    # This test is heavily inspired by test_random.py of python-core.
    #
    # For the entire allowable range of 0 <= k <= N, validate that
    # the sample is of the correct length and contains only unique items
    n_population = 100

    for n_samples in range(n_population + 1):
        s = sample_without_replacement(n_population, n_samples)
        assert len(s) == n_samples
        unique = np.unique(s)
        assert np.size(unique) == n_samples
        assert np.all(unique < n_population)

    # test edge case n_population == n_samples == 0
    assert np.size(sample_without_replacement(0, 0)) == 0


def check_sample_int_distribution(sample_without_replacement):
    # This test is heavily inspired by test_random.py of python-core.
    #
    # For the entire allowable range of 0 <= k <= N, validate that
    # sample generates all possible permutations
    n_population = 10

    # a large number of trials prevents false negatives without slowing normal
    # case
    n_trials = 10000

    for n_samples in range(n_population):
        # Counting the number of combinations is not as good as counting the
        # number of permutations. However, it works with sampling algorithms
        # that do not provide a random permutation of the subset of integers.
        n_expected = comb(n_population, n_samples, exact=True)

        output = {}
        for i in range(n_trials):
            output[frozenset(sample_without_replacement(n_population, n_samples))] = (
                None
            )

            if len(output) == n_expected:
                break
        else:
            raise AssertionError(
                "number of combinations != number of expected (%s != %s)"
                % (len(output), n_expected)
            )


def test_random_choice_csc(n_samples=10000, random_state=24):
    # Explicit class probabilities
    classes = [np.array([0, 1]), np.array([0, 1, 2])]
    class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]

    got = _random_choice_csc(n_samples, classes, class_probabilities, random_state)
    assert sp.issparse(got)

    for k in range(len(classes)):
        p = np.bincount(got.getcol(k).toarray().ravel()) / float(n_samples)
        assert_array_almost_equal(class_probabilities[k], p, decimal=1)

    # Implicit class probabilities
    classes = [[0, 1], [1, 2]]  # test for array-like support
    class_probabilities = [np.array([0.5, 0.5]), np.array([0, 1 / 2, 1 / 2])]

    got = _random_choice_csc(
        n_samples=n_samples, classes=classes, random_state=random_state
    )
    assert sp.issparse(got)

    for k in range(len(classes)):
        p = np.bincount(got.getcol(k).toarray().ravel()) / float(n_samples)
        assert_array_almost_equal(class_probabilities[k], p, decimal=1)

    # Edge case probabilities 1.0 and 0.0
    classes = [np.array([0, 1]), np.array([0, 1, 2])]
    class_probabilities = [np.array([0.0, 1.0]), np.array([0.0, 1.0, 0.0])]

    got = _random_choice_csc(n_samples, classes, class_probabilities, random_state)
    assert sp.issparse(got)

    for k in range(len(classes)):
        p = (
            np.bincount(
                got.getcol(k).toarray().ravel(), minlength=len(class_probabilities[k])
            )
            / n_samples
        )
        assert_array_almost_equal(class_probabilities[k], p, decimal=1)

    # One class target data
    classes = [[1], [0]]  # test for array-like support
    class_probabilities = [np.array([0.0, 1.0]), np.array([1.0])]

    got = _random_choice_csc(
        n_samples=n_samples, classes=classes, random_state=random_state
    )
    assert sp.issparse(got)

    for k in range(len(classes)):
        p = np.bincount(got.getcol(k).toarray().ravel()) / n_samples
        assert_array_almost_equal(class_probabilities[k], p, decimal=1)


def test_random_choice_csc_errors():
    # the length of an array in classes and class_probabilities is mismatched
    classes = [np.array([0, 1]), np.array([0, 1, 2, 3])]
    class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
    with pytest.raises(ValueError):
        _random_choice_csc(4, classes, class_probabilities, 1)

    # the class dtype is not supported (strings)
    classes = [np.array(["a", "1"]), np.array(["z", "1", "2"])]
    class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
    with pytest.raises(ValueError):
        _random_choice_csc(4, classes, class_probabilities, 1)

    # the class dtype is not supported (floats)
    classes = [np.array([4.2, 0.1]), np.array([0.1, 0.2, 9.4])]
    class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
    with pytest.raises(ValueError):
        _random_choice_csc(4, classes, class_probabilities, 1)

    # Given probabilities don't sum to 1
    classes = [np.array([0, 1]), np.array([0, 1, 2])]
    class_probabilities = [np.array([0.5, 0.6]), np.array([0.6, 0.1, 0.3])]
    with pytest.raises(ValueError):
        _random_choice_csc(4, classes, class_probabilities, 1)


def test_our_rand_r():
    assert 131541053 == _our_rand_r_py(1273642419)
    assert 270369 == _our_rand_r_py(0)
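A brief usage sketch of the public sampler exercised above; the method names are the ones the tests iterate over (illustrative only, not part of this commit):

# Illustrative only, not part of this commit.
from sklearn.utils.random import sample_without_replacement

# Draw 5 distinct integers from range(100), forcing the reservoir sampler.
idx = sample_without_replacement(
    n_population=100, n_samples=5, method="reservoir_sampling", random_state=0
)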
@@ -0,0 +1,371 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.datasets import (
|
||||
load_iris,
|
||||
make_classification,
|
||||
make_multilabel_classification,
|
||||
make_regression,
|
||||
)
|
||||
from sklearn.ensemble import IsolationForest
|
||||
from sklearn.linear_model import (
|
||||
LinearRegression,
|
||||
LogisticRegression,
|
||||
)
|
||||
from sklearn.multioutput import ClassifierChain
|
||||
from sklearn.preprocessing import scale
|
||||
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
|
||||
from sklearn.utils._mocking import _MockEstimatorOnOffPrediction
|
||||
from sklearn.utils._response import _get_response_values, _get_response_values_binary
|
||||
from sklearn.utils._testing import assert_allclose, assert_array_equal
|
||||
|
||||
X, y = load_iris(return_X_y=True)
|
||||
# scale the data to avoid ConvergenceWarning with LogisticRegression
|
||||
X = scale(X, copy=False)
|
||||
X_binary, y_binary = X[:100], y[:100]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"response_method", ["decision_function", "predict_proba", "predict_log_proba"]
|
||||
)
|
||||
def test_get_response_values_regressor_error(response_method):
|
||||
"""Check the error message with regressor an not supported response
|
||||
method."""
|
||||
my_estimator = _MockEstimatorOnOffPrediction(response_methods=[response_method])
|
||||
X = "mocking_data", "mocking_target"
|
||||
err_msg = f"{my_estimator.__class__.__name__} should either be a classifier"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_get_response_values(my_estimator, X, response_method=response_method)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("return_response_method_used", [True, False])
|
||||
def test_get_response_values_regressor(return_response_method_used):
|
||||
"""Check the behaviour of `_get_response_values` with regressor."""
|
||||
X, y = make_regression(n_samples=10, random_state=0)
|
||||
regressor = LinearRegression().fit(X, y)
|
||||
results = _get_response_values(
|
||||
regressor,
|
||||
X,
|
||||
response_method="predict",
|
||||
return_response_method_used=return_response_method_used,
|
||||
)
|
||||
assert_array_equal(results[0], regressor.predict(X))
|
||||
assert results[1] is None
|
||||
if return_response_method_used:
|
||||
assert results[2] == "predict"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"response_method",
|
||||
["predict", "decision_function", ["decision_function", "predict"]],
|
||||
)
|
||||
@pytest.mark.parametrize("return_response_method_used", [True, False])
|
||||
def test_get_response_values_outlier_detection(
|
||||
response_method, return_response_method_used
|
||||
):
|
||||
"""Check the behaviour of `_get_response_values` with outlier detector."""
|
||||
X, y = make_classification(n_samples=50, random_state=0)
|
||||
outlier_detector = IsolationForest(random_state=0).fit(X, y)
|
||||
results = _get_response_values(
|
||||
outlier_detector,
|
||||
X,
|
||||
response_method=response_method,
|
||||
return_response_method_used=return_response_method_used,
|
||||
)
|
||||
chosen_response_method = (
|
||||
response_method[0] if isinstance(response_method, list) else response_method
|
||||
)
|
||||
prediction_method = getattr(outlier_detector, chosen_response_method)
|
||||
assert_array_equal(results[0], prediction_method(X))
|
||||
assert results[1] is None
|
||||
if return_response_method_used:
|
||||
assert results[2] == chosen_response_method
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"response_method",
|
||||
["predict_proba", "decision_function", "predict", "predict_log_proba"],
|
||||
)
|
||||
def test_get_response_values_classifier_unknown_pos_label(response_method):
|
||||
"""Check that `_get_response_values` raises the proper error message with
|
||||
classifier."""
|
||||
X, y = make_classification(n_samples=10, n_classes=2, random_state=0)
|
||||
classifier = LogisticRegression().fit(X, y)
|
||||
|
||||
# provide a `pos_label` which is not in `y`
|
||||
err_msg = r"pos_label=whatever is not a valid label: It should be one of \[0 1\]"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_get_response_values(
|
||||
classifier,
|
||||
X,
|
||||
response_method=response_method,
|
||||
pos_label="whatever",
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("response_method", ["predict_proba", "predict_log_proba"])
|
||||
def test_get_response_values_classifier_inconsistent_y_pred_for_binary_proba(
|
||||
response_method,
|
||||
):
|
||||
"""Check that `_get_response_values` will raise an error when `y_pred` has a
|
||||
single class with `predict_proba`."""
|
||||
X, y_two_class = make_classification(n_samples=10, n_classes=2, random_state=0)
|
||||
y_single_class = np.zeros_like(y_two_class)
|
||||
classifier = DecisionTreeClassifier().fit(X, y_single_class)
|
||||
|
||||
err_msg = (
|
||||
r"Got predict_proba of shape \(10, 1\), but need classifier with "
|
||||
r"two classes"
|
||||
)
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_get_response_values(classifier, X, response_method=response_method)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("return_response_method_used", [True, False])
|
||||
def test_get_response_values_binary_classifier_decision_function(
|
||||
return_response_method_used,
|
||||
):
|
||||
"""Check the behaviour of `_get_response_values` with `decision_function`
|
||||
and binary classifier."""
|
||||
X, y = make_classification(
|
||||
n_samples=10,
|
||||
n_classes=2,
|
||||
weights=[0.3, 0.7],
|
||||
random_state=0,
|
||||
)
|
||||
classifier = LogisticRegression().fit(X, y)
|
||||
response_method = "decision_function"
|
||||
|
||||
# default `pos_label`
|
||||
results = _get_response_values(
|
||||
classifier,
|
||||
X,
|
||||
response_method=response_method,
|
||||
pos_label=None,
|
||||
return_response_method_used=return_response_method_used,
|
||||
)
|
||||
assert_allclose(results[0], classifier.decision_function(X))
|
||||
assert results[1] == 1
|
||||
if return_response_method_used:
|
||||
assert results[2] == "decision_function"
|
||||
|
||||
# when forcing `pos_label=classifier.classes_[0]`
|
||||
results = _get_response_values(
|
||||
classifier,
|
||||
X,
|
||||
response_method=response_method,
|
||||
pos_label=classifier.classes_[0],
|
||||
return_response_method_used=return_response_method_used,
|
||||
)
|
||||
assert_allclose(results[0], classifier.decision_function(X) * -1)
|
||||
assert results[1] == 0
|
||||
if return_response_method_used:
|
||||
assert results[2] == "decision_function"
|
||||
|
||||
|
||||
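# A hedged, illustration-only sketch (the `_sketch_*` helper is ours, not
# scikit-learn API): forcing `pos_label=classifier.classes_[0]` above only
# negates `decision_function`, because the decision value is a signed score
# for the positive class; declaring the other class positive flips the sign.
def _sketch_decision_function_sign_flip():
    scores = np.array([-1.5, 0.2, 3.0])  # hypothetical decision scores
    scores_for_other_label = -scores  # expected values for the flipped label
    assert np.allclose(scores + scores_for_other_label, 0.0)

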
@pytest.mark.parametrize("return_response_method_used", [True, False])
|
||||
@pytest.mark.parametrize("response_method", ["predict_proba", "predict_log_proba"])
|
||||
def test_get_response_values_binary_classifier_predict_proba(
|
||||
return_response_method_used, response_method
|
||||
):
|
||||
"""Check that `_get_response_values` with `predict_proba` and binary
|
||||
classifier."""
|
||||
X, y = make_classification(
|
||||
n_samples=10,
|
||||
n_classes=2,
|
||||
weights=[0.3, 0.7],
|
||||
random_state=0,
|
||||
)
|
||||
classifier = LogisticRegression().fit(X, y)
|
||||
|
||||
# default `pos_label`
|
||||
results = _get_response_values(
|
||||
classifier,
|
||||
X,
|
||||
response_method=response_method,
|
||||
pos_label=None,
|
||||
return_response_method_used=return_response_method_used,
|
||||
)
|
||||
assert_allclose(results[0], getattr(classifier, response_method)(X)[:, 1])
|
||||
assert results[1] == 1
|
||||
if return_response_method_used:
|
||||
assert len(results) == 3
|
||||
assert results[2] == response_method
|
||||
else:
|
||||
assert len(results) == 2
|
||||
|
||||
# when forcing `pos_label=classifier.classes_[0]`
|
||||
y_pred, pos_label, *_ = _get_response_values(
|
||||
classifier,
|
||||
X,
|
||||
response_method=response_method,
|
||||
pos_label=classifier.classes_[0],
|
||||
return_response_method_used=return_response_method_used,
|
||||
)
|
||||
assert_allclose(y_pred, getattr(classifier, response_method)(X)[:, 0])
|
||||
assert pos_label == 0
|
||||
|
||||
|
||||
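# A hedged sketch of the column-selection rule the test above relies on:
# for a binary classifier, `predict_proba` returns one column per class in
# `classes_` order, so picking the other `pos_label` selects the other
# column, and the two columns always sum to one. Illustration only, with
# made-up probabilities (`predict_log_proba` selects columns the same way,
# just in log space).
def _sketch_predict_proba_column_selection():
    proba = np.array([[0.3, 0.7], [0.9, 0.1]])  # hypothetical probabilities
    assert np.allclose(proba.sum(axis=1), 1.0)
    assert np.allclose(proba[:, 0], 1.0 - proba[:, 1])

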
@pytest.mark.parametrize(
    "estimator, X, y, err_msg, params",
    [
        (
            DecisionTreeRegressor(),
            X_binary,
            y_binary,
            "Expected 'estimator' to be a binary classifier",
            {"response_method": "auto"},
        ),
        (
            DecisionTreeClassifier(),
            X_binary,
            y_binary,
            r"pos_label=unknown is not a valid label: It should be one of \[0 1\]",
            {"response_method": "auto", "pos_label": "unknown"},
        ),
        (
            DecisionTreeClassifier(),
            X,
            y,
            "be a binary classifier. Got 3 classes instead.",
            {"response_method": "predict_proba"},
        ),
    ],
)
def test_get_response_error(estimator, X, y, err_msg, params):
    """Check that we raise the proper error messages in _get_response_values_binary."""

    estimator.fit(X, y)
    with pytest.raises(ValueError, match=err_msg):
        _get_response_values_binary(estimator, X, **params)


@pytest.mark.parametrize("return_response_method_used", [True, False])
|
||||
def test_get_response_predict_proba(return_response_method_used):
|
||||
"""Check the behaviour of `_get_response_values_binary` using `predict_proba`."""
|
||||
classifier = DecisionTreeClassifier().fit(X_binary, y_binary)
|
||||
results = _get_response_values_binary(
|
||||
classifier,
|
||||
X_binary,
|
||||
response_method="predict_proba",
|
||||
return_response_method_used=return_response_method_used,
|
||||
)
|
||||
assert_allclose(results[0], classifier.predict_proba(X_binary)[:, 1])
|
||||
assert results[1] == 1
|
||||
if return_response_method_used:
|
||||
assert results[2] == "predict_proba"
|
||||
|
||||
results = _get_response_values_binary(
|
||||
classifier,
|
||||
X_binary,
|
||||
response_method="predict_proba",
|
||||
pos_label=0,
|
||||
return_response_method_used=return_response_method_used,
|
||||
)
|
||||
assert_allclose(results[0], classifier.predict_proba(X_binary)[:, 0])
|
||||
assert results[1] == 0
|
||||
if return_response_method_used:
|
||||
assert results[2] == "predict_proba"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("return_response_method_used", [True, False])
|
||||
def test_get_response_decision_function(return_response_method_used):
|
||||
"""Check the behaviour of `_get_response_values_binary` using decision_function."""
|
||||
classifier = LogisticRegression().fit(X_binary, y_binary)
|
||||
results = _get_response_values_binary(
|
||||
classifier,
|
||||
X_binary,
|
||||
response_method="decision_function",
|
||||
return_response_method_used=return_response_method_used,
|
||||
)
|
||||
assert_allclose(results[0], classifier.decision_function(X_binary))
|
||||
assert results[1] == 1
|
||||
if return_response_method_used:
|
||||
assert results[2] == "decision_function"
|
||||
|
||||
results = _get_response_values_binary(
|
||||
classifier,
|
||||
X_binary,
|
||||
response_method="decision_function",
|
||||
pos_label=0,
|
||||
return_response_method_used=return_response_method_used,
|
||||
)
|
||||
assert_allclose(results[0], classifier.decision_function(X_binary) * -1)
|
||||
assert results[1] == 0
|
||||
if return_response_method_used:
|
||||
assert results[2] == "decision_function"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "estimator, response_method",
    [
        (DecisionTreeClassifier(max_depth=2, random_state=0), "predict_proba"),
        (DecisionTreeClassifier(max_depth=2, random_state=0), "predict_log_proba"),
        (LogisticRegression(), "decision_function"),
    ],
)
def test_get_response_values_multiclass(estimator, response_method):
    """Check that we can call `_get_response_values` with a multiclass estimator.

    It should return the predictions untouched.
    """
    estimator.fit(X, y)
    predictions, pos_label = _get_response_values(
        estimator, X, response_method=response_method
    )

    assert pos_label is None
    assert predictions.shape == (X.shape[0], len(estimator.classes_))
    if response_method == "predict_proba":
        assert np.logical_and(predictions >= 0, predictions <= 1).all()
    elif response_method == "predict_log_proba":
        assert (predictions <= 0.0).all()


def test_get_response_values_with_response_list():
    """Check the behaviour of passing a list of responses to `_get_response_values`."""
    classifier = LogisticRegression().fit(X_binary, y_binary)

    # it should use `predict_proba`
    y_pred, pos_label, response_method = _get_response_values(
        classifier,
        X_binary,
        response_method=["predict_proba", "decision_function"],
        return_response_method_used=True,
    )
    assert_allclose(y_pred, classifier.predict_proba(X_binary)[:, 1])
    assert pos_label == 1
    assert response_method == "predict_proba"

    # it should use `decision_function`
    y_pred, pos_label, response_method = _get_response_values(
        classifier,
        X_binary,
        response_method=["decision_function", "predict_proba"],
        return_response_method_used=True,
    )
    assert_allclose(y_pred, classifier.decision_function(X_binary))
    assert pos_label == 1
    assert response_method == "decision_function"


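# A hedged sketch of the list semantics exercised above: the first response
# method actually provided by the estimator wins. `_sketch_pick_response_method`
# is a hypothetical helper, not the scikit-learn implementation.
def _sketch_pick_response_method(estimator, response_methods):
    for name in response_methods:
        if hasattr(estimator, name):
            return name
    raise ValueError(f"{estimator!r} implements none of {response_methods}")

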
@pytest.mark.parametrize(
    "response_method", ["predict_proba", "decision_function", "predict"]
)
def test_get_response_values_multilabel_indicator(response_method):
    X, Y = make_multilabel_classification(random_state=0)
    estimator = ClassifierChain(LogisticRegression()).fit(X, Y)

    y_pred, pos_label = _get_response_values(
        estimator, X, response_method=response_method
    )
    assert pos_label is None
    assert y_pred.shape == Y.shape

    if response_method == "predict_proba":
        assert np.logical_and(y_pred >= 0, y_pred <= 1).all()
    elif response_method == "decision_function":
        # values returned by `decision_function` are not bounded in [0, 1]
        assert (y_pred < 0).sum() > 0
        assert (y_pred > 1).sum() > 0
    else:  # response_method == "predict"
        assert np.logical_or(y_pred == 0, y_pred == 1).all()
@@ -0,0 +1,185 @@
# Author: Tom Dupre la Tour
#         Joan Massich <mailsik@gmail.com>
#
# License: BSD 3 clause

from itertools import product

import numpy as np
import pytest
from numpy.testing import assert_array_equal

from sklearn.datasets import load_iris
from sklearn.utils._seq_dataset import (
    ArrayDataset32,
    ArrayDataset64,
    CSRDataset32,
    CSRDataset64,
)
from sklearn.utils._testing import assert_allclose
from sklearn.utils.fixes import CSR_CONTAINERS

iris = load_iris()
X64 = iris.data.astype(np.float64)
y64 = iris.target.astype(np.float64)
sample_weight64 = np.arange(y64.size, dtype=np.float64)

X32 = iris.data.astype(np.float32)
y32 = iris.target.astype(np.float32)
sample_weight32 = np.arange(y32.size, dtype=np.float32)

floating = [np.float32, np.float64]


def assert_csr_equal_values(current, expected):
    current.eliminate_zeros()
    expected.eliminate_zeros()
    expected = expected.astype(current.dtype)
    assert current.shape[0] == expected.shape[0]
    assert current.shape[1] == expected.shape[1]
    assert_array_equal(current.data, expected.data)
    assert_array_equal(current.indices, expected.indices)
    assert_array_equal(current.indptr, expected.indptr)


def _make_dense_dataset(float_dtype):
    if float_dtype == np.float32:
        return ArrayDataset32(X32, y32, sample_weight32, seed=42)
    return ArrayDataset64(X64, y64, sample_weight64, seed=42)


def _make_sparse_dataset(csr_container, float_dtype):
    if float_dtype == np.float32:
        X, y, sample_weight, csr_dataset = X32, y32, sample_weight32, CSRDataset32
    else:
        X, y, sample_weight, csr_dataset = X64, y64, sample_weight64, CSRDataset64
    X = csr_container(X)
    return csr_dataset(X.data, X.indptr, X.indices, y, sample_weight, seed=42)


def _make_dense_datasets():
    return [_make_dense_dataset(float_dtype) for float_dtype in floating]


def _make_sparse_datasets():
    return [
        _make_sparse_dataset(csr_container, float_dtype)
        for csr_container, float_dtype in product(CSR_CONTAINERS, floating)
    ]


def _make_fused_types_datasets():
    all_datasets = _make_dense_datasets() + _make_sparse_datasets()
    # group the datasets by array type to get (float32, float64) pairs
    return (all_datasets[idx : idx + 2] for idx in range(0, len(all_datasets), 2))


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
@pytest.mark.parametrize("dataset", _make_dense_datasets() + _make_sparse_datasets())
|
||||
def test_seq_dataset_basic_iteration(dataset, csr_container):
|
||||
NUMBER_OF_RUNS = 5
|
||||
X_csr64 = csr_container(X64)
|
||||
for _ in range(NUMBER_OF_RUNS):
|
||||
# next sample
|
||||
xi_, yi, swi, idx = dataset._next_py()
|
||||
xi = csr_container(xi_, shape=(1, X64.shape[1]))
|
||||
|
||||
assert_csr_equal_values(xi, X_csr64[[idx]])
|
||||
assert yi == y64[idx]
|
||||
assert swi == sample_weight64[idx]
|
||||
|
||||
# random sample
|
||||
xi_, yi, swi, idx = dataset._random_py()
|
||||
xi = csr_container(xi_, shape=(1, X64.shape[1]))
|
||||
|
||||
assert_csr_equal_values(xi, X_csr64[[idx]])
|
||||
assert yi == y64[idx]
|
||||
assert swi == sample_weight64[idx]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dense_dataset,sparse_dataset",
    [
        (
            _make_dense_dataset(float_dtype),
            _make_sparse_dataset(csr_container, float_dtype),
        )
        for float_dtype, csr_container in product(floating, CSR_CONTAINERS)
    ],
)
def test_seq_dataset_shuffle(dense_dataset, sparse_dataset):
    # not shuffled
    for i in range(5):
        _, _, _, idx1 = dense_dataset._next_py()
        _, _, _, idx2 = sparse_dataset._next_py()
        assert idx1 == i
        assert idx2 == i

    for i in [132, 50, 9, 18, 58]:
        _, _, _, idx1 = dense_dataset._random_py()
        _, _, _, idx2 = sparse_dataset._random_py()
        assert idx1 == i
        assert idx2 == i

    seed = 77
    dense_dataset._shuffle_py(seed)
    sparse_dataset._shuffle_py(seed)

    idx_next = [63, 91, 148, 87, 29]
    idx_shuffle = [137, 125, 56, 121, 127]
    for i, j in zip(idx_next, idx_shuffle):
        _, _, _, idx1 = dense_dataset._next_py()
        _, _, _, idx2 = sparse_dataset._next_py()
        assert idx1 == i
        assert idx2 == i

        _, _, _, idx1 = dense_dataset._random_py()
        _, _, _, idx2 = sparse_dataset._random_py()
        assert idx1 == j
        assert idx2 == j


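# A hedged, numpy-only illustration of the property the test above relies
# on: shuffling two index sequences with the same seed yields the same
# permutation, which is why the dense and sparse datasets stay in lockstep.
def _sketch_same_seed_same_permutation():
    idx_a, idx_b = np.arange(10), np.arange(10)
    np.random.RandomState(77).shuffle(idx_a)
    np.random.RandomState(77).shuffle(idx_b)
    assert np.array_equal(idx_a, idx_b)

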
@pytest.mark.parametrize("dataset_32,dataset_64", _make_fused_types_datasets())
|
||||
def test_fused_types_consistency(dataset_32, dataset_64):
|
||||
NUMBER_OF_RUNS = 5
|
||||
for _ in range(NUMBER_OF_RUNS):
|
||||
# next sample
|
||||
(xi_data32, _, _), yi32, _, _ = dataset_32._next_py()
|
||||
(xi_data64, _, _), yi64, _, _ = dataset_64._next_py()
|
||||
|
||||
assert xi_data32.dtype == np.float32
|
||||
assert xi_data64.dtype == np.float64
|
||||
|
||||
assert_allclose(xi_data64, xi_data32, rtol=1e-5)
|
||||
assert_allclose(yi64, yi32, rtol=1e-5)
|
||||
|
||||
|
||||
def test_buffer_dtype_mismatch_error():
    with pytest.raises(ValueError, match="Buffer dtype mismatch"):
        ArrayDataset64(X32, y32, sample_weight32, seed=42)

    with pytest.raises(ValueError, match="Buffer dtype mismatch"):
        ArrayDataset32(X64, y64, sample_weight64, seed=42)

    for csr_container in CSR_CONTAINERS:
        X_csr32 = csr_container(X32)
        X_csr64 = csr_container(X64)
        with pytest.raises(ValueError, match="Buffer dtype mismatch"):
            CSRDataset64(
                X_csr32.data,
                X_csr32.indptr,
                X_csr32.indices,
                y32,
                sample_weight32,
                seed=42,
            )

        with pytest.raises(ValueError, match="Buffer dtype mismatch"):
            CSRDataset32(
                X_csr64.data,
                X_csr64.indptr,
                X_csr64.indices,
                y64,
                sample_weight64,
                seed=42,
            )
@@ -0,0 +1,464 @@
import importlib
from collections import namedtuple

import numpy as np
import pytest
from numpy.testing import assert_array_equal

from sklearn._config import config_context, get_config
from sklearn.preprocessing import StandardScaler
from sklearn.utils._set_output import (
    ADAPTERS_MANAGER,
    ContainerAdapterProtocol,
    _get_adapter_from_container,
    _get_output_config,
    _safe_set_output,
    _SetOutputMixin,
    _wrap_data_with_container,
    check_library_installed,
)
from sklearn.utils.fixes import CSR_CONTAINERS


def test_pandas_adapter():
    """Check pandas adapter has expected behavior."""
    pd = pytest.importorskip("pandas")
    X_np = np.asarray([[1, 0, 3], [0, 0, 1]])
    columns = np.asarray(["f0", "f1", "f2"], dtype=object)
    index = np.asarray([0, 1])
    X_df_orig = pd.DataFrame([[1, 2], [1, 3]], index=index)

    adapter = ADAPTERS_MANAGER.adapters["pandas"]
    X_container = adapter.create_container(X_np, X_df_orig, columns=lambda: columns)
    assert isinstance(X_container, pd.DataFrame)
    assert_array_equal(X_container.columns, columns)
    assert_array_equal(X_container.index, index)

    # Input dataframe's index does not change
    new_columns = np.asarray(["f0", "f1"], dtype=object)
    X_df = pd.DataFrame([[1, 2], [1, 3]], index=[10, 12])
    new_df = adapter.create_container(X_df, X_df_orig, columns=new_columns)
    assert_array_equal(new_df.columns, new_columns)
    assert_array_equal(new_df.index, X_df.index)

    assert adapter.is_supported_container(X_df)
    assert not adapter.is_supported_container(X_np)

    # adapter.rename_columns updates the columns
    new_columns = np.array(["a", "c"], dtype=object)
    new_df = adapter.rename_columns(X_df, new_columns)
    assert_array_equal(new_df.columns, new_columns)

    # adapter.hstack stacks the dataframes horizontally.
    X_df_1 = pd.DataFrame([[1, 2, 5], [3, 4, 6]], columns=["a", "b", "e"])
    X_df_2 = pd.DataFrame([[4], [5]], columns=["c"])
    X_stacked = adapter.hstack([X_df_1, X_df_2])

    expected_df = pd.DataFrame(
        [[1, 2, 5, 4], [3, 4, 6, 5]], columns=["a", "b", "e", "c"]
    )
    pd.testing.assert_frame_equal(X_stacked, expected_df)

    # check that we update the columns properly even with duplicate column
    # names; this use case potentially happens when using ColumnTransformer.
    # Non-regression test for gh-28260.
    X_df = pd.DataFrame([[1, 2], [1, 3]], columns=["a", "a"])
    new_columns = np.array(["x__a", "y__a"], dtype=object)
    new_df = adapter.rename_columns(X_df, new_columns)
    assert_array_equal(new_df.columns, new_columns)

    # check the behavior of the `inplace` parameter in `create_container`:
    # with `inplace=False` we should trigger a copy
    X_df = pd.DataFrame([[1, 2], [1, 3]], index=index)
    X_output = adapter.create_container(X_df, X_df, columns=["a", "b"], inplace=False)
    assert X_output is not X_df
    assert list(X_df.columns) == [0, 1]
    assert list(X_output.columns) == ["a", "b"]

    # with `inplace=True` the operation is in place
    X_df = pd.DataFrame([[1, 2], [1, 3]], index=index)
    X_output = adapter.create_container(X_df, X_df, columns=["a", "b"], inplace=True)
    assert X_output is X_df
    assert list(X_df.columns) == ["a", "b"]
    assert list(X_output.columns) == ["a", "b"]


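# A hedged sketch (plain pandas, requires pandas at runtime) of the
# copy-vs-inplace contract checked above, without going through the adapter:
# the copying path returns a new frame, the in-place path mutates the input.
def _sketch_pandas_copy_vs_inplace():
    import pandas as pd

    df = pd.DataFrame([[1, 2]], columns=[0, 1])
    renamed = df.set_axis(["a", "b"], axis=1)  # returns a new frame
    assert renamed is not df
    assert list(df.columns) == [0, 1]
    df.columns = ["a", "b"]  # mutates the frame in place
    assert list(df.columns) == ["a", "b"]

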
def test_polars_adapter():
    """Check Polars adapter has expected behavior."""
    pl = pytest.importorskip("polars")
    X_np = np.array([[1, 0, 3], [0, 0, 1]])
    columns = ["f1", "f2", "f3"]
    X_df_orig = pl.DataFrame(X_np, schema=columns, orient="row")

    adapter = ADAPTERS_MANAGER.adapters["polars"]
    X_container = adapter.create_container(X_np, X_df_orig, columns=lambda: columns)

    assert isinstance(X_container, pl.DataFrame)
    assert_array_equal(X_container.columns, columns)

    # Update columns with create_container
    new_columns = np.asarray(["a", "b", "c"], dtype=object)
    new_df = adapter.create_container(X_df_orig, X_df_orig, columns=new_columns)
    assert_array_equal(new_df.columns, new_columns)

    assert adapter.is_supported_container(X_df_orig)
    assert not adapter.is_supported_container(X_np)

    # adapter.rename_columns updates the columns
    new_columns = np.array(["a", "c", "g"], dtype=object)
    new_df = adapter.rename_columns(X_df_orig, new_columns)
    assert_array_equal(new_df.columns, new_columns)

    # adapter.hstack stacks the dataframes horizontally.
    X_df_1 = pl.DataFrame([[1, 2, 5], [3, 4, 6]], schema=["a", "b", "e"], orient="row")
    X_df_2 = pl.DataFrame([[4], [5]], schema=["c"], orient="row")
    X_stacked = adapter.hstack([X_df_1, X_df_2])

    expected_df = pl.DataFrame(
        [[1, 2, 5, 4], [3, 4, 6, 5]], schema=["a", "b", "e", "c"], orient="row"
    )
    from polars.testing import assert_frame_equal

    assert_frame_equal(X_stacked, expected_df)

    # check the behavior of the `inplace` parameter in `create_container`:
    # with `inplace=False` we should trigger a copy
    X_df = pl.DataFrame([[1, 2], [1, 3]], schema=["a", "b"], orient="row")
    X_output = adapter.create_container(X_df, X_df, columns=["c", "d"], inplace=False)
    assert X_output is not X_df
    assert list(X_df.columns) == ["a", "b"]
    assert list(X_output.columns) == ["c", "d"]

    # with `inplace=True` the operation is in place
    X_df = pl.DataFrame([[1, 2], [1, 3]], schema=["a", "b"], orient="row")
    X_output = adapter.create_container(X_df, X_df, columns=["c", "d"], inplace=True)
    assert X_output is X_df
    assert list(X_df.columns) == ["c", "d"]
    assert list(X_output.columns) == ["c", "d"]


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
def test__container_error_validation(csr_container):
|
||||
"""Check errors in _wrap_data_with_container."""
|
||||
X = np.asarray([[1, 0, 3], [0, 0, 1]])
|
||||
X_csr = csr_container(X)
|
||||
match = "The transformer outputs a scipy sparse matrix."
|
||||
with config_context(transform_output="pandas"):
|
||||
with pytest.raises(ValueError, match=match):
|
||||
_wrap_data_with_container("transform", X_csr, X, StandardScaler())
|
||||
|
||||
|
||||
class EstimatorWithoutSetOutputAndWithoutTransform:
    pass


class EstimatorNoSetOutputWithTransform:
    def transform(self, X, y=None):
        return X  # pragma: no cover


class EstimatorWithSetOutput(_SetOutputMixin):
    def fit(self, X, y=None):
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X, y=None):
        return X

    def get_feature_names_out(self, input_features=None):
        return np.asarray([f"X{i}" for i in range(self.n_features_in_)], dtype=object)


def test__safe_set_output():
    """Check _safe_set_output works as expected."""

    # An estimator without `transform` will not raise when setting the
    # output config for transform.
    est = EstimatorWithoutSetOutputAndWithoutTransform()
    _safe_set_output(est, transform="pandas")

    # An estimator with `transform` but without `set_output` will raise
    est = EstimatorNoSetOutputWithTransform()
    with pytest.raises(ValueError, match="Unable to configure output"):
        _safe_set_output(est, transform="pandas")

    est = EstimatorWithSetOutput().fit(np.asarray([[1, 2, 3]]))
    _safe_set_output(est, transform="pandas")
    config = _get_output_config("transform", est)
    assert config["dense"] == "pandas"

    _safe_set_output(est, transform="default")
    config = _get_output_config("transform", est)
    assert config["dense"] == "default"

    # `transform=None` is a no-op, so the config remains "default"
    _safe_set_output(est, transform=None)
    config = _get_output_config("transform", est)
    assert config["dense"] == "default"


class EstimatorNoSetOutputWithTransformNoFeatureNamesOut(_SetOutputMixin):
    def transform(self, X, y=None):
        return X  # pragma: no cover


def test_set_output_mixin():
    """Estimator without get_feature_names_out does not define `set_output`."""
    est = EstimatorNoSetOutputWithTransformNoFeatureNamesOut()
    assert not hasattr(est, "set_output")


def test__safe_set_output_error():
    """Check transform with invalid config."""
    X = np.asarray([[1, 0, 3], [0, 0, 1]])

    est = EstimatorWithSetOutput()
    _safe_set_output(est, transform="bad")

    msg = "output config must be in"
    with pytest.raises(ValueError, match=msg):
        est.transform(X)


@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
|
||||
def test_set_output_method(dataframe_lib):
|
||||
"""Check that the output is a dataframe."""
|
||||
lib = pytest.importorskip(dataframe_lib)
|
||||
|
||||
X = np.asarray([[1, 0, 3], [0, 0, 1]])
|
||||
est = EstimatorWithSetOutput().fit(X)
|
||||
|
||||
# transform=None is a no-op
|
||||
est2 = est.set_output(transform=None)
|
||||
assert est2 is est
|
||||
X_trans_np = est2.transform(X)
|
||||
assert isinstance(X_trans_np, np.ndarray)
|
||||
|
||||
est.set_output(transform=dataframe_lib)
|
||||
|
||||
X_trans_pd = est.transform(X)
|
||||
|
||||
assert isinstance(X_trans_pd, lib.DataFrame)
|
||||
|
||||
|
||||
def test_set_output_method_error():
|
||||
"""Check transform fails with invalid transform."""
|
||||
|
||||
X = np.asarray([[1, 0, 3], [0, 0, 1]])
|
||||
est = EstimatorWithSetOutput().fit(X)
|
||||
est.set_output(transform="bad")
|
||||
|
||||
msg = "output config must be in"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
est.transform(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("transform_output", ["pandas", "polars"])
|
||||
def test__get_output_config(transform_output):
|
||||
"""Check _get_output_config works as expected."""
|
||||
|
||||
# Without a configuration set, the global config is used
|
||||
global_config = get_config()["transform_output"]
|
||||
config = _get_output_config("transform")
|
||||
assert config["dense"] == global_config
|
||||
|
||||
with config_context(transform_output=transform_output):
|
||||
# with estimator=None, the global config is used
|
||||
config = _get_output_config("transform")
|
||||
assert config["dense"] == transform_output
|
||||
|
||||
est = EstimatorNoSetOutputWithTransform()
|
||||
config = _get_output_config("transform", est)
|
||||
assert config["dense"] == transform_output
|
||||
|
||||
est = EstimatorWithSetOutput()
|
||||
# If estimator has not config, use global config
|
||||
config = _get_output_config("transform", est)
|
||||
assert config["dense"] == transform_output
|
||||
|
||||
# If estimator has a config, use local config
|
||||
est.set_output(transform="default")
|
||||
config = _get_output_config("transform", est)
|
||||
assert config["dense"] == "default"
|
||||
|
||||
est.set_output(transform=transform_output)
|
||||
config = _get_output_config("transform", est)
|
||||
assert config["dense"] == transform_output
|
||||
|
||||
|
||||
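# A hedged sketch of the precedence rule the test above checks: a value set
# on the estimator via `set_output` wins over the global `transform_output`
# configuration. `_sketch_resolve_dense_output` is a hypothetical helper,
# not the scikit-learn implementation.
def _sketch_resolve_dense_output(global_config, estimator_config=None):
    # the estimator-level configuration takes precedence when it exists
    return estimator_config if estimator_config is not None else global_config

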
class EstimatorWithSetOutputNoAutoWrap(_SetOutputMixin, auto_wrap_output_keys=None):
    def transform(self, X, y=None):
        return X


def test_get_output_auto_wrap_false():
    """Check that auto_wrap_output_keys=None does not wrap."""
    est = EstimatorWithSetOutputNoAutoWrap()
    assert not hasattr(est, "set_output")

    X = np.asarray([[1, 0, 3], [0, 0, 1]])
    assert X is est.transform(X)


def test_auto_wrap_output_keys_errors_with_incorrect_input():
    msg = "auto_wrap_output_keys must be None or a tuple of keys."
    with pytest.raises(ValueError, match=msg):

        class BadEstimator(_SetOutputMixin, auto_wrap_output_keys="bad_parameter"):
            pass


class AnotherMixin:
    def __init_subclass__(cls, custom_parameter, **kwargs):
        super().__init_subclass__(**kwargs)
        cls.custom_parameter = custom_parameter


def test_set_output_mixin_custom_mixin():
    """Check that multiple `__init_subclass__` implementations pass parameters up."""

    class BothMixinEstimator(_SetOutputMixin, AnotherMixin, custom_parameter=123):
        def transform(self, X, y=None):
            return X

        def get_feature_names_out(self, input_features=None):
            return input_features

    est = BothMixinEstimator()
    assert est.custom_parameter == 123
    assert hasattr(est, "set_output")


def test_set_output_mro():
    """Check that multi-inheritance resolves to the correct class method.

    Non-regression test gh-25293.
    """

    class Base(_SetOutputMixin):
        def transform(self, X):
            return "Base"  # noqa

    class A(Base):
        pass

    class B(Base):
        def transform(self, X):
            return "B"

    class C(A, B):
        pass

    assert C().transform(None) == "B"


class EstimatorWithSetOutputIndex(_SetOutputMixin):
    def fit(self, X, y=None):
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X, y=None):
        import pandas as pd

        # transform by giving the output a new index.
        return pd.DataFrame(X.to_numpy(), index=[f"s{i}" for i in range(X.shape[0])])

    def get_feature_names_out(self, input_features=None):
        return np.asarray([f"X{i}" for i in range(self.n_features_in_)], dtype=object)


def test_set_output_pandas_keep_index():
    """Check that set_output does not override index.

    Non-regression test for gh-25730.
    """
    pd = pytest.importorskip("pandas")

    X = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=[0, 1])
    est = EstimatorWithSetOutputIndex().set_output(transform="pandas")
    est.fit(X)

    X_trans = est.transform(X)
    assert_array_equal(X_trans.index, ["s0", "s1"])


class EstimatorReturnTuple(_SetOutputMixin):
    def __init__(self, OutputTuple):
        self.OutputTuple = OutputTuple

    def transform(self, X, y=None):
        return self.OutputTuple(X, 2 * X)


def test_set_output_named_tuple_out():
    """Check that namedtuples are kept by default."""
    Output = namedtuple("Output", "X, Y")
    X = np.asarray([[1, 2, 3]])
    est = EstimatorReturnTuple(OutputTuple=Output)
    X_trans = est.transform(X)

    assert isinstance(X_trans, Output)
    assert_array_equal(X_trans.X, X)
    assert_array_equal(X_trans.Y, 2 * X)


class EstimatorWithListInput(_SetOutputMixin):
    def fit(self, X, y=None):
        assert isinstance(X, list)
        self.n_features_in_ = len(X[0])
        return self

    def transform(self, X, y=None):
        return X

    def get_feature_names_out(self, input_features=None):
        return np.asarray([f"X{i}" for i in range(self.n_features_in_)], dtype=object)


@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
def test_set_output_list_input(dataframe_lib):
    """Check set_output for list input.

    Non-regression test for #27037.
    """
    lib = pytest.importorskip(dataframe_lib)

    X = [[0, 1, 2, 3], [4, 5, 6, 7]]
    est = EstimatorWithListInput()
    est.set_output(transform=dataframe_lib)

    X_out = est.fit(X).transform(X)
    assert isinstance(X_out, lib.DataFrame)
    assert_array_equal(X_out.columns, ["X0", "X1", "X2", "X3"])


@pytest.mark.parametrize("name", sorted(ADAPTERS_MANAGER.adapters))
|
||||
def test_adapter_class_has_interface(name):
|
||||
"""Check adapters have the correct interface."""
|
||||
assert isinstance(ADAPTERS_MANAGER.adapters[name], ContainerAdapterProtocol)
|
||||
|
||||
|
||||
def test_check_library_installed(monkeypatch):
|
||||
"""Check import error changed."""
|
||||
orig_import_module = importlib.import_module
|
||||
|
||||
def patched_import_module(name):
|
||||
if name == "pandas":
|
||||
raise ImportError()
|
||||
orig_import_module(name, package=None)
|
||||
|
||||
monkeypatch.setattr(importlib, "import_module", patched_import_module)
|
||||
|
||||
msg = "Setting output container to 'pandas' requires"
|
||||
with pytest.raises(ImportError, match=msg):
|
||||
check_library_installed("pandas")
|
||||
|
||||
|
||||
def test_get_adapter_from_container():
|
||||
"""Check the behavior fo `_get_adapter_from_container`."""
|
||||
pd = pytest.importorskip("pandas")
|
||||
X = pd.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]})
|
||||
adapter = _get_adapter_from_container(X)
|
||||
assert adapter.container_lib == "pandas"
|
||||
err_msg = "The container does not have a registered adapter in scikit-learn."
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
_get_adapter_from_container(X.to_numpy())
|
||||
@@ -0,0 +1,65 @@
from collections import defaultdict

import numpy as np
from numpy.testing import assert_array_almost_equal

from sklearn.utils.graph import single_source_shortest_path_length


def floyd_warshall_slow(graph, directed=False):
    N = graph.shape[0]

    # set missing (zero) entries to infinity
    graph[np.where(graph == 0)] = np.inf

    # set diagonal to zero
    graph.flat[:: N + 1] = 0

    if not directed:
        graph = np.minimum(graph, graph.T)

    for k in range(N):
        for i in range(N):
            for j in range(N):
                graph[i, j] = min(graph[i, j], graph[i, k] + graph[k, j])

    graph[np.where(np.isinf(graph))] = 0

    return graph


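# A hedged usage sketch for the reference implementation above: on a tiny
# three-node path graph (edges 0-1 with weight 1 and 1-2 with weight 2),
# the shortest 0 -> 2 distance is the two-hop sum, 3.0. Illustration only.
def _sketch_floyd_warshall_slow_usage():
    g = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 2.0], [0.0, 2.0, 0.0]])
    dist = floyd_warshall_slow(g.copy())
    assert dist[0, 2] == 3.0

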
def generate_graph(N=20):
    # sparse grid of distances
    rng = np.random.RandomState(0)
    dist_matrix = rng.random_sample((N, N))

    # make symmetric: distances are not direction-dependent
    dist_matrix = dist_matrix + dist_matrix.T

    # make graph sparse
    i = (rng.randint(N, size=N * N // 2), rng.randint(N, size=N * N // 2))
    dist_matrix[i] = 0

    # set diagonal to zero
    dist_matrix.flat[:: N + 1] = 0

    return dist_matrix


def test_shortest_path():
    dist_matrix = generate_graph(20)
    # We compare path length and not costs (-> set distances to 0 or 1)
    dist_matrix[dist_matrix != 0] = 1

    for directed in (True, False):
        if not directed:
            dist_matrix = np.minimum(dist_matrix, dist_matrix.T)

        graph_py = floyd_warshall_slow(dist_matrix.copy(), directed)
        for i in range(dist_matrix.shape[0]):
            # Non-reachable nodes have distance 0 in graph_py
            dist_dict = defaultdict(int)
            dist_dict.update(single_source_shortest_path_length(dist_matrix, i))

            for j in range(graph_py[i].shape[0]):
                assert_array_almost_equal(dist_dict[j], graph_py[i, j])
@@ -0,0 +1,40 @@
from threadpoolctl import threadpool_info

from sklearn.utils._show_versions import _get_deps_info, _get_sys_info, show_versions
from sklearn.utils._testing import ignore_warnings


def test_get_sys_info():
    sys_info = _get_sys_info()

    assert "python" in sys_info
    assert "executable" in sys_info
    assert "machine" in sys_info


def test_get_deps_info():
    with ignore_warnings():
        deps_info = _get_deps_info()

    assert "pip" in deps_info
    assert "setuptools" in deps_info
    assert "sklearn" in deps_info
    assert "numpy" in deps_info
    assert "scipy" in deps_info
    assert "Cython" in deps_info
    assert "pandas" in deps_info
    assert "matplotlib" in deps_info
    assert "joblib" in deps_info


def test_show_versions(capsys):
    with ignore_warnings():
        show_versions()
        out, err = capsys.readouterr()

    assert "python" in out
    assert "numpy" in out

    info = threadpool_info()
    if info:
        assert "threadpoolctl info:" in out
@@ -0,0 +1,998 @@
import numpy as np
import pytest
import scipy.sparse as sp
from numpy.random import RandomState
from numpy.testing import assert_array_almost_equal, assert_array_equal
from scipy import linalg

from sklearn.datasets import make_classification
from sklearn.utils._testing import assert_allclose
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS, LIL_CONTAINERS
from sklearn.utils.sparsefuncs import (
    _implicit_column_offset,
    count_nonzero,
    csc_median_axis_0,
    incr_mean_variance_axis,
    inplace_column_scale,
    inplace_row_scale,
    inplace_swap_column,
    inplace_swap_row,
    mean_variance_axis,
    min_max_axis,
)
from sklearn.utils.sparsefuncs_fast import (
    assign_rows_csr,
    csr_row_norms,
    inplace_csr_row_normalize_l1,
    inplace_csr_row_normalize_l2,
)


@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
@pytest.mark.parametrize("lil_container", LIL_CONTAINERS)
|
||||
def test_mean_variance_axis0(csc_container, csr_container, lil_container):
|
||||
X, _ = make_classification(5, 4, random_state=0)
|
||||
# Sparsify the array a little bit
|
||||
X[0, 0] = 0
|
||||
X[2, 1] = 0
|
||||
X[4, 3] = 0
|
||||
X_lil = lil_container(X)
|
||||
X_lil[1, 0] = 0
|
||||
X[1, 0] = 0
|
||||
|
||||
with pytest.raises(TypeError):
|
||||
mean_variance_axis(X_lil, axis=0)
|
||||
|
||||
X_csr = csr_container(X_lil)
|
||||
X_csc = csc_container(X_lil)
|
||||
|
||||
expected_dtypes = [
|
||||
(np.float32, np.float32),
|
||||
(np.float64, np.float64),
|
||||
(np.int32, np.float64),
|
||||
(np.int64, np.float64),
|
||||
]
|
||||
|
||||
for input_dtype, output_dtype in expected_dtypes:
|
||||
X_test = X.astype(input_dtype)
|
||||
for X_sparse in (X_csr, X_csc):
|
||||
X_sparse = X_sparse.astype(input_dtype)
|
||||
X_means, X_vars = mean_variance_axis(X_sparse, axis=0)
|
||||
assert X_means.dtype == output_dtype
|
||||
assert X_vars.dtype == output_dtype
|
||||
assert_array_almost_equal(X_means, np.mean(X_test, axis=0))
|
||||
assert_array_almost_equal(X_vars, np.var(X_test, axis=0))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
|
||||
@pytest.mark.parametrize("sparse_constructor", CSC_CONTAINERS + CSR_CONTAINERS)
|
||||
def test_mean_variance_axis0_precision(dtype, sparse_constructor):
|
||||
# Check that there's no big loss of precision when the real variance is
|
||||
# exactly 0. (#19766)
|
||||
rng = np.random.RandomState(0)
|
||||
X = np.full(fill_value=100.0, shape=(1000, 1), dtype=dtype)
|
||||
# Add some missing records which should be ignored:
|
||||
missing_indices = rng.choice(np.arange(X.shape[0]), 10, replace=False)
|
||||
X[missing_indices, 0] = np.nan
|
||||
X = sparse_constructor(X)
|
||||
|
||||
# Random positive weights:
|
||||
sample_weight = rng.rand(X.shape[0]).astype(dtype)
|
||||
|
||||
_, var = mean_variance_axis(X, weights=sample_weight, axis=0)
|
||||
|
||||
assert var < np.finfo(dtype).eps
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
@pytest.mark.parametrize("lil_container", LIL_CONTAINERS)
|
||||
def test_mean_variance_axis1(csc_container, csr_container, lil_container):
|
||||
X, _ = make_classification(5, 4, random_state=0)
|
||||
# Sparsify the array a little bit
|
||||
X[0, 0] = 0
|
||||
X[2, 1] = 0
|
||||
X[4, 3] = 0
|
||||
X_lil = lil_container(X)
|
||||
X_lil[1, 0] = 0
|
||||
X[1, 0] = 0
|
||||
|
||||
with pytest.raises(TypeError):
|
||||
mean_variance_axis(X_lil, axis=1)
|
||||
|
||||
X_csr = csr_container(X_lil)
|
||||
X_csc = csc_container(X_lil)
|
||||
|
||||
expected_dtypes = [
|
||||
(np.float32, np.float32),
|
||||
(np.float64, np.float64),
|
||||
(np.int32, np.float64),
|
||||
(np.int64, np.float64),
|
||||
]
|
||||
|
||||
for input_dtype, output_dtype in expected_dtypes:
|
||||
X_test = X.astype(input_dtype)
|
||||
for X_sparse in (X_csr, X_csc):
|
||||
X_sparse = X_sparse.astype(input_dtype)
|
||||
X_means, X_vars = mean_variance_axis(X_sparse, axis=0)
|
||||
assert X_means.dtype == output_dtype
|
||||
assert X_vars.dtype == output_dtype
|
||||
assert_array_almost_equal(X_means, np.mean(X_test, axis=0))
|
||||
assert_array_almost_equal(X_vars, np.var(X_test, axis=0))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ["Xw", "X", "weights"],
    [
        ([[0, 0, 1], [0, 2, 3]], [[0, 0, 1], [0, 2, 3]], [1, 1, 1]),
        ([[0, 0, 1], [0, 1, 1]], [[0, 0, 0, 1], [0, 1, 1, 1]], [1, 2, 1]),
        ([[0, 0, 1], [0, 1, 1]], [[0, 0, 1], [0, 1, 1]], None),
        (
            [[0, np.nan, 2], [0, np.nan, np.nan]],
            [[0, np.nan, 2], [0, np.nan, np.nan]],
            [1.0, 1.0, 1.0],
        ),
        (
            [[0, 0], [1, np.nan], [2, 0], [0, 3], [np.nan, np.nan], [np.nan, 2]],
            [
                [0, 0, 0],
                [1, 1, np.nan],
                [2, 2, 0],
                [0, 0, 3],
                [np.nan, np.nan, np.nan],
                [np.nan, np.nan, 2],
            ],
            [2.0, 1.0],
        ),
        (
            [[1, 0, 1], [0, 3, 1]],
            [[1, 0, 0, 0, 1], [0, 3, 3, 3, 1]],
            np.array([1, 3, 1]),
        ),
    ],
)
@pytest.mark.parametrize("sparse_constructor", CSC_CONTAINERS + CSR_CONTAINERS)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_incr_mean_variance_axis_weighted_axis1(
    Xw, X, weights, sparse_constructor, dtype
):
    axis = 1
    Xw_sparse = sparse_constructor(Xw).astype(dtype)
    X_sparse = sparse_constructor(X).astype(dtype)

    last_mean = np.zeros(np.shape(Xw)[0], dtype=dtype)
    last_var = np.zeros_like(last_mean, dtype=dtype)
    last_n = np.zeros_like(last_mean, dtype=np.int64)
    means0, vars0, n_incr0 = incr_mean_variance_axis(
        X=X_sparse,
        axis=axis,
        last_mean=last_mean,
        last_var=last_var,
        last_n=last_n,
        weights=None,
    )

    means_w0, vars_w0, n_incr_w0 = incr_mean_variance_axis(
        X=Xw_sparse,
        axis=axis,
        last_mean=last_mean,
        last_var=last_var,
        last_n=last_n,
        weights=weights,
    )

    assert means_w0.dtype == dtype
    assert vars_w0.dtype == dtype
    assert n_incr_w0.dtype == dtype

    means_simple, vars_simple = mean_variance_axis(X=X_sparse, axis=axis)

    assert_array_almost_equal(means0, means_w0)
    assert_array_almost_equal(means0, means_simple)
    assert_array_almost_equal(vars0, vars_w0)
    assert_array_almost_equal(vars0, vars_simple)
    assert_array_almost_equal(n_incr0, n_incr_w0)

    # check second round for incremental
    means1, vars1, n_incr1 = incr_mean_variance_axis(
        X=X_sparse,
        axis=axis,
        last_mean=means0,
        last_var=vars0,
        last_n=n_incr0,
        weights=None,
    )

    means_w1, vars_w1, n_incr_w1 = incr_mean_variance_axis(
        X=Xw_sparse,
        axis=axis,
        last_mean=means_w0,
        last_var=vars_w0,
        last_n=n_incr_w0,
        weights=weights,
    )

    assert_array_almost_equal(means1, means_w1)
    assert_array_almost_equal(vars1, vars_w1)
    assert_array_almost_equal(n_incr1, n_incr_w1)

    assert means_w1.dtype == dtype
    assert vars_w1.dtype == dtype
    assert n_incr_w1.dtype == dtype


@pytest.mark.parametrize(
    ["Xw", "X", "weights"],
    [
        ([[0, 0, 1], [0, 2, 3]], [[0, 0, 1], [0, 2, 3]], [1, 1]),
        ([[0, 0, 1], [0, 1, 1]], [[0, 0, 1], [0, 1, 1], [0, 1, 1]], [1, 2]),
        ([[0, 0, 1], [0, 1, 1]], [[0, 0, 1], [0, 1, 1]], None),
        (
            [[0, np.nan, 2], [0, np.nan, np.nan]],
            [[0, np.nan, 2], [0, np.nan, np.nan]],
            [1.0, 1.0],
        ),
        (
            [[0, 0, 1, np.nan, 2, 0], [0, 3, np.nan, np.nan, np.nan, 2]],
            [
                [0, 0, 1, np.nan, 2, 0],
                [0, 0, 1, np.nan, 2, 0],
                [0, 3, np.nan, np.nan, np.nan, 2],
            ],
            [2.0, 1.0],
        ),
        (
            [[1, 0, 1], [0, 0, 1]],
            [[1, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1]],
            np.array([1, 3]),
        ),
    ],
)
@pytest.mark.parametrize("sparse_constructor", CSC_CONTAINERS + CSR_CONTAINERS)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_incr_mean_variance_axis_weighted_axis0(
    Xw, X, weights, sparse_constructor, dtype
):
    axis = 0
    Xw_sparse = sparse_constructor(Xw).astype(dtype)
    X_sparse = sparse_constructor(X).astype(dtype)

    last_mean = np.zeros(np.size(Xw, 1), dtype=dtype)
    last_var = np.zeros_like(last_mean)
    last_n = np.zeros_like(last_mean, dtype=np.int64)
    means0, vars0, n_incr0 = incr_mean_variance_axis(
        X=X_sparse,
        axis=axis,
        last_mean=last_mean,
        last_var=last_var,
        last_n=last_n,
        weights=None,
    )

    means_w0, vars_w0, n_incr_w0 = incr_mean_variance_axis(
        X=Xw_sparse,
        axis=axis,
        last_mean=last_mean,
        last_var=last_var,
        last_n=last_n,
        weights=weights,
    )

    assert means_w0.dtype == dtype
    assert vars_w0.dtype == dtype
    assert n_incr_w0.dtype == dtype

    means_simple, vars_simple = mean_variance_axis(X=X_sparse, axis=axis)

    assert_array_almost_equal(means0, means_w0)
    assert_array_almost_equal(means0, means_simple)
    assert_array_almost_equal(vars0, vars_w0)
    assert_array_almost_equal(vars0, vars_simple)
    assert_array_almost_equal(n_incr0, n_incr_w0)

    # check second round for incremental
    means1, vars1, n_incr1 = incr_mean_variance_axis(
        X=X_sparse,
        axis=axis,
        last_mean=means0,
        last_var=vars0,
        last_n=n_incr0,
        weights=None,
    )

    means_w1, vars_w1, n_incr_w1 = incr_mean_variance_axis(
        X=Xw_sparse,
        axis=axis,
        last_mean=means_w0,
        last_var=vars_w0,
        last_n=n_incr_w0,
        weights=weights,
    )

    assert_array_almost_equal(means1, means_w1)
    assert_array_almost_equal(vars1, vars_w1)
    assert_array_almost_equal(n_incr1, n_incr_w1)

    assert means_w1.dtype == dtype
    assert vars_w1.dtype == dtype
    assert n_incr_w1.dtype == dtype


@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
@pytest.mark.parametrize("lil_container", LIL_CONTAINERS)
|
||||
def test_incr_mean_variance_axis(csc_container, csr_container, lil_container):
|
||||
for axis in [0, 1]:
|
||||
rng = np.random.RandomState(0)
|
||||
n_features = 50
|
||||
n_samples = 10
|
||||
if axis == 0:
|
||||
data_chunks = [rng.randint(0, 2, size=n_features) for i in range(n_samples)]
|
||||
else:
|
||||
data_chunks = [rng.randint(0, 2, size=n_samples) for i in range(n_features)]
|
||||
|
||||
# default params for incr_mean_variance
|
||||
last_mean = np.zeros(n_features) if axis == 0 else np.zeros(n_samples)
|
||||
last_var = np.zeros_like(last_mean)
|
||||
last_n = np.zeros_like(last_mean, dtype=np.int64)
|
||||
|
||||
# Test errors
|
||||
X = np.array(data_chunks[0])
|
||||
X = np.atleast_2d(X)
|
||||
X = X.T if axis == 1 else X
|
||||
X_lil = lil_container(X)
|
||||
X_csr = csr_container(X_lil)
|
||||
|
||||
with pytest.raises(TypeError):
|
||||
incr_mean_variance_axis(
|
||||
X=axis, axis=last_mean, last_mean=last_var, last_var=last_n
|
||||
)
|
||||
with pytest.raises(TypeError):
|
||||
incr_mean_variance_axis(
|
||||
X_lil, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
|
||||
)
|
||||
|
||||
# Test _incr_mean_and_var with a 1 row input
|
||||
X_means, X_vars = mean_variance_axis(X_csr, axis)
|
||||
X_means_incr, X_vars_incr, n_incr = incr_mean_variance_axis(
|
||||
X_csr, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
|
||||
)
|
||||
assert_array_almost_equal(X_means, X_means_incr)
|
||||
assert_array_almost_equal(X_vars, X_vars_incr)
|
||||
# X.shape[axis] picks # samples
|
||||
assert_array_equal(X.shape[axis], n_incr)
|
||||
|
||||
X_csc = csc_container(X_lil)
|
||||
X_means, X_vars = mean_variance_axis(X_csc, axis)
|
||||
assert_array_almost_equal(X_means, X_means_incr)
|
||||
assert_array_almost_equal(X_vars, X_vars_incr)
|
||||
assert_array_equal(X.shape[axis], n_incr)
|
||||
|
||||
# Test _incremental_mean_and_var with whole data
|
||||
X = np.vstack(data_chunks)
|
||||
X = X.T if axis == 1 else X
|
||||
X_lil = lil_container(X)
|
||||
X_csr = csr_container(X_lil)
|
||||
X_csc = csc_container(X_lil)
|
||||
|
||||
expected_dtypes = [
|
||||
(np.float32, np.float32),
|
||||
(np.float64, np.float64),
|
||||
(np.int32, np.float64),
|
||||
(np.int64, np.float64),
|
||||
]
|
||||
|
||||
for input_dtype, output_dtype in expected_dtypes:
|
||||
for X_sparse in (X_csr, X_csc):
|
||||
X_sparse = X_sparse.astype(input_dtype)
|
||||
last_mean = last_mean.astype(output_dtype)
|
||||
last_var = last_var.astype(output_dtype)
|
||||
X_means, X_vars = mean_variance_axis(X_sparse, axis)
|
||||
X_means_incr, X_vars_incr, n_incr = incr_mean_variance_axis(
|
||||
X_sparse,
|
||||
axis=axis,
|
||||
last_mean=last_mean,
|
||||
last_var=last_var,
|
||||
last_n=last_n,
|
||||
)
|
||||
assert X_means_incr.dtype == output_dtype
|
||||
assert X_vars_incr.dtype == output_dtype
|
||||
assert_array_almost_equal(X_means, X_means_incr)
|
||||
assert_array_almost_equal(X_vars, X_vars_incr)
|
||||
assert_array_equal(X.shape[axis], n_incr)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("sparse_constructor", CSC_CONTAINERS + CSR_CONTAINERS)
|
||||
def test_incr_mean_variance_axis_dim_mismatch(sparse_constructor):
|
||||
"""Check that we raise proper error when axis=1 and the dimension mismatch.
|
||||
Non-regression test for:
|
||||
https://github.com/scikit-learn/scikit-learn/pull/18655
|
||||
"""
|
||||
n_samples, n_features = 60, 4
|
||||
rng = np.random.RandomState(42)
|
||||
X = sparse_constructor(rng.rand(n_samples, n_features))
|
||||
|
||||
last_mean = np.zeros(n_features)
|
||||
last_var = np.zeros_like(last_mean)
|
||||
last_n = np.zeros(last_mean.shape, dtype=np.int64)
|
||||
|
||||
kwargs = dict(last_mean=last_mean, last_var=last_var, last_n=last_n)
|
||||
mean0, var0, _ = incr_mean_variance_axis(X, axis=0, **kwargs)
|
||||
assert_allclose(np.mean(X.toarray(), axis=0), mean0)
|
||||
assert_allclose(np.var(X.toarray(), axis=0), var0)
|
||||
|
||||
# test ValueError if axis=1 and last_mean.size == n_features
|
||||
with pytest.raises(ValueError):
|
||||
incr_mean_variance_axis(X, axis=1, **kwargs)
|
||||
|
||||
# test inconsistent shapes of last_mean, last_var, last_n
|
||||
kwargs = dict(last_mean=last_mean[:-1], last_var=last_var, last_n=last_n)
|
||||
with pytest.raises(ValueError):
|
||||
incr_mean_variance_axis(X, axis=0, **kwargs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "X1, X2",
    [
        (
            sp.random(5, 2, density=0.8, format="csr", random_state=0),
            sp.random(13, 2, density=0.8, format="csr", random_state=0),
        ),
        (
            sp.random(5, 2, density=0.8, format="csr", random_state=0),
            sp.hstack(
                [
                    np.full((13, 1), fill_value=np.nan),
                    sp.random(13, 1, density=0.8, random_state=42),
                ],
                format="csr",
            ),
        ),
    ],
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_incr_mean_variance_axis_equivalence_mean_variance(X1, X2, csr_container):
    # non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/16448
    # check that computing the incremental mean and variance is equivalent to
    # computing the mean and variance on the stacked dataset.
    X1 = csr_container(X1)
    X2 = csr_container(X2)
    axis = 0
    last_mean, last_var = np.zeros(X1.shape[1]), np.zeros(X1.shape[1])
    last_n = np.zeros(X1.shape[1], dtype=np.int64)
    updated_mean, updated_var, updated_n = incr_mean_variance_axis(
        X1, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
    )
    updated_mean, updated_var, updated_n = incr_mean_variance_axis(
        X2, axis=axis, last_mean=updated_mean, last_var=updated_var, last_n=updated_n
    )
    X = sp.vstack([X1, X2])
    assert_allclose(updated_mean, np.nanmean(X.toarray(), axis=axis))
    assert_allclose(updated_var, np.nanvar(X.toarray(), axis=axis))
    assert_allclose(updated_n, np.count_nonzero(~np.isnan(X.toarray()), axis=0))


def test_incr_mean_variance_no_new_n():
    # check the behaviour when we update the variance with an empty matrix
    axis = 0
    X1 = sp.random(5, 1, density=0.8, random_state=0).tocsr()
    X2 = sp.random(0, 1, density=0.8, random_state=0).tocsr()
    last_mean, last_var = np.zeros(X1.shape[1]), np.zeros(X1.shape[1])
    last_n = np.zeros(X1.shape[1], dtype=np.int64)
    last_mean, last_var, last_n = incr_mean_variance_axis(
        X1, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
    )
    # update the statistics with an empty matrix, which should be ignored
    updated_mean, updated_var, updated_n = incr_mean_variance_axis(
        X2, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
    )
    assert_allclose(updated_mean, last_mean)
    assert_allclose(updated_var, last_var)
    assert_allclose(updated_n, last_n)


def test_incr_mean_variance_n_float():
    # check the behaviour when last_n is just a number
    axis = 0
    X = sp.random(5, 2, density=0.8, random_state=0).tocsr()
    last_mean, last_var = np.zeros(X.shape[1]), np.zeros(X.shape[1])
    last_n = 0
    _, _, new_n = incr_mean_variance_axis(
        X, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
    )
    assert_allclose(new_n, np.full(X.shape[1], X.shape[0]))


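# A hedged, numpy-only sketch of the pooling identity behind the incremental
# updates tested above: combining per-batch statistics reproduces the mean of
# the concatenated data. Illustration only, with made-up values.
def _sketch_incremental_mean_identity():
    a, b = np.array([1.0, 2.0]), np.array([3.0, 4.0, 5.0])
    n_a, n_b = len(a), len(b)
    pooled = (n_a * a.mean() + n_b * b.mean()) / (n_a + n_b)
    assert np.isclose(pooled, np.concatenate([a, b]).mean())

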
@pytest.mark.parametrize("axis", [0, 1])
|
||||
@pytest.mark.parametrize("sparse_constructor", CSC_CONTAINERS + CSR_CONTAINERS)
|
||||
def test_incr_mean_variance_axis_ignore_nan(axis, sparse_constructor):
|
||||
old_means = np.array([535.0, 535.0, 535.0, 535.0])
|
||||
old_variances = np.array([4225.0, 4225.0, 4225.0, 4225.0])
|
||||
old_sample_count = np.array([2, 2, 2, 2], dtype=np.int64)
|
||||
|
||||
X = sparse_constructor(
|
||||
np.array([[170, 170, 170, 170], [430, 430, 430, 430], [300, 300, 300, 300]])
|
||||
)
|
||||
|
||||
X_nan = sparse_constructor(
|
||||
np.array(
|
||||
[
|
||||
[170, np.nan, 170, 170],
|
||||
[np.nan, 170, 430, 430],
|
||||
[430, 430, np.nan, 300],
|
||||
[300, 300, 300, np.nan],
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
# we avoid creating specific data for axis 0 and 1: translating the data is
|
||||
# enough.
|
||||
if axis:
|
||||
X = X.T
|
||||
X_nan = X_nan.T
|
||||
|
||||
# take a copy of the old statistics since they are modified in place.
|
||||
X_means, X_vars, X_sample_count = incr_mean_variance_axis(
|
||||
X,
|
||||
axis=axis,
|
||||
last_mean=old_means.copy(),
|
||||
last_var=old_variances.copy(),
|
||||
last_n=old_sample_count.copy(),
|
||||
)
|
||||
X_nan_means, X_nan_vars, X_nan_sample_count = incr_mean_variance_axis(
|
||||
X_nan,
|
||||
axis=axis,
|
||||
last_mean=old_means.copy(),
|
||||
last_var=old_variances.copy(),
|
||||
last_n=old_sample_count.copy(),
|
||||
)
|
||||
|
||||
assert_allclose(X_nan_means, X_means)
|
||||
assert_allclose(X_nan_vars, X_vars)
|
||||
assert_allclose(X_nan_sample_count, X_sample_count)
|
||||
|
||||
|
||||
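# A hedged sketch of the invariant behind the NaN-handling test above:
# ignoring NaNs leaves exactly the statistics of the observed entries.
# Pure numpy, with made-up values.
def _sketch_nan_ignoring_mean():
    col = np.array([170.0, np.nan, 430.0, 300.0])
    assert np.isclose(np.nanmean(col), np.mean([170.0, 430.0, 300.0]))

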
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
def test_mean_variance_illegal_axis(csr_container):
|
||||
X, _ = make_classification(5, 4, random_state=0)
|
||||
# Sparsify the array a little bit
|
||||
X[0, 0] = 0
|
||||
X[2, 1] = 0
|
||||
X[4, 3] = 0
|
||||
X_csr = csr_container(X)
|
||||
with pytest.raises(ValueError):
|
||||
mean_variance_axis(X_csr, axis=-3)
|
||||
with pytest.raises(ValueError):
|
||||
mean_variance_axis(X_csr, axis=2)
|
||||
with pytest.raises(ValueError):
|
||||
mean_variance_axis(X_csr, axis=-1)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
incr_mean_variance_axis(
|
||||
X_csr, axis=-3, last_mean=None, last_var=None, last_n=None
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
incr_mean_variance_axis(
|
||||
X_csr, axis=2, last_mean=None, last_var=None, last_n=None
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
incr_mean_variance_axis(
|
||||
X_csr, axis=-1, last_mean=None, last_var=None, last_n=None
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
|
||||
def test_densify_rows(csr_container):
|
||||
for dtype in (np.float32, np.float64):
|
||||
X = csr_container(
|
||||
[[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=dtype
|
||||
)
|
||||
X_rows = np.array([0, 2, 3], dtype=np.intp)
|
||||
out = np.ones((6, X.shape[1]), dtype=dtype)
|
||||
out_rows = np.array([1, 3, 4], dtype=np.intp)
|
||||
|
||||
expect = np.ones_like(out)
|
||||
expect[out_rows] = X[X_rows, :].toarray()
|
||||
|
||||
assign_rows_csr(X, X_rows, out_rows, out)
|
||||
assert_array_equal(out, expect)
|
||||
|
||||
|
||||
def test_inplace_column_scale():
    rng = np.random.RandomState(0)
    X = sp.rand(100, 200, 0.05)
    Xr = X.tocsr()
    Xc = X.tocsc()
    XA = X.toarray()
    scale = rng.rand(200)
    XA *= scale

    inplace_column_scale(Xc, scale)
    inplace_column_scale(Xr, scale)
    assert_array_almost_equal(Xr.toarray(), Xc.toarray())
    assert_array_almost_equal(XA, Xc.toarray())
    assert_array_almost_equal(XA, Xr.toarray())
    with pytest.raises(TypeError):
        inplace_column_scale(X.tolil(), scale)

    X = X.astype(np.float32)
    scale = scale.astype(np.float32)
    Xr = X.tocsr()
    Xc = X.tocsc()
    XA = X.toarray()
    XA *= scale
    inplace_column_scale(Xc, scale)
    inplace_column_scale(Xr, scale)
    assert_array_almost_equal(Xr.toarray(), Xc.toarray())
    assert_array_almost_equal(XA, Xc.toarray())
    assert_array_almost_equal(XA, Xr.toarray())
    with pytest.raises(TypeError):
        inplace_column_scale(X.tolil(), scale)


def test_inplace_row_scale():
    rng = np.random.RandomState(0)
    X = sp.rand(100, 200, 0.05)
    Xr = X.tocsr()
    Xc = X.tocsc()
    XA = X.toarray()
    scale = rng.rand(100)
    XA *= scale.reshape(-1, 1)

    inplace_row_scale(Xc, scale)
    inplace_row_scale(Xr, scale)
    assert_array_almost_equal(Xr.toarray(), Xc.toarray())
    assert_array_almost_equal(XA, Xc.toarray())
    assert_array_almost_equal(XA, Xr.toarray())
    with pytest.raises(TypeError):
        inplace_row_scale(X.tolil(), scale)

    X = X.astype(np.float32)
    scale = scale.astype(np.float32)
    Xr = X.tocsr()
    Xc = X.tocsc()
    XA = X.toarray()
    XA *= scale.reshape(-1, 1)
    inplace_row_scale(Xc, scale)
    inplace_row_scale(Xr, scale)
    assert_array_almost_equal(Xr.toarray(), Xc.toarray())
    assert_array_almost_equal(XA, Xc.toarray())
    assert_array_almost_equal(XA, Xr.toarray())
    with pytest.raises(TypeError):
        inplace_row_scale(X.tolil(), scale)


@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_inplace_swap_row(csc_container, csr_container):
    X = np.array(
        [[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64
    )
    X_csr = csr_container(X)
    X_csc = csc_container(X)

    swap = linalg.get_blas_funcs(("swap",), (X,))
    swap = swap[0]
    X[0], X[-1] = swap(X[0], X[-1])
    inplace_swap_row(X_csr, 0, -1)
    inplace_swap_row(X_csc, 0, -1)
    assert_array_equal(X_csr.toarray(), X_csc.toarray())
    assert_array_equal(X, X_csc.toarray())
    assert_array_equal(X, X_csr.toarray())

    X[2], X[3] = swap(X[2], X[3])
    inplace_swap_row(X_csr, 2, 3)
    inplace_swap_row(X_csc, 2, 3)
    assert_array_equal(X_csr.toarray(), X_csc.toarray())
    assert_array_equal(X, X_csc.toarray())
    assert_array_equal(X, X_csr.toarray())
    with pytest.raises(TypeError):
        inplace_swap_row(X_csr.tolil())

    X = np.array(
        [[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float32
    )
    X_csr = csr_container(X)
    X_csc = csc_container(X)
    swap = linalg.get_blas_funcs(("swap",), (X,))
    swap = swap[0]
    X[0], X[-1] = swap(X[0], X[-1])
    inplace_swap_row(X_csr, 0, -1)
    inplace_swap_row(X_csc, 0, -1)
    assert_array_equal(X_csr.toarray(), X_csc.toarray())
    assert_array_equal(X, X_csc.toarray())
    assert_array_equal(X, X_csr.toarray())
    X[2], X[3] = swap(X[2], X[3])
    inplace_swap_row(X_csr, 2, 3)
    inplace_swap_row(X_csc, 2, 3)
    assert_array_equal(X_csr.toarray(), X_csc.toarray())
    assert_array_equal(X, X_csc.toarray())
    assert_array_equal(X, X_csr.toarray())
    with pytest.raises(TypeError):
        inplace_swap_row(X_csr.tolil())


@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_inplace_swap_column(csc_container, csr_container):
    X = np.array(
        [[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64
    )
    X_csr = csr_container(X)
    X_csc = csc_container(X)

    swap = linalg.get_blas_funcs(("swap",), (X,))
    swap = swap[0]
    X[:, 0], X[:, -1] = swap(X[:, 0], X[:, -1])
    inplace_swap_column(X_csr, 0, -1)
    inplace_swap_column(X_csc, 0, -1)
    assert_array_equal(X_csr.toarray(), X_csc.toarray())
    assert_array_equal(X, X_csc.toarray())
    assert_array_equal(X, X_csr.toarray())

    X[:, 0], X[:, 1] = swap(X[:, 0], X[:, 1])
    inplace_swap_column(X_csr, 0, 1)
    inplace_swap_column(X_csc, 0, 1)
    assert_array_equal(X_csr.toarray(), X_csc.toarray())
    assert_array_equal(X, X_csc.toarray())
    assert_array_equal(X, X_csr.toarray())
    with pytest.raises(TypeError):
        inplace_swap_column(X_csr.tolil())

    X = np.array(
        [[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float32
    )
    X_csr = csr_container(X)
    X_csc = csc_container(X)
    swap = linalg.get_blas_funcs(("swap",), (X,))
    swap = swap[0]
    X[:, 0], X[:, -1] = swap(X[:, 0], X[:, -1])
    inplace_swap_column(X_csr, 0, -1)
    inplace_swap_column(X_csc, 0, -1)
    assert_array_equal(X_csr.toarray(), X_csc.toarray())
    assert_array_equal(X, X_csc.toarray())
    assert_array_equal(X, X_csr.toarray())
    X[:, 0], X[:, 1] = swap(X[:, 0], X[:, 1])
    inplace_swap_column(X_csr, 0, 1)
    inplace_swap_column(X_csc, 0, 1)
    assert_array_equal(X_csr.toarray(), X_csc.toarray())
    assert_array_equal(X, X_csc.toarray())
    assert_array_equal(X, X_csr.toarray())
    with pytest.raises(TypeError):
        inplace_swap_column(X_csr.tolil())


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("axis", [0, 1, None])
@pytest.mark.parametrize("sparse_format", CSC_CONTAINERS + CSR_CONTAINERS)
@pytest.mark.parametrize(
    "missing_values, min_func, max_func, ignore_nan",
    [(0, np.min, np.max, False), (np.nan, np.nanmin, np.nanmax, True)],
)
@pytest.mark.parametrize("large_indices", [True, False])
def test_min_max(
    dtype,
    axis,
    sparse_format,
    missing_values,
    min_func,
    max_func,
    ignore_nan,
    large_indices,
):
    X = np.array(
        [
            [0, 3, 0],
            [2, -1, missing_values],
            [0, 0, 0],
            [9, missing_values, 7],
            [4, 0, 5],
        ],
        dtype=dtype,
    )
    X_sparse = sparse_format(X)

    if large_indices:
        X_sparse.indices = X_sparse.indices.astype("int64")
        X_sparse.indptr = X_sparse.indptr.astype("int64")

    mins_sparse, maxs_sparse = min_max_axis(X_sparse, axis=axis, ignore_nan=ignore_nan)
    assert_array_equal(mins_sparse, min_func(X, axis=axis))
    assert_array_equal(maxs_sparse, max_func(X, axis=axis))


@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_min_max_axis_errors(csc_container, csr_container):
    X = np.array(
        [[0, 3, 0], [2, -1, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64
    )
    X_csr = csr_container(X)
    X_csc = csc_container(X)
    with pytest.raises(TypeError):
        min_max_axis(X_csr.tolil(), axis=0)
    with pytest.raises(ValueError):
        min_max_axis(X_csr, axis=2)
    with pytest.raises(ValueError):
        min_max_axis(X_csc, axis=-3)


@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_count_nonzero(csc_container, csr_container):
    X = np.array(
        [[0, 3, 0], [2, -1, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64
    )
    X_csr = csr_container(X)
    X_csc = csc_container(X)
    X_nonzero = X != 0
    sample_weight = [0.5, 0.2, 0.3, 0.1, 0.1]
    X_nonzero_weighted = X_nonzero * np.array(sample_weight)[:, None]

    for axis in [0, 1, -1, -2, None]:
        assert_array_almost_equal(
            count_nonzero(X_csr, axis=axis), X_nonzero.sum(axis=axis)
        )
        assert_array_almost_equal(
            count_nonzero(X_csr, axis=axis, sample_weight=sample_weight),
            X_nonzero_weighted.sum(axis=axis),
        )

    with pytest.raises(TypeError):
        count_nonzero(X_csc)
    with pytest.raises(ValueError):
        count_nonzero(X_csr, axis=2)

    assert count_nonzero(X_csr, axis=0).dtype == count_nonzero(X_csr, axis=1).dtype
    assert (
        count_nonzero(X_csr, axis=0, sample_weight=sample_weight).dtype
        == count_nonzero(X_csr, axis=1, sample_weight=sample_weight).dtype
    )

    # Check dtypes with large sparse matrices too
    # XXX: test fails on 32bit (Windows/Linux)
    try:
        X_csr.indices = X_csr.indices.astype(np.int64)
        X_csr.indptr = X_csr.indptr.astype(np.int64)
        assert count_nonzero(X_csr, axis=0).dtype == count_nonzero(X_csr, axis=1).dtype
        assert (
            count_nonzero(X_csr, axis=0, sample_weight=sample_weight).dtype
            == count_nonzero(X_csr, axis=1, sample_weight=sample_weight).dtype
        )
    except TypeError as e:
        assert "according to the rule 'safe'" in e.args[0] and np.intp().nbytes < 8, e


@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_csc_row_median(csc_container, csr_container):
    # Test that csc_median_axis_0 actually calculates the median.

    # Test that it gives the same output when X is dense.
    rng = np.random.RandomState(0)
    X = rng.rand(100, 50)
    dense_median = np.median(X, axis=0)
    csc = csc_container(X)
    sparse_median = csc_median_axis_0(csc)
    assert_array_equal(sparse_median, dense_median)

    # Test that it gives the same output when X is sparse
    X = rng.rand(51, 100)
    X[X < 0.7] = 0.0
    ind = rng.randint(0, 50, 10)
    X[ind] = -X[ind]
    csc = csc_container(X)
    dense_median = np.median(X, axis=0)
    sparse_median = csc_median_axis_0(csc)
    assert_array_equal(sparse_median, dense_median)

    # Test for toy data.
    X = [[0, -2], [-1, -1], [1, 0], [2, 1]]
    csc = csc_container(X)
    assert_array_equal(csc_median_axis_0(csc), np.array([0.5, -0.5]))
    X = [[0, -2], [-1, -5], [1, -3]]
    csc = csc_container(X)
    assert_array_equal(csc_median_axis_0(csc), np.array([0.0, -3]))

    # Test that it raises an Error for non-csc matrices.
    with pytest.raises(TypeError):
        csc_median_axis_0(csr_container(X))


@pytest.mark.parametrize(
    "inplace_csr_row_normalize",
    (inplace_csr_row_normalize_l1, inplace_csr_row_normalize_l2),
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_inplace_normalize(csr_container, inplace_csr_row_normalize):
    if csr_container is sp.csr_matrix:
        ones = np.ones((10, 1))
    else:
        ones = np.ones(10)
    rs = RandomState(10)

    for dtype in (np.float64, np.float32):
        X = rs.randn(10, 5).astype(dtype)
        X_csr = csr_container(X)
        for index_dtype in [np.int32, np.int64]:
            # csr_matrix will use int32 indices by default,
            # up-casting those to int64 when necessary
            if index_dtype is np.int64:
                X_csr.indptr = X_csr.indptr.astype(index_dtype)
                X_csr.indices = X_csr.indices.astype(index_dtype)
            assert X_csr.indices.dtype == index_dtype
            assert X_csr.indptr.dtype == index_dtype
            inplace_csr_row_normalize(X_csr)
            assert X_csr.dtype == dtype
            if inplace_csr_row_normalize is inplace_csr_row_normalize_l2:
                X_csr.data **= 2
            assert_array_almost_equal(np.abs(X_csr).sum(axis=1), ones)
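
# For reference, a dense sketch of the row normalization checked above
# (illustrative only): divide each row by its L2 norm, leaving all-zero rows
# untouched; the L1 variant would divide by the sum of absolute values instead.
def _dense_l2_row_normalize(A):
    norms = np.linalg.norm(A, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # avoid dividing empty rows by zero
    return A / norms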


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_csr_row_norms(dtype):
    # checks that csr_row_norms returns the same output as
    # scipy.sparse.linalg.norm, and that the dtype is the same as X.dtype.
    X = sp.random(100, 10, format="csr", dtype=dtype, random_state=42)

    scipy_norms = sp.linalg.norm(X, axis=1) ** 2
    norms = csr_row_norms(X)

    assert norms.dtype == dtype
    rtol = 1e-6 if dtype == np.float32 else 1e-7
    assert_allclose(norms, scipy_norms, rtol=rtol)
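
# An equivalent pure-NumPy computation over the raw CSR structure, shown for
# illustration; the cumulative-sum trick keeps empty rows correct.
def _squared_row_norms_reference(X_csr):
    csum = np.concatenate(([0.0], np.cumsum(X_csr.data**2)))
    return csum[X_csr.indptr[1:]] - csum[X_csr.indptr[:-1]]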


@pytest.fixture(scope="module", params=CSR_CONTAINERS + CSC_CONTAINERS)
def centered_matrices(request):
    """Return an equivalent tuple[sp.linalg.LinearOperator, np.ndarray]."""
    sparse_container = request.param

    random_state = np.random.default_rng(42)

    X_sparse = sparse_container(
        sp.random(500, 100, density=0.1, format="csr", random_state=random_state)
    )
    X_dense = X_sparse.toarray()
    mu = np.asarray(X_sparse.mean(axis=0)).ravel()

    X_sparse_centered = _implicit_column_offset(X_sparse, mu)
    X_dense_centered = X_dense - mu

    return X_sparse_centered, X_dense_centered


def test_implicit_center_matmat(global_random_seed, centered_matrices):
    X_sparse_centered, X_dense_centered = centered_matrices
    rng = np.random.default_rng(global_random_seed)
    Y = rng.standard_normal((X_dense_centered.shape[1], 50))
    assert_allclose(X_dense_centered @ Y, X_sparse_centered.matmat(Y))
    assert_allclose(X_dense_centered @ Y, X_sparse_centered @ Y)


def test_implicit_center_matvec(global_random_seed, centered_matrices):
    X_sparse_centered, X_dense_centered = centered_matrices
    rng = np.random.default_rng(global_random_seed)
    y = rng.standard_normal(X_dense_centered.shape[1])
    assert_allclose(X_dense_centered @ y, X_sparse_centered.matvec(y))
    assert_allclose(X_dense_centered @ y, X_sparse_centered @ y)


def test_implicit_center_rmatmat(global_random_seed, centered_matrices):
    X_sparse_centered, X_dense_centered = centered_matrices
    rng = np.random.default_rng(global_random_seed)
    Y = rng.standard_normal((X_dense_centered.shape[0], 50))
    assert_allclose(X_dense_centered.T @ Y, X_sparse_centered.rmatmat(Y))
    assert_allclose(X_dense_centered.T @ Y, X_sparse_centered.T @ Y)


def test_implicit_center_rmatvec(global_random_seed, centered_matrices):
    X_sparse_centered, X_dense_centered = centered_matrices
    rng = np.random.default_rng(global_random_seed)
    y = rng.standard_normal(X_dense_centered.shape[0])
    assert_allclose(X_dense_centered.T @ y, X_sparse_centered.rmatvec(y))
    assert_allclose(X_dense_centered.T @ y, X_sparse_centered.T @ y)
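
# The identity behind the implicit centering checked above, for illustration:
# with a column offset mu, (X - 1 mu^T) @ v = X @ v - (mu @ v) * 1, so the
# centering never densifies X. A minimal sketch with scipy's LinearOperator
# (the helper name below is made up):
def _implicit_offset_operator(X, mu):
    from scipy.sparse.linalg import LinearOperator

    ones = np.ones(X.shape[0])
    return LinearOperator(
        shape=X.shape,
        matvec=lambda v: X @ v - (mu @ v) * ones,
        rmatvec=lambda u: X.T @ u - mu * u.sum(),
        dtype=X.dtype,
    )
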
@@ -0,0 +1,98 @@
import numpy as np
from numpy.testing import assert_allclose
from pytest import approx

from sklearn.utils.stats import _weighted_percentile


def test_weighted_percentile():
    y = np.empty(102, dtype=np.float64)
    y[:50] = 0
    y[-51:] = 2
    y[-1] = 100000
    y[50] = 1
    sw = np.ones(102, dtype=np.float64)
    sw[-1] = 0.0
    score = _weighted_percentile(y, sw, 50)
    assert approx(score) == 1


def test_weighted_percentile_equal():
    y = np.empty(102, dtype=np.float64)
    y.fill(0.0)
    sw = np.ones(102, dtype=np.float64)
    sw[-1] = 0.0
    score = _weighted_percentile(y, sw, 50)
    assert score == 0


def test_weighted_percentile_zero_weight():
    y = np.empty(102, dtype=np.float64)
    y.fill(1.0)
    sw = np.ones(102, dtype=np.float64)
    sw.fill(0.0)
    score = _weighted_percentile(y, sw, 50)
    assert approx(score) == 1.0


def test_weighted_percentile_zero_weight_zero_percentile():
    y = np.array([0, 1, 2, 3, 4, 5])
    sw = np.array([0, 0, 1, 1, 1, 0])
    score = _weighted_percentile(y, sw, 0)
    assert approx(score) == 2

    score = _weighted_percentile(y, sw, 50)
    assert approx(score) == 3

    score = _weighted_percentile(y, sw, 100)
    assert approx(score) == 4


def test_weighted_median_equal_weights():
    # Check that weighted percentile=0.5 is the same as the median when all
    # weights are equal
    rng = np.random.RandomState(0)
    # Odd size as _weighted_percentile takes lower weighted percentile
    x = rng.randint(10, size=11)
    weights = np.ones(x.shape)

    median = np.median(x)
    w_median = _weighted_percentile(x, weights)
    assert median == approx(w_median)


def test_weighted_median_integer_weights():
    # Check that weighted percentile=0.5 is the same as the median when the
    # data is manually repeated according to integer weights
    rng = np.random.RandomState(0)
    x = rng.randint(20, size=10)
    weights = rng.choice(5, size=10)
    x_manual = np.repeat(x, weights)

    median = np.median(x_manual)
    w_median = _weighted_percentile(x, weights)

    assert median == approx(w_median)


def test_weighted_percentile_2d():
    # Check the case where the array is 2D and sample_weight is 1D
    rng = np.random.RandomState(0)
    x1 = rng.randint(10, size=10)
    w1 = rng.choice(5, size=10)

    x2 = rng.randint(20, size=10)
    x_2d = np.vstack((x1, x2)).T

    w_median = _weighted_percentile(x_2d, w1)
    p_axis_0 = [_weighted_percentile(x_2d[:, i], w1) for i in range(x_2d.shape[1])]
    assert_allclose(w_median, p_axis_0)

    # Check when both the array and sample_weight are 2D
    w2 = rng.choice(5, size=10)
    w_2d = np.vstack((w1, w2)).T

    w_median = _weighted_percentile(x_2d, w_2d)
    p_axis_0 = [
        _weighted_percentile(x_2d[:, i], w_2d[:, i]) for i in range(x_2d.shape[1])
    ]
    assert_allclose(w_median, p_axis_0)
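
# A compact reference for the "lower weighted percentile" convention these
# tests pin down, for the 1D case; this sketch assumes a strictly positive
# total weight (the all-zero-weight test above needs extra index clipping
# that is omitted here).
def _lower_weighted_percentile_1d(a, w, percentile=50):
    order = np.argsort(a)
    cum_w = np.cumsum(w[order])
    target = percentile / 100 * cum_w[-1]
    if target == 0:
        # skip leading zero-weight observations (the percentile=0 case above)
        target = np.nextafter(target, 1)
    return a[order[np.searchsorted(cum_w, target)]]
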
@@ -0,0 +1,47 @@
import pytest

from sklearn.base import BaseEstimator
from sklearn.utils._tags import (
    _DEFAULT_TAGS,
    _safe_tags,
)


class NoTagsEstimator:
    pass


class MoreTagsEstimator:
    def _more_tags(self):
        return {"allow_nan": True}


@pytest.mark.parametrize(
    "estimator, err_msg",
    [
        (BaseEstimator(), "The key xxx is not defined in _get_tags"),
        (NoTagsEstimator(), "The key xxx is not defined in _DEFAULT_TAGS"),
    ],
)
def test_safe_tags_error(estimator, err_msg):
    # Check that _safe_tags raises an error in the ambiguous case.
    with pytest.raises(ValueError, match=err_msg):
        _safe_tags(estimator, key="xxx")


@pytest.mark.parametrize(
    "estimator, key, expected_results",
    [
        (NoTagsEstimator(), None, _DEFAULT_TAGS),
        (NoTagsEstimator(), "allow_nan", _DEFAULT_TAGS["allow_nan"]),
        (MoreTagsEstimator(), None, {**_DEFAULT_TAGS, **{"allow_nan": True}}),
        (MoreTagsEstimator(), "allow_nan", True),
        (BaseEstimator(), None, _DEFAULT_TAGS),
        (BaseEstimator(), "allow_nan", _DEFAULT_TAGS["allow_nan"]),
    ],
)
def test_safe_tags_no_get_tags(estimator, key, expected_results):
    # check the behaviour of _safe_tags when an estimator does not implement
    # _get_tags
    assert _safe_tags(estimator, key=key) == expected_results
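
# Roughly, the fallback encoded by the cases above (an illustrative sketch,
# not the actual implementation): prefer estimator._get_tags() when defined,
# otherwise fall back to _DEFAULT_TAGS merged with any _more_tags().
def _safe_tags_sketch(estimator, key=None):
    if hasattr(estimator, "_get_tags"):
        tags = estimator._get_tags()
    else:
        more_tags = getattr(estimator, "_more_tags", dict)()
        tags = {**_DEFAULT_TAGS, **more_tags}
    return tags if key is None else tags[key]
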
@@ -0,0 +1,923 @@
import atexit
import os
import unittest
import warnings

import numpy as np
import pytest
from scipy import sparse

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils._testing import (
    TempMemmap,
    _convert_container,
    _delete_folder,
    _get_warnings_filters_info_list,
    assert_allclose,
    assert_allclose_dense_sparse,
    assert_no_warnings,
    assert_raise_message,
    assert_raises,
    assert_raises_regex,
    assert_run_python_script_without_output,
    check_docstring_parameters,
    create_memmap_backed_data,
    ignore_warnings,
    raises,
    set_random_state,
    turn_warnings_into_errors,
)
from sklearn.utils.deprecation import deprecated
from sklearn.utils.fixes import (
    _IS_WASM,
    CSC_CONTAINERS,
    CSR_CONTAINERS,
    parse_version,
    sp_version,
)
from sklearn.utils.metaestimators import available_if


def test_set_random_state():
    lda = LinearDiscriminantAnalysis()
    tree = DecisionTreeClassifier()
    # Linear Discriminant Analysis doesn't have random state: smoke test
    set_random_state(lda, 3)
    set_random_state(tree, 3)
    assert tree.random_state == 3


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_assert_allclose_dense_sparse(csr_container):
    x = np.arange(9).reshape(3, 3)
    msg = "Not equal to tolerance "
    y = csr_container(x)
    for X in [x, y]:
        # basic compare
        with pytest.raises(AssertionError, match=msg):
            assert_allclose_dense_sparse(X, X * 2)
        assert_allclose_dense_sparse(X, X)

    with pytest.raises(ValueError, match="Can only compare two sparse"):
        assert_allclose_dense_sparse(x, y)

    A = sparse.diags(np.ones(5), offsets=0).tocsr()
    B = csr_container(np.ones((1, 5)))
    with pytest.raises(AssertionError, match="Arrays are not equal"):
        assert_allclose_dense_sparse(B, A)


def test_assert_raises_msg():
    with assert_raises_regex(AssertionError, "Hello world"):
        with assert_raises(ValueError, msg="Hello world"):
            pass


def test_assert_raise_message():
    def _raise_ValueError(message):
        raise ValueError(message)

    def _no_raise():
        pass

    assert_raise_message(ValueError, "test", _raise_ValueError, "test")

    assert_raises(
        AssertionError,
        assert_raise_message,
        ValueError,
        "something else",
        _raise_ValueError,
        "test",
    )

    assert_raises(
        ValueError,
        assert_raise_message,
        TypeError,
        "something else",
        _raise_ValueError,
        "test",
    )

    assert_raises(AssertionError, assert_raise_message, ValueError, "test", _no_raise)

    # multiple exceptions in a tuple
    assert_raises(
        AssertionError,
        assert_raise_message,
        (ValueError, AttributeError),
        "test",
        _no_raise,
    )


def test_ignore_warning():
    # This checks that the ignore_warnings decorator and context manager are
    # working as expected
    def _warning_function():
        warnings.warn("deprecation warning", DeprecationWarning)

    def _multiple_warning_function():
        warnings.warn("deprecation warning", DeprecationWarning)
        warnings.warn("deprecation warning")

    # Check the function directly
    assert_no_warnings(ignore_warnings(_warning_function))
    assert_no_warnings(ignore_warnings(_warning_function, category=DeprecationWarning))
    with pytest.warns(DeprecationWarning):
        ignore_warnings(_warning_function, category=UserWarning)()

    with pytest.warns() as record:
        ignore_warnings(_multiple_warning_function, category=FutureWarning)()
    assert len(record) == 2
    assert isinstance(record[0].message, DeprecationWarning)
    assert isinstance(record[1].message, UserWarning)

    with pytest.warns() as record:
        ignore_warnings(_multiple_warning_function, category=UserWarning)()
    assert len(record) == 1
    assert isinstance(record[0].message, DeprecationWarning)

    assert_no_warnings(
        ignore_warnings(_warning_function, category=(DeprecationWarning, UserWarning))
    )

    # Check the decorator
    @ignore_warnings
    def decorator_no_warning():
        _warning_function()
        _multiple_warning_function()

    @ignore_warnings(category=(DeprecationWarning, UserWarning))
    def decorator_no_warning_multiple():
        _multiple_warning_function()

    @ignore_warnings(category=DeprecationWarning)
    def decorator_no_deprecation_warning():
        _warning_function()

    @ignore_warnings(category=UserWarning)
    def decorator_no_user_warning():
        _warning_function()

    @ignore_warnings(category=DeprecationWarning)
    def decorator_no_deprecation_multiple_warning():
        _multiple_warning_function()

    @ignore_warnings(category=UserWarning)
    def decorator_no_user_multiple_warning():
        _multiple_warning_function()

    assert_no_warnings(decorator_no_warning)
    assert_no_warnings(decorator_no_warning_multiple)
    assert_no_warnings(decorator_no_deprecation_warning)
    with pytest.warns(DeprecationWarning):
        decorator_no_user_warning()
    with pytest.warns(UserWarning):
        decorator_no_deprecation_multiple_warning()
    with pytest.warns(DeprecationWarning):
        decorator_no_user_multiple_warning()

    # Check the context manager
    def context_manager_no_warning():
        with ignore_warnings():
            _warning_function()

    def context_manager_no_warning_multiple():
        with ignore_warnings(category=(DeprecationWarning, UserWarning)):
            _multiple_warning_function()

    def context_manager_no_deprecation_warning():
        with ignore_warnings(category=DeprecationWarning):
            _warning_function()

    def context_manager_no_user_warning():
        with ignore_warnings(category=UserWarning):
            _warning_function()

    def context_manager_no_deprecation_multiple_warning():
        with ignore_warnings(category=DeprecationWarning):
            _multiple_warning_function()

    def context_manager_no_user_multiple_warning():
        with ignore_warnings(category=UserWarning):
            _multiple_warning_function()

    assert_no_warnings(context_manager_no_warning)
    assert_no_warnings(context_manager_no_warning_multiple)
    assert_no_warnings(context_manager_no_deprecation_warning)
    with pytest.warns(DeprecationWarning):
        context_manager_no_user_warning()
    with pytest.warns(UserWarning):
        context_manager_no_deprecation_multiple_warning()
    with pytest.warns(DeprecationWarning):
        context_manager_no_user_multiple_warning()

    # Check that passing a warning class as the first positional argument
    # raises an error
    warning_class = UserWarning
    match = "'obj' should be a callable.+you should use 'category=UserWarning'"

    with pytest.raises(ValueError, match=match):
        silence_warnings_func = ignore_warnings(warning_class)(_warning_function)
        silence_warnings_func()

    with pytest.raises(ValueError, match=match):

        @ignore_warnings(warning_class)
        def test():
            pass
class TestWarns(unittest.TestCase):
    def test_warn(self):
        def f():
            warnings.warn("yo")
            return 3

        with pytest.raises(AssertionError):
            assert_no_warnings(f)
        assert assert_no_warnings(lambda x: x, 1) == 1


# Tests for docstrings:


def f_ok(a, b):
    """Function f

    Parameters
    ----------
    a : int
        Parameter a
    b : float
        Parameter b

    Returns
    -------
    c : list
        Parameter c
    """
    c = a + b
    return c


def f_bad_sections(a, b):
    """Function f

    Parameters
    ----------
    a : int
        Parameter a
    b : float
        Parameter b

    Results
    -------
    c : list
        Parameter c
    """
    c = a + b
    return c


def f_bad_order(b, a):
    """Function f

    Parameters
    ----------
    a : int
        Parameter a
    b : float
        Parameter b

    Returns
    -------
    c : list
        Parameter c
    """
    c = a + b
    return c


def f_too_many_param_docstring(a, b):
    """Function f

    Parameters
    ----------
    a : int
        Parameter a
    b : int
        Parameter b
    c : int
        Parameter c

    Returns
    -------
    d : list
        Parameter c
    """
    d = a + b
    return d


def f_missing(a, b):
    """Function f

    Parameters
    ----------
    a : int
        Parameter a

    Returns
    -------
    c : list
        Parameter c
    """
    c = a + b
    return c


def f_check_param_definition(a, b, c, d, e):
    """Function f

    Parameters
    ----------
    a: int
        Parameter a
    b:
        Parameter b
    c :
        This is parsed correctly in numpydoc 1.2
    d:int
        Parameter d
    e
        No typespec is allowed without colon
    """
    return a + b + c + d


class Klass:
    def f_missing(self, X, y):
        pass

    def f_bad_sections(self, X, y):
        """Function f

        Parameter
        ---------
        a : int
            Parameter a
        b : float
            Parameter b

        Results
        -------
        c : list
            Parameter c
        """
        pass


class MockEst:
    def __init__(self):
        """MockEstimator"""

    def fit(self, X, y):
        return X

    def predict(self, X):
        return X

    def predict_proba(self, X):
        return X

    def score(self, X):
        return 1.0


class MockMetaEstimator:
    def __init__(self, delegate):
        """MetaEstimator to check if doctest on delegated methods work.

        Parameters
        ---------
        delegate : estimator
            Delegated estimator.
        """
        self.delegate = delegate

    @available_if(lambda self: hasattr(self.delegate, "predict"))
    def predict(self, X):
        """This is available only if delegate has predict.

        Parameters
        ----------
        y : ndarray
            Parameter y
        """
        return self.delegate.predict(X)

    @available_if(lambda self: hasattr(self.delegate, "score"))
    @deprecated("Testing a deprecated delegated method")
    def score(self, X):
        """This is available only if delegate has score.

        Parameters
        ---------
        y : ndarray
            Parameter y
        """

    @available_if(lambda self: hasattr(self.delegate, "predict_proba"))
    def predict_proba(self, X):
        """This is available only if delegate has predict_proba.

        Parameters
        ---------
        X : ndarray
            Parameter X
        """
        return X

    @deprecated("Testing deprecated function with wrong params")
    def fit(self, X, y):
        """Incorrect docstring but should not be tested"""


def test_check_docstring_parameters():
    pytest.importorskip(
        "numpydoc",
        reason="numpydoc is required to test the docstrings",
        minversion="1.2.0",
    )

    incorrect = check_docstring_parameters(f_ok)
    assert incorrect == []
    incorrect = check_docstring_parameters(f_ok, ignore=["b"])
    assert incorrect == []
    incorrect = check_docstring_parameters(f_missing, ignore=["b"])
    assert incorrect == []
    with pytest.raises(RuntimeError, match="Unknown section Results"):
        check_docstring_parameters(f_bad_sections)
    with pytest.raises(RuntimeError, match="Unknown section Parameter"):
        check_docstring_parameters(Klass.f_bad_sections)

    incorrect = check_docstring_parameters(f_check_param_definition)
    mock_meta = MockMetaEstimator(delegate=MockEst())
    mock_meta_name = mock_meta.__class__.__name__
    assert incorrect == [
        (
            "sklearn.utils.tests.test_testing.f_check_param_definition There "
            "was no space between the param name and colon ('a: int')"
        ),
        (
            "sklearn.utils.tests.test_testing.f_check_param_definition There "
            "was no space between the param name and colon ('b:')"
        ),
        (
            "sklearn.utils.tests.test_testing.f_check_param_definition There "
            "was no space between the param name and colon ('d:int')"
        ),
    ]

    messages = [
        [
            "In function: sklearn.utils.tests.test_testing.f_bad_order",
            (
                "There's a parameter name mismatch in function docstring w.r.t."
                " function signature, at index 0 diff: 'b' != 'a'"
            ),
            "Full diff:",
            "- ['b', 'a']",
            "+ ['a', 'b']",
        ],
        [
            "In function: "
            + "sklearn.utils.tests.test_testing.f_too_many_param_docstring",
            (
                "Parameters in function docstring have more items w.r.t. function"
                " signature, first extra item: c"
            ),
            "Full diff:",
            "- ['a', 'b']",
            "+ ['a', 'b', 'c']",
            "? +++++",
        ],
        [
            "In function: sklearn.utils.tests.test_testing.f_missing",
            (
                "Parameters in function docstring have less items w.r.t. function"
                " signature, first missing item: b"
            ),
            "Full diff:",
            "- ['a', 'b']",
            "+ ['a']",
        ],
        [
            "In function: sklearn.utils.tests.test_testing.Klass.f_missing",
            (
                "Parameters in function docstring have less items w.r.t. function"
                " signature, first missing item: X"
            ),
            "Full diff:",
            "- ['X', 'y']",
            "+ []",
        ],
        [
            "In function: "
            + f"sklearn.utils.tests.test_testing.{mock_meta_name}.predict",
            (
                "There's a parameter name mismatch in function docstring w.r.t."
                " function signature, at index 0 diff: 'X' != 'y'"
            ),
            "Full diff:",
            "- ['X']",
            "? ^",
            "+ ['y']",
            "? ^",
        ],
        [
            "In function: "
            + f"sklearn.utils.tests.test_testing.{mock_meta_name}."
            + "predict_proba",
            "potentially wrong underline length... ",
            "Parameters ",
            "--------- in ",
        ],
        [
            "In function: "
            + f"sklearn.utils.tests.test_testing.{mock_meta_name}.score",
            "potentially wrong underline length... ",
            "Parameters ",
            "--------- in ",
        ],
        [
            "In function: " + f"sklearn.utils.tests.test_testing.{mock_meta_name}.fit",
            (
                "Parameters in function docstring have less items w.r.t. function"
                " signature, first missing item: X"
            ),
            "Full diff:",
            "- ['X', 'y']",
            "+ []",
        ],
    ]

    for msg, f in zip(
        messages,
        [
            f_bad_order,
            f_too_many_param_docstring,
            f_missing,
            Klass.f_missing,
            mock_meta.predict,
            mock_meta.predict_proba,
            mock_meta.score,
            mock_meta.fit,
        ],
    ):
        incorrect = check_docstring_parameters(f)
        assert msg == incorrect, '\n"%s"\n not in \n"%s"' % (msg, incorrect)


class RegistrationCounter:
    def __init__(self):
        self.nb_calls = 0

    def __call__(self, to_register_func):
        self.nb_calls += 1
        assert to_register_func.func is _delete_folder


def check_memmap(input_array, mmap_data, mmap_mode="r"):
    assert isinstance(mmap_data, np.memmap)
    writeable = mmap_mode != "r"
    assert mmap_data.flags.writeable is writeable
    np.testing.assert_array_equal(input_array, mmap_data)


def test_tempmemmap(monkeypatch):
    registration_counter = RegistrationCounter()
    monkeypatch.setattr(atexit, "register", registration_counter)

    input_array = np.ones(3)
    with TempMemmap(input_array) as data:
        check_memmap(input_array, data)
        temp_folder = os.path.dirname(data.filename)
    if os.name != "nt":
        assert not os.path.exists(temp_folder)
    assert registration_counter.nb_calls == 1

    mmap_mode = "r+"
    with TempMemmap(input_array, mmap_mode=mmap_mode) as data:
        check_memmap(input_array, data, mmap_mode=mmap_mode)
        temp_folder = os.path.dirname(data.filename)
    if os.name != "nt":
        assert not os.path.exists(temp_folder)
    assert registration_counter.nb_calls == 2


@pytest.mark.xfail(_IS_WASM, reason="memmap not fully supported")
def test_create_memmap_backed_data(monkeypatch):
    registration_counter = RegistrationCounter()
    monkeypatch.setattr(atexit, "register", registration_counter)

    input_array = np.ones(3)
    data = create_memmap_backed_data(input_array)
    check_memmap(input_array, data)
    assert registration_counter.nb_calls == 1

    data, folder = create_memmap_backed_data(input_array, return_folder=True)
    check_memmap(input_array, data)
    assert folder == os.path.dirname(data.filename)
    assert registration_counter.nb_calls == 2

    mmap_mode = "r+"
    data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode)
    check_memmap(input_array, data, mmap_mode)
    assert registration_counter.nb_calls == 3

    input_list = [input_array, input_array + 1, input_array + 2]
    mmap_data_list = create_memmap_backed_data(input_list)
    for input_array, data in zip(input_list, mmap_data_list):
        check_memmap(input_array, data)
    assert registration_counter.nb_calls == 4

    output_data, other = create_memmap_backed_data([input_array, "not-an-array"])
    check_memmap(input_array, output_data)
    assert other == "not-an-array"


@pytest.mark.parametrize(
    "constructor_name, container_type",
    [
        ("list", list),
        ("tuple", tuple),
        ("array", np.ndarray),
        ("sparse", sparse.csr_matrix),
        # using `zip` will only keep the available sparse containers
        # depending on the installed SciPy version
        *zip(["sparse_csr", "sparse_csr_array"], CSR_CONTAINERS),
        *zip(["sparse_csc", "sparse_csc_array"], CSC_CONTAINERS),
        ("dataframe", lambda: pytest.importorskip("pandas").DataFrame),
        ("series", lambda: pytest.importorskip("pandas").Series),
        ("index", lambda: pytest.importorskip("pandas").Index),
        ("slice", slice),
    ],
)
@pytest.mark.parametrize(
    "dtype, superdtype",
    [
        (np.int32, np.integer),
        (np.int64, np.integer),
        (np.float32, np.floating),
        (np.float64, np.floating),
    ],
)
def test_convert_container(
    constructor_name,
    container_type,
    dtype,
    superdtype,
):
    """Check that we convert the container to the right type of array with the
    right data type."""
    if constructor_name in ("dataframe", "polars", "series", "polars_series", "index"):
        # delay the import of pandas/polars within the function to only skip this test
        # instead of the whole file
        container_type = container_type()
    container = [0, 1]

    container_converted = _convert_container(
        container,
        constructor_name,
        dtype=dtype,
    )
    assert isinstance(container_converted, container_type)

    if constructor_name in ("list", "tuple", "index"):
        # list and tuple will use Python class dtype: int, float
        # pandas index will always use high precision: np.int64 and np.float64
        assert np.issubdtype(type(container_converted[0]), superdtype)
    elif hasattr(container_converted, "dtype"):
        assert container_converted.dtype == dtype
    elif hasattr(container_converted, "dtypes"):
        assert container_converted.dtypes[0] == dtype


def test_convert_container_categories_pandas():
    pytest.importorskip("pandas")
    df = _convert_container(
        [["x"]], "dataframe", ["A"], categorical_feature_names=["A"]
    )
    assert df.dtypes.iloc[0] == "category"


def test_convert_container_categories_polars():
    pl = pytest.importorskip("polars")
    df = _convert_container([["x"]], "polars", ["A"], categorical_feature_names=["A"])
    assert df.schema["A"] == pl.Categorical()


def test_convert_container_categories_pyarrow():
    pa = pytest.importorskip("pyarrow")
    df = _convert_container([["x"]], "pyarrow", ["A"], categorical_feature_names=["A"])
    assert type(df.schema[0].type) is pa.DictionaryType


@pytest.mark.skipif(
    sp_version >= parse_version("1.8"),
    reason="sparse arrays are available as of scipy 1.8.0",
)
@pytest.mark.parametrize("constructor_name", ["sparse_csr_array", "sparse_csc_array"])
@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64])
def test_convert_container_raise_when_sparray_not_available(constructor_name, dtype):
    """Check that if we convert to a sparse array but sparse arrays are not
    supported (scipy<1.8.0), an explicit error is raised."""
    container = [0, 1]

    with pytest.raises(
        ValueError,
        match=f"only available with scipy>=1.8.0, got {sp_version}",
    ):
        _convert_container(container, constructor_name, dtype=dtype)


def test_raises():
    # Tests for the raises context manager

    # Proper type, no match
    with raises(TypeError):
        raise TypeError()

    # Proper type, proper match
    with raises(TypeError, match="how are you") as cm:
        raise TypeError("hello how are you")
    assert cm.raised_and_matched

    # Proper type, proper match with multiple patterns
    with raises(TypeError, match=["not this one", "how are you"]) as cm:
        raise TypeError("hello how are you")
    assert cm.raised_and_matched

    # bad type, no match
    with pytest.raises(ValueError, match="this will be raised"):
        with raises(TypeError) as cm:
            raise ValueError("this will be raised")
    assert not cm.raised_and_matched

    # bad type, no match, with an err_msg
    with pytest.raises(AssertionError, match="the failure message"):
        with raises(TypeError, err_msg="the failure message") as cm:
            raise ValueError()
    assert not cm.raised_and_matched

    # bad type, with match (is ignored anyway)
    with pytest.raises(ValueError, match="this will be raised"):
        with raises(TypeError, match="this is ignored") as cm:
            raise ValueError("this will be raised")
    assert not cm.raised_and_matched

    # proper type but bad match
    with pytest.raises(
        AssertionError, match="should contain one of the following patterns"
    ):
        with raises(TypeError, match="hello") as cm:
            raise TypeError("Bad message")
    assert not cm.raised_and_matched

    # proper type but bad match, with err_msg
    with pytest.raises(AssertionError, match="the failure message"):
        with raises(TypeError, match="hello", err_msg="the failure message") as cm:
            raise TypeError("Bad message")
    assert not cm.raised_and_matched

    # no raise with default may_pass=False
    with pytest.raises(AssertionError, match="Did not raise"):
        with raises(TypeError) as cm:
            pass
    assert not cm.raised_and_matched

    # no raise with may_pass=True
    with raises(TypeError, match="hello", may_pass=True) as cm:
        pass  # still OK
    assert not cm.raised_and_matched

    # Multiple exception types:
    with raises((TypeError, ValueError)):
        raise TypeError()
    with raises((TypeError, ValueError)):
        raise ValueError()
    with pytest.raises(AssertionError):
        with raises((TypeError, ValueError)):
            pass


def test_float32_aware_assert_allclose():
    # The relative tolerance for float32 inputs is 1e-4
    assert_allclose(np.array([1.0 + 2e-5], dtype=np.float32), 1.0)
    with pytest.raises(AssertionError):
        assert_allclose(np.array([1.0 + 2e-4], dtype=np.float32), 1.0)

    # The relative tolerance for other inputs is left to 1e-7 as in
    # the original numpy version.
    assert_allclose(np.array([1.0 + 2e-8], dtype=np.float64), 1.0)
    with pytest.raises(AssertionError):
        assert_allclose(np.array([1.0 + 2e-7], dtype=np.float64), 1.0)

    # atol is left to 0.0 by default, even for float32
    with pytest.raises(AssertionError):
        assert_allclose(np.array([1e-5], dtype=np.float32), 0.0)
    assert_allclose(np.array([1e-5], dtype=np.float32), 0.0, atol=2e-5)


@pytest.mark.xfail(_IS_WASM, reason="cannot start subprocess")
def test_assert_run_python_script_without_output():
    code = "x = 1"
    assert_run_python_script_without_output(code)

    code = "print('something to stdout')"
    with pytest.raises(AssertionError, match="Expected no output"):
        assert_run_python_script_without_output(code)

    code = "print('something to stdout')"
    with pytest.raises(
        AssertionError,
        match="output was not supposed to match.+got.+something to stdout",
    ):
        assert_run_python_script_without_output(code, pattern="to.+stdout")

    code = "\n".join(["import sys", "print('something to stderr', file=sys.stderr)"])
    with pytest.raises(
        AssertionError,
        match="output was not supposed to match.+got.+something to stderr",
    ):
        assert_run_python_script_without_output(code, pattern="to.+stderr")


@pytest.mark.parametrize(
    "constructor_name",
    [
        "sparse_csr",
        "sparse_csc",
        pytest.param(
            "sparse_csr_array",
            marks=pytest.mark.skipif(
                sp_version < parse_version("1.8"),
                reason="sparse arrays are available as of scipy 1.8.0",
            ),
        ),
        pytest.param(
            "sparse_csc_array",
            marks=pytest.mark.skipif(
                sp_version < parse_version("1.8"),
                reason="sparse arrays are available as of scipy 1.8.0",
            ),
        ),
    ],
)
def test_convert_container_sparse_to_sparse(constructor_name):
    """Non-regression test to check that we can still convert a sparse container
    from a given format to another format.
    """
    X_sparse = sparse.random(10, 10, density=0.1, format="csr")
    _convert_container(X_sparse, constructor_name)


def check_warnings_as_errors(warning_info, warnings_as_errors):
    if warning_info.action == "error" and warnings_as_errors:
        with pytest.raises(warning_info.category, match=warning_info.message):
            warnings.warn(
                message=warning_info.message,
                category=warning_info.category,
            )
    if warning_info.action == "ignore":
        with warnings.catch_warnings(record=True) as record:
            message = warning_info.message
            # Special treatment when regex is used
            if "Pyarrow" in message:
                message = "\nPyarrow will become a required dependency"

            warnings.warn(
                message=message,
                category=warning_info.category,
            )
            assert len(record) == 0 if warnings_as_errors else 1
            if record:
                assert str(record[0].message) == message
                assert record[0].category == warning_info.category


@pytest.mark.parametrize("warning_info", _get_warnings_filters_info_list())
def test_sklearn_warnings_as_errors(warning_info):
    warnings_as_errors = os.environ.get("SKLEARN_WARNINGS_AS_ERRORS", "0") != "0"
    check_warnings_as_errors(warning_info, warnings_as_errors=warnings_as_errors)


@pytest.mark.parametrize("warning_info", _get_warnings_filters_info_list())
def test_turn_warnings_into_errors(warning_info):
    with warnings.catch_warnings():
        turn_warnings_into_errors()
        check_warnings_as_errors(warning_info, warnings_as_errors=True)
@@ -0,0 +1,25 @@
import numpy as np
import pytest

from sklearn.utils._typedefs import testing_make_array_from_typed_val


@pytest.mark.parametrize(
    "type_t, value, expected_dtype",
    [
        ("float64_t", 1.0, np.float64),
        ("float32_t", 1.0, np.float32),
        ("intp_t", 1, np.intp),
        ("int8_t", 1, np.int8),
        ("int32_t", 1, np.int32),
        ("int64_t", 1, np.int64),
        ("uint8_t", 1, np.uint8),
        ("uint32_t", 1, np.uint32),
        ("uint64_t", 1, np.uint64),
    ],
)
def test_types(type_t, value, expected_dtype):
    """Check that the types defined in _typedefs correspond to the expected
    numpy dtypes.
    """
    assert testing_make_array_from_typed_val[type_t](value).dtype == expected_dtype
@@ -0,0 +1,65 @@
import string
import timeit

import pytest

from sklearn.utils._user_interface import _message_with_time, _print_elapsed_time


@pytest.mark.parametrize(
    ["source", "message", "is_long"],
    [
        ("ABC", string.ascii_lowercase, False),
        ("ABCDEF", string.ascii_lowercase, False),
        ("ABC", string.ascii_lowercase * 3, True),
        ("ABC" * 10, string.ascii_lowercase, True),
        ("ABC", string.ascii_lowercase + "\u1048", False),
    ],
)
@pytest.mark.parametrize(
    ["time", "time_str"],
    [
        (0.2, " 0.2s"),
        (20, " 20.0s"),
        (2000, "33.3min"),
        (20000, "333.3min"),
    ],
)
def test_message_with_time(source, message, is_long, time, time_str):
    out = _message_with_time(source, message, time)
    if is_long:
        assert len(out) > 70
    else:
        assert len(out) == 70

    assert out.startswith("[" + source + "] ")
    out = out[len(source) + 3 :]

    assert out.endswith(time_str)
    out = out[: -len(time_str)]
    assert out.endswith(", total=")
    out = out[: -len(", total=")]
    assert out.endswith(message)
    out = out[: -len(message)]
    assert out.endswith(" ")
    out = out[:-1]

    if is_long:
        assert not out
    else:
        assert list(set(out)) == ["."]


@pytest.mark.parametrize(
    ["message", "expected"],
    [
        ("hello", _message_with_time("ABC", "hello", 0.1) + "\n"),
        ("", _message_with_time("ABC", "", 0.1) + "\n"),
        (None, ""),
    ],
)
def test_print_elapsed_time(message, expected, capsys, monkeypatch):
    monkeypatch.setattr(timeit, "default_timer", lambda: 0)
    with _print_elapsed_time("ABC", message):
        monkeypatch.setattr(timeit, "default_timer", lambda: 0.1)
    assert capsys.readouterr().out == expected
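
# A minimal usage sketch of the helper exercised above: the message is printed
# together with the elapsed time when the block exits, and passing None keeps
# it silent (the workload below is a stand-in).
def _demo_print_elapsed_time():
    with _print_elapsed_time("ABC", "running the demo step"):
        sum(range(1000))  # prints "[ABC] ... running the demo step, total=...s"
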
@@ -0,0 +1,27 @@
import joblib
import pytest

from sklearn.utils import parallel_backend, register_parallel_backend, tosequence


# TODO(1.7): remove
def test_is_pypy_deprecated():
    with pytest.warns(FutureWarning, match="IS_PYPY is deprecated"):
        from sklearn.utils import IS_PYPY  # noqa


# TODO(1.7): remove
def test_tosequence_deprecated():
    with pytest.warns(FutureWarning, match="tosequence was deprecated in 1.5"):
        tosequence([1, 2, 3])


# TODO(1.7): remove
def test_parallel_backend_deprecated():
    with pytest.warns(FutureWarning, match="parallel_backend is deprecated"):
        parallel_backend("loky", None)

    with pytest.warns(FutureWarning, match="register_parallel_backend is deprecated"):
        register_parallel_backend("a_backend", None)

    del joblib.parallel.BACKENDS["a_backend"]
File diff suppressed because it is too large
@@ -0,0 +1,25 @@
import numpy as np
import pytest

from sklearn.utils._weight_vector import (
    WeightVector32,
    WeightVector64,
)


@pytest.mark.parametrize(
    "dtype, WeightVector",
    [
        (np.float32, WeightVector32),
        (np.float64, WeightVector64),
    ],
)
def test_type_invariance(dtype, WeightVector):
    """Check the `dtype` consistency of `WeightVector`."""
    weights = np.random.rand(100).astype(dtype)
    average_weights = np.random.rand(100).astype(dtype)

    weight_vector = WeightVector(weights, average_weights)

    assert np.asarray(weight_vector.w).dtype is np.dtype(dtype)
    assert np.asarray(weight_vector.aw).dtype is np.dtype(dtype)