library packages

2024-09-28 22:56:00 -07:00
parent 64d9b78b3a
commit 1973934e95
4893 changed files with 1184173 additions and 31 deletions

@@ -0,0 +1,16 @@
import pytest
from numpy.testing import assert_allclose
from sklearn.utils import check_random_state
from sklearn.utils._arpack import _init_arpack_v0
@pytest.mark.parametrize("seed", range(100))
def test_init_arpack_v0(seed):
    # check that the initialization is a sampling from a uniform distribution
    # where we can fix the random state
size = 1000
v0 = _init_arpack_v0(size, seed)
rng = check_random_state(seed)
assert_allclose(v0, rng.uniform(-1, 1, size=size))
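
For context, a minimal sketch of the helper under test, inferred from the assertion above (an illustrative re-implementation, not the actual source of sklearn.utils._arpack): it draws a reproducible ARPACK starting vector uniformly from [-1, 1).

from sklearn.utils import check_random_state

def init_arpack_v0_sketch(size, random_state):
    # Hypothetical mirror of _init_arpack_v0 for illustration: a seeded
    # uniform sample on [-1, 1), exactly what the test asserts.
    rng = check_random_state(random_state)
    return rng.uniform(-1, 1, size=size)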


@@ -0,0 +1,580 @@
import re
from functools import partial
import numpy
import pytest
from numpy.testing import assert_allclose
from sklearn._config import config_context
from sklearn.base import BaseEstimator
from sklearn.utils._array_api import (
_ArrayAPIWrapper,
_asarray_with_order,
_atol_for_type,
_average,
_convert_to_numpy,
_count_nonzero,
_estimator_with_converted_arrays,
_is_numpy_namespace,
_nanmax,
_nanmin,
_NumPyAPIWrapper,
_ravel,
device,
get_namespace,
get_namespace_and_device,
indexing_dtype,
supported_float_dtypes,
yield_namespace_device_dtype_combinations,
)
from sklearn.utils._testing import (
_array_api_for_tests,
skip_if_array_api_compat_not_configured,
)
from sklearn.utils.fixes import _IS_32BIT, CSR_CONTAINERS, np_version, parse_version
@pytest.mark.parametrize("X", [numpy.asarray([1, 2, 3]), [1, 2, 3]])
def test_get_namespace_ndarray_default(X):
"""Check that get_namespace returns NumPy wrapper"""
xp_out, is_array_api_compliant = get_namespace(X)
assert isinstance(xp_out, _NumPyAPIWrapper)
assert not is_array_api_compliant
def test_get_namespace_ndarray_creation_device():
"""Check expected behavior with device and creation functions."""
X = numpy.asarray([1, 2, 3])
xp_out, _ = get_namespace(X)
full_array = xp_out.full(10, fill_value=2.0, device="cpu")
assert_allclose(full_array, [2.0] * 10)
with pytest.raises(ValueError, match="Unsupported device"):
xp_out.zeros(10, device="cuda")
@skip_if_array_api_compat_not_configured
def test_get_namespace_ndarray_with_dispatch():
"""Test get_namespace on NumPy ndarrays."""
array_api_compat = pytest.importorskip("array_api_compat")
X_np = numpy.asarray([[1, 2, 3]])
with config_context(array_api_dispatch=True):
xp_out, is_array_api_compliant = get_namespace(X_np)
assert is_array_api_compliant
if np_version >= parse_version("2.0.0"):
# NumPy 2.0+ is an array API compliant library.
assert xp_out is numpy
else:
# Older NumPy versions require the compatibility layer.
assert xp_out is array_api_compat.numpy
@skip_if_array_api_compat_not_configured
def test_get_namespace_array_api():
"""Test get_namespace for ArrayAPI arrays."""
xp = pytest.importorskip("array_api_strict")
X_np = numpy.asarray([[1, 2, 3]])
X_xp = xp.asarray(X_np)
with config_context(array_api_dispatch=True):
xp_out, is_array_api_compliant = get_namespace(X_xp)
assert is_array_api_compliant
with pytest.raises(TypeError):
xp_out, is_array_api_compliant = get_namespace(X_xp, X_np)
class _AdjustableNameAPITestWrapper(_ArrayAPIWrapper):
"""API wrapper that has an adjustable name. Used for testing."""
def __init__(self, array_namespace, name):
super().__init__(array_namespace=array_namespace)
self.__name__ = name
def test_array_api_wrapper_astype():
"""Test _ArrayAPIWrapper for ArrayAPIs that is not NumPy."""
array_api_strict = pytest.importorskip("array_api_strict")
xp_ = _AdjustableNameAPITestWrapper(array_api_strict, "array_api_strict")
xp = _ArrayAPIWrapper(xp_)
X = xp.asarray(([[1, 2, 3], [3, 4, 5]]), dtype=xp.float64)
X_converted = xp.astype(X, xp.float32)
assert X_converted.dtype == xp.float32
X_converted = xp.asarray(X, dtype=xp.float32)
assert X_converted.dtype == xp.float32
@pytest.mark.parametrize("array_api", ["numpy", "array_api_strict"])
def test_asarray_with_order(array_api):
"""Test _asarray_with_order passes along order for NumPy arrays."""
xp = pytest.importorskip(array_api)
X = xp.asarray([1.2, 3.4, 5.1])
X_new = _asarray_with_order(X, order="F", xp=xp)
X_new_np = numpy.asarray(X_new)
assert X_new_np.flags["F_CONTIGUOUS"]
def test_asarray_with_order_ignored():
"""Test _asarray_with_order ignores order for Generic ArrayAPI."""
xp = pytest.importorskip("array_api_strict")
xp_ = _AdjustableNameAPITestWrapper(xp, "array_api_strict")
X = numpy.asarray([[1.2, 3.4, 5.1], [3.4, 5.5, 1.2]], order="C")
X = xp_.asarray(X)
X_new = _asarray_with_order(X, order="F", xp=xp_)
X_new_np = numpy.asarray(X_new)
assert X_new_np.flags["C_CONTIGUOUS"]
assert not X_new_np.flags["F_CONTIGUOUS"]
@pytest.mark.parametrize(
"array_namespace, device_, dtype_name", yield_namespace_device_dtype_combinations()
)
@pytest.mark.parametrize(
"weights, axis, normalize, expected",
[
# normalize = True
(None, None, True, 3.5),
(None, 0, True, [2.5, 3.5, 4.5]),
(None, 1, True, [2, 5]),
([True, False], 0, True, [1, 2, 3]), # boolean weights
([True, True, False], 1, True, [1.5, 4.5]), # boolean weights
([0.4, 0.1], 0, True, [1.6, 2.6, 3.6]),
([0.4, 0.2, 0.2], 1, True, [1.75, 4.75]),
([1, 2], 0, True, [3, 4, 5]),
([1, 1, 2], 1, True, [2.25, 5.25]),
([[1, 2, 3], [1, 2, 3]], 0, True, [2.5, 3.5, 4.5]),
([[1, 2, 1], [2, 2, 2]], 1, True, [2, 5]),
# normalize = False
(None, None, False, 21),
(None, 0, False, [5, 7, 9]),
(None, 1, False, [6, 15]),
([True, False], 0, False, [1, 2, 3]), # boolean weights
([True, True, False], 1, False, [3, 9]), # boolean weights
([0.4, 0.1], 0, False, [0.8, 1.3, 1.8]),
([0.4, 0.2, 0.2], 1, False, [1.4, 3.8]),
([1, 2], 0, False, [9, 12, 15]),
([1, 1, 2], 1, False, [9, 21]),
([[1, 2, 3], [1, 2, 3]], 0, False, [5, 14, 27]),
([[1, 2, 1], [2, 2, 2]], 1, False, [8, 30]),
],
)
def test_average(
array_namespace, device_, dtype_name, weights, axis, normalize, expected
):
xp = _array_api_for_tests(array_namespace, device_)
array_in = numpy.asarray([[1, 2, 3], [4, 5, 6]], dtype=dtype_name)
array_in = xp.asarray(array_in, device=device_)
if weights is not None:
weights = numpy.asarray(weights, dtype=dtype_name)
weights = xp.asarray(weights, device=device_)
with config_context(array_api_dispatch=True):
result = _average(array_in, axis=axis, weights=weights, normalize=normalize)
if np_version < parse_version("2.0.0") or np_version >= parse_version("2.1.0"):
# NumPy 2.0 has a problem with the device attribute of scalar arrays:
# https://github.com/numpy/numpy/issues/26850
assert device(array_in) == device(result)
result = _convert_to_numpy(result, xp)
assert_allclose(result, expected, atol=_atol_for_type(dtype_name))
@pytest.mark.parametrize(
"array_namespace, device, dtype_name",
yield_namespace_device_dtype_combinations(include_numpy_namespaces=False),
)
def test_average_raises_with_wrong_dtype(array_namespace, device, dtype_name):
xp = _array_api_for_tests(array_namespace, device)
array_in = numpy.asarray([2, 0], dtype=dtype_name) + 1j * numpy.asarray(
[4, 3], dtype=dtype_name
)
complex_type_name = array_in.dtype.name
if not hasattr(xp, complex_type_name):
# This is the case for cupy as of March 2024 for instance.
pytest.skip(f"{array_namespace} does not support {complex_type_name}")
array_in = xp.asarray(array_in, device=device)
err_msg = "Complex floating point values are not supported by average."
with (
config_context(array_api_dispatch=True),
pytest.raises(NotImplementedError, match=err_msg),
):
_average(array_in)
@pytest.mark.parametrize(
"array_namespace, device, dtype_name",
yield_namespace_device_dtype_combinations(include_numpy_namespaces=True),
)
@pytest.mark.parametrize(
"axis, weights, error, error_msg",
(
(
None,
[1, 2],
TypeError,
"Axis must be specified",
),
(
0,
[[1, 2]],
# NumPy 2 raises ValueError, NumPy 1 raises TypeError
(ValueError, TypeError),
"weights", # the message is different for NumPy 1 and 2...
),
(
0,
[1, 2, 3, 4],
ValueError,
"weights",
),
(0, [-1, 1], ZeroDivisionError, "Weights sum to zero, can't be normalized"),
),
)
def test_average_raises_with_invalid_parameters(
array_namespace, device, dtype_name, axis, weights, error, error_msg
):
xp = _array_api_for_tests(array_namespace, device)
array_in = numpy.asarray([[1, 2, 3], [4, 5, 6]], dtype=dtype_name)
array_in = xp.asarray(array_in, device=device)
weights = numpy.asarray(weights, dtype=dtype_name)
weights = xp.asarray(weights, device=device)
with config_context(array_api_dispatch=True), pytest.raises(error, match=error_msg):
_average(array_in, axis=axis, weights=weights)
def test_device_raises_if_no_input():
err_msg = re.escape(
"At least one input array expected after filtering with remove_none=True, "
"remove_types=[str]. Got none. Original types: []."
)
with pytest.raises(ValueError, match=err_msg):
device()
err_msg = re.escape(
"At least one input array expected after filtering with remove_none=True, "
"remove_types=[str]. Got none. Original types: [NoneType, str]."
)
with pytest.raises(ValueError, match=err_msg):
device(None, "name")
def test_device_inspection():
class Device:
def __init__(self, name):
self.name = name
def __eq__(self, device):
return self.name == device.name
def __hash__(self):
raise TypeError("Device object is not hashable")
def __str__(self):
return self.name
class Array:
def __init__(self, device_name):
self.device = Device(device_name)
    # Sanity check: ensure our Device mock class is non-hashable, to
    # accurately account for non-hashable device objects in some array
    # libraries, because of which the `device` inspection function shouldn't
    # make use of hash lookup tables (in particular, should not use `set`)
with pytest.raises(TypeError):
hash(Array("device").device)
# Test raise if on different devices
err_msg = "Input arrays use different devices: cpu, mygpu"
with pytest.raises(ValueError, match=err_msg):
device(Array("cpu"), Array("mygpu"))
# Test expected value is returned otherwise
array1 = Array("device")
array2 = Array("device")
assert array1.device == device(array1)
assert array1.device == device(array1, array2)
assert array1.device == device(array1, array1, array2)
# TODO: add cupy and cupy.array_api to the list of libraries once the
# the following upstream issue has been fixed:
# https://github.com/cupy/cupy/issues/8180
@skip_if_array_api_compat_not_configured
@pytest.mark.parametrize("library", ["numpy", "array_api_strict", "torch"])
@pytest.mark.parametrize(
"X,reduction,expected",
[
([1, 2, numpy.nan], _nanmin, 1),
([1, -2, -numpy.nan], _nanmin, -2),
([numpy.inf, numpy.inf], _nanmin, numpy.inf),
(
[[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
partial(_nanmin, axis=0),
[1.0, 2.0, 3.0],
),
(
[[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
partial(_nanmin, axis=1),
[1.0, numpy.nan, 4.0],
),
([1, 2, numpy.nan], _nanmax, 2),
        ([1, -2, -numpy.nan], _nanmax, 1),
([-numpy.inf, -numpy.inf], _nanmax, -numpy.inf),
(
[[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
partial(_nanmax, axis=0),
[4.0, 5.0, 6.0],
),
(
[[1, 2, 3], [numpy.nan, numpy.nan, numpy.nan], [4, 5, 6.0]],
partial(_nanmax, axis=1),
[3.0, numpy.nan, 6.0],
),
],
)
def test_nan_reductions(library, X, reduction, expected):
"""Check NaN reductions like _nanmin and _nanmax"""
xp = pytest.importorskip(library)
with config_context(array_api_dispatch=True):
result = reduction(xp.asarray(X))
result = _convert_to_numpy(result, xp)
assert_allclose(result, expected)
@pytest.mark.parametrize(
"namespace, _device, _dtype", yield_namespace_device_dtype_combinations()
)
def test_ravel(namespace, _device, _dtype):
xp = _array_api_for_tests(namespace, _device)
array = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]
array_xp = xp.asarray(array, device=_device)
with config_context(array_api_dispatch=True):
result = _ravel(array_xp)
result = _convert_to_numpy(result, xp)
expected = numpy.ravel(array, order="C")
assert_allclose(expected, result)
if _is_numpy_namespace(xp):
assert numpy.asarray(result).flags["C_CONTIGUOUS"]
@skip_if_array_api_compat_not_configured
@pytest.mark.parametrize("library", ["cupy", "torch", "cupy.array_api"])
def test_convert_to_numpy_gpu(library): # pragma: nocover
"""Check convert_to_numpy for GPU backed libraries."""
xp = pytest.importorskip(library)
if library == "torch":
if not xp.backends.cuda.is_built():
pytest.skip("test requires cuda")
X_gpu = xp.asarray([1.0, 2.0, 3.0], device="cuda")
else:
X_gpu = xp.asarray([1.0, 2.0, 3.0])
X_cpu = _convert_to_numpy(X_gpu, xp=xp)
expected_output = numpy.asarray([1.0, 2.0, 3.0])
assert_allclose(X_cpu, expected_output)
def test_convert_to_numpy_cpu():
"""Check convert_to_numpy for PyTorch CPU arrays."""
torch = pytest.importorskip("torch")
X_torch = torch.asarray([1.0, 2.0, 3.0], device="cpu")
X_cpu = _convert_to_numpy(X_torch, xp=torch)
expected_output = numpy.asarray([1.0, 2.0, 3.0])
assert_allclose(X_cpu, expected_output)
class SimpleEstimator(BaseEstimator):
def fit(self, X, y=None):
self.X_ = X
self.n_features_ = X.shape[0]
return self
@skip_if_array_api_compat_not_configured
@pytest.mark.parametrize(
"array_namespace, converter",
[
("torch", lambda array: array.cpu().numpy()),
("array_api_strict", lambda array: numpy.asarray(array)),
("cupy.array_api", lambda array: array._array.get()),
],
)
def test_convert_estimator_to_ndarray(array_namespace, converter):
"""Convert estimator attributes to ndarray."""
xp = pytest.importorskip(array_namespace)
X = xp.asarray([[1.3, 4.5]])
est = SimpleEstimator().fit(X)
new_est = _estimator_with_converted_arrays(est, converter)
assert isinstance(new_est.X_, numpy.ndarray)
@skip_if_array_api_compat_not_configured
def test_convert_estimator_to_array_api():
"""Convert estimator attributes to ArrayAPI arrays."""
xp = pytest.importorskip("array_api_strict")
X_np = numpy.asarray([[1.3, 4.5]])
est = SimpleEstimator().fit(X_np)
new_est = _estimator_with_converted_arrays(est, lambda array: xp.asarray(array))
assert hasattr(new_est.X_, "__array_namespace__")
def test_reshape_behavior():
"""Check reshape behavior with copy and is strict with non-tuple shape."""
xp = _NumPyAPIWrapper()
X = xp.asarray([[1, 2, 3], [3, 4, 5]])
X_no_copy = xp.reshape(X, (-1,), copy=False)
assert X_no_copy.base is X
X_copy = xp.reshape(X, (6, 1), copy=True)
assert X_copy.base is not X.base
with pytest.raises(TypeError, match="shape must be a tuple"):
xp.reshape(X, -1)
@pytest.mark.parametrize("wrapper", [_ArrayAPIWrapper, _NumPyAPIWrapper])
def test_get_namespace_array_api_isdtype(wrapper):
"""Test isdtype implementation from _ArrayAPIWrapper and _NumPyAPIWrapper."""
if wrapper == _ArrayAPIWrapper:
xp_ = pytest.importorskip("array_api_strict")
xp = _ArrayAPIWrapper(xp_)
else:
xp = _NumPyAPIWrapper()
assert xp.isdtype(xp.float32, xp.float32)
assert xp.isdtype(xp.float32, "real floating")
assert xp.isdtype(xp.float64, "real floating")
assert not xp.isdtype(xp.int32, "real floating")
for dtype in supported_float_dtypes(xp):
assert xp.isdtype(dtype, "real floating")
assert xp.isdtype(xp.bool, "bool")
assert not xp.isdtype(xp.float32, "bool")
assert xp.isdtype(xp.int16, "signed integer")
assert not xp.isdtype(xp.uint32, "signed integer")
assert xp.isdtype(xp.uint16, "unsigned integer")
assert not xp.isdtype(xp.int64, "unsigned integer")
assert xp.isdtype(xp.int64, "numeric")
assert xp.isdtype(xp.float32, "numeric")
assert xp.isdtype(xp.uint32, "numeric")
assert not xp.isdtype(xp.float32, "complex floating")
if wrapper == _NumPyAPIWrapper:
assert not xp.isdtype(xp.int8, "complex floating")
assert xp.isdtype(xp.complex64, "complex floating")
assert xp.isdtype(xp.complex128, "complex floating")
with pytest.raises(ValueError, match="Unrecognized data type"):
assert xp.isdtype(xp.int16, "unknown")
@pytest.mark.parametrize(
"namespace, _device, _dtype", yield_namespace_device_dtype_combinations()
)
def test_indexing_dtype(namespace, _device, _dtype):
xp = _array_api_for_tests(namespace, _device)
if _IS_32BIT:
assert indexing_dtype(xp) == xp.int32
else:
assert indexing_dtype(xp) == xp.int64
def test_get_namespace_and_device():
# Use torch as a library with custom Device objects:
torch = pytest.importorskip("torch")
xp_torch = pytest.importorskip("array_api_compat.torch")
some_torch_tensor = torch.arange(3, device="cpu")
some_numpy_array = numpy.arange(3)
# When dispatch is disabled, get_namespace_and_device should return the
# default NumPy wrapper namespace and no device. Our code will handle such
# inputs via the usual __array__ interface without attempting to dispatch
# via the array API.
namespace, is_array_api, device = get_namespace_and_device(some_torch_tensor)
assert namespace is get_namespace(some_numpy_array)[0]
assert not is_array_api
assert device is None
# Otherwise, expose the torch namespace and device via array API compat
# wrapper.
with config_context(array_api_dispatch=True):
namespace, is_array_api, device = get_namespace_and_device(some_torch_tensor)
assert namespace is xp_torch
assert is_array_api
assert device == some_torch_tensor.device
@pytest.mark.parametrize(
"array_namespace, device_, dtype_name", yield_namespace_device_dtype_combinations()
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
@pytest.mark.parametrize("axis", [0, 1, None, -1, -2])
@pytest.mark.parametrize("sample_weight_type", [None, "int", "float"])
def test_count_nonzero(
array_namespace, device_, dtype_name, csr_container, axis, sample_weight_type
):
from sklearn.utils.sparsefuncs import count_nonzero as sparse_count_nonzero
xp = _array_api_for_tests(array_namespace, device_)
array = numpy.array([[0, 3, 0], [2, -1, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]])
if sample_weight_type == "int":
sample_weight = numpy.asarray([1, 2, 2, 3, 1])
elif sample_weight_type == "float":
sample_weight = numpy.asarray([0.5, 1.5, 0.8, 3.2, 2.4], dtype=dtype_name)
else:
sample_weight = None
expected = sparse_count_nonzero(
csr_container(array), axis=axis, sample_weight=sample_weight
)
array_xp = xp.asarray(array, device=device_)
with config_context(array_api_dispatch=True):
result = _count_nonzero(
array_xp, xp=xp, device=device_, axis=axis, sample_weight=sample_weight
)
assert_allclose(_convert_to_numpy(result, xp=xp), expected)
if np_version < parse_version("2.0.0") or np_version >= parse_version("2.1.0"):
# NumPy 2.0 has a problem with the device attribute of scalar arrays:
# https://github.com/numpy/numpy/issues/26850
assert device(array_xp) == device(result)
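
A minimal usage sketch of the dispatch toggle exercised throughout this file, assuming the optional array_api_compat dependency is available (NumPy >= 2.0 is array API compliant on its own):

import numpy
from sklearn._config import config_context
from sklearn.utils._array_api import get_namespace

X = numpy.asarray([1.0, 2.0, 3.0])
xp, is_compliant = get_namespace(X)  # _NumPyAPIWrapper, is_compliant is False

with config_context(array_api_dispatch=True):
    # With dispatch enabled, the array's own namespace is returned
    # (numpy itself on NumPy 2.0+, else array_api_compat.numpy).
    xp, is_compliant = get_namespace(X)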


@@ -0,0 +1,40 @@
import numpy as np
import pytest
from sklearn.utils._testing import assert_allclose
from sklearn.utils.arrayfuncs import _all_with_any_reduction_axis_1, min_pos
def test_min_pos():
# Check that min_pos returns a positive value and that it's consistent
# between float and double
X = np.random.RandomState(0).randn(100)
min_double = min_pos(X)
min_float = min_pos(X.astype(np.float32))
assert_allclose(min_double, min_float)
assert min_double >= 0
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_min_pos_no_positive(dtype):
# Check that the return value of min_pos is the maximum representable
# value of the input dtype when all input elements are <= 0 (#19328)
X = np.full(100, -1.0).astype(dtype, copy=False)
assert min_pos(X) == np.finfo(dtype).max
@pytest.mark.parametrize(
"dtype", [np.int16, np.int32, np.int64, np.float32, np.float64]
)
@pytest.mark.parametrize("value", [0, 1.5, -1])
def test_all_with_any_reduction_axis_1(dtype, value):
    # Check that the return value is False when there is no row equal to `value`
X = np.arange(12, dtype=dtype).reshape(3, 4)
assert not _all_with_any_reduction_axis_1(X, value=value)
# Make a row equal to `value`
X[1, :] = value
assert _all_with_any_reduction_axis_1(X, value=value)
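
To make the contract concrete, a short usage sketch consistent with the tests above (the output values are what the tests imply, not independently documented here):

import numpy as np
from sklearn.utils.arrayfuncs import min_pos

X = np.array([-1.0, 0.0, 0.5, 2.0])
min_pos(X)                      # 0.5: the smallest strictly positive entry
min_pos(np.array([-1.0, 0.0]))  # np.finfo(np.float64).max: nothing positive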


@@ -0,0 +1,32 @@
import warnings
import numpy as np
import pytest
from sklearn.utils import Bunch
def test_bunch_attribute_deprecation():
"""Check that bunch raises deprecation message with `__getattr__`."""
bunch = Bunch()
values = np.asarray([1, 2, 3])
msg = (
"Key: 'values', is deprecated in 1.3 and will be "
"removed in 1.5. Please use 'grid_values' instead"
)
bunch._set_deprecated(
values, new_key="grid_values", deprecated_key="values", warning_message=msg
)
with warnings.catch_warnings():
# Does not warn for "grid_values"
warnings.simplefilter("error")
v = bunch["grid_values"]
assert v is values
with pytest.warns(FutureWarning, match=msg):
# Warns for "values"
v = bunch["values"]
assert v is values
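
For reference, Bunch is a dict subclass that also exposes its keys as attributes; a minimal sketch:

from sklearn.utils import Bunch

b = Bunch(a=1)
assert b.a == b["a"] == 1  # key and attribute access are interchangeable
b.b = 2                    # attribute assignment also sets the key
assert b["b"] == 2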


@@ -0,0 +1,73 @@
import warnings
from itertools import chain
import pytest
from sklearn import config_context
from sklearn.utils._chunking import gen_even_slices, get_chunk_n_rows
from sklearn.utils._testing import assert_array_equal
def test_gen_even_slices():
    # check that gen_even_slices covers all samples
    some_range = range(10)
    joined_range = list(chain(*[some_range[sl] for sl in gen_even_slices(10, 3)]))
assert_array_equal(some_range, joined_range)
@pytest.mark.parametrize(
("row_bytes", "max_n_rows", "working_memory", "expected"),
[
(1024, None, 1, 1024),
(1024, None, 0.99999999, 1023),
(1023, None, 1, 1025),
(1025, None, 1, 1023),
(1024, None, 2, 2048),
(1024, 7, 1, 7),
(1024 * 1024, None, 1, 1),
],
)
def test_get_chunk_n_rows(row_bytes, max_n_rows, working_memory, expected):
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
actual = get_chunk_n_rows(
row_bytes=row_bytes,
max_n_rows=max_n_rows,
working_memory=working_memory,
)
assert actual == expected
assert type(actual) is type(expected)
with config_context(working_memory=working_memory):
with warnings.catch_warnings():
warnings.simplefilter("error", UserWarning)
actual = get_chunk_n_rows(row_bytes=row_bytes, max_n_rows=max_n_rows)
assert actual == expected
assert type(actual) is type(expected)
def test_get_chunk_n_rows_warns():
"""Check that warning is raised when working_memory is too low."""
row_bytes = 1024 * 1024 + 1
max_n_rows = None
working_memory = 1
expected = 1
warn_msg = (
"Could not adhere to working_memory config. Currently 1MiB, 2MiB required."
)
with pytest.warns(UserWarning, match=warn_msg):
actual = get_chunk_n_rows(
row_bytes=row_bytes,
max_n_rows=max_n_rows,
working_memory=working_memory,
)
assert actual == expected
assert type(actual) is type(expected)
with config_context(working_memory=working_memory):
with pytest.warns(UserWarning, match=warn_msg):
actual = get_chunk_n_rows(row_bytes=row_bytes, max_n_rows=max_n_rows)
assert actual == expected
assert type(actual) is type(expected)
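
A concrete sketch of the slicing helper, consistent with the test above: gen_even_slices(n, n_packs) yields n_packs contiguous slices that partition range(n) as evenly as possible.

from sklearn.utils._chunking import gen_even_slices

print(list(gen_even_slices(10, 3)))
# [slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)]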


@@ -0,0 +1,316 @@
import numpy as np
import pytest
from numpy.testing import assert_allclose
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils._testing import assert_almost_equal, assert_array_almost_equal
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.utils.fixes import CSC_CONTAINERS
def test_compute_class_weight():
# Test (and demo) compute_class_weight.
y = np.asarray([2, 2, 2, 3, 3, 4])
classes = np.unique(y)
cw = compute_class_weight("balanced", classes=classes, y=y)
# total effect of samples is preserved
class_counts = np.bincount(y)[2:]
assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
assert cw[0] < cw[1] < cw[2]
@pytest.mark.parametrize(
"y_type, class_weight, classes, err_msg",
[
(
"numeric",
"balanced",
np.arange(4),
"classes should have valid labels that are in y",
),
# Non-regression for https://github.com/scikit-learn/scikit-learn/issues/8312
(
"numeric",
{"label_not_present": 1.0},
np.arange(4),
r"The classes, \[0, 1, 2, 3\], are not in class_weight",
),
(
"numeric",
"balanced",
np.arange(2),
"classes should include all valid labels",
),
(
"numeric",
{0: 1.0, 1: 2.0},
np.arange(2),
"classes should include all valid labels",
),
(
"string",
{"dogs": 3, "cat": 2},
np.array(["dog", "cat"]),
r"The classes, \['dog'\], are not in class_weight",
),
],
)
def test_compute_class_weight_not_present(y_type, class_weight, classes, err_msg):
# Raise error when y does not contain all class labels
y = (
np.asarray([0, 0, 0, 1, 1, 2])
if y_type == "numeric"
else np.asarray(["dog", "cat", "dog"])
)
with pytest.raises(ValueError, match=err_msg):
compute_class_weight(class_weight, classes=classes, y=y)
def test_compute_class_weight_dict():
classes = np.arange(3)
class_weights = {0: 1.0, 1: 2.0, 2: 3.0}
y = np.asarray([0, 0, 1, 2])
cw = compute_class_weight(class_weights, classes=classes, y=y)
    # When the user specifies class weights, compute_class_weight should just
    # return them.
assert_array_almost_equal(np.asarray([1.0, 2.0, 3.0]), cw)
# When a class weight is specified that isn't in classes, the weight is ignored
class_weights = {0: 1.0, 1: 2.0, 2: 3.0, 4: 1.5}
cw = compute_class_weight(class_weights, classes=classes, y=y)
assert_allclose([1.0, 2.0, 3.0], cw)
class_weights = {-1: 5.0, 0: 4.0, 1: 2.0, 2: 3.0}
cw = compute_class_weight(class_weights, classes=classes, y=y)
assert_allclose([4.0, 2.0, 3.0], cw)
def test_compute_class_weight_invariance():
    # Test that results with class_weight="balanced" are invariant with
    # respect to class imbalance if the number of samples is identical.
# The test uses a balanced two class dataset with 100 datapoints.
# It creates three versions, one where class 1 is duplicated
# resulting in 150 points of class 1 and 50 of class 0,
# one where there are 50 points in class 1 and 150 in class 0,
# and one where there are 100 points of each class (this one is balanced
# again).
    # With balanced class weights, all three should give the same model.
X, y = make_blobs(centers=2, random_state=0)
# create dataset where class 1 is duplicated twice
X_1 = np.vstack([X] + [X[y == 1]] * 2)
y_1 = np.hstack([y] + [y[y == 1]] * 2)
# create dataset where class 0 is duplicated twice
X_0 = np.vstack([X] + [X[y == 0]] * 2)
y_0 = np.hstack([y] + [y[y == 0]] * 2)
# duplicate everything
X_ = np.vstack([X] * 2)
y_ = np.hstack([y] * 2)
# results should be identical
logreg1 = LogisticRegression(class_weight="balanced").fit(X_1, y_1)
logreg0 = LogisticRegression(class_weight="balanced").fit(X_0, y_0)
logreg = LogisticRegression(class_weight="balanced").fit(X_, y_)
assert_array_almost_equal(logreg1.coef_, logreg0.coef_)
assert_array_almost_equal(logreg.coef_, logreg0.coef_)
def test_compute_class_weight_balanced_negative():
# Test compute_class_weight when labels are negative
# Test with balanced class labels.
classes = np.array([-2, -1, 0])
y = np.asarray([-1, -1, 0, 0, -2, -2])
cw = compute_class_weight("balanced", classes=classes, y=y)
assert len(cw) == len(classes)
assert_array_almost_equal(cw, np.array([1.0, 1.0, 1.0]))
# Test with unbalanced class labels.
y = np.asarray([-1, 0, 0, -2, -2, -2])
cw = compute_class_weight("balanced", classes=classes, y=y)
assert len(cw) == len(classes)
class_counts = np.bincount(y + 2)
assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
assert_array_almost_equal(cw, [2.0 / 3, 2.0, 1.0])
def test_compute_class_weight_balanced_unordered():
# Test compute_class_weight when classes are unordered
classes = np.array([1, 0, 3])
y = np.asarray([1, 0, 0, 3, 3, 3])
cw = compute_class_weight("balanced", classes=classes, y=y)
class_counts = np.bincount(y)[classes]
assert_almost_equal(np.dot(cw, class_counts), y.shape[0])
assert_array_almost_equal(cw, [2.0, 1.0, 2.0 / 3])
def test_compute_class_weight_default():
# Test for the case where no weight is given for a present class.
# Current behaviour is to assign the unweighted classes a weight of 1.
y = np.asarray([2, 2, 2, 3, 3, 4])
classes = np.unique(y)
classes_len = len(classes)
    # Test for unspecified weights
cw = compute_class_weight(None, classes=classes, y=y)
assert len(cw) == classes_len
assert_array_almost_equal(cw, np.ones(3))
# Tests for partly specified weights
cw = compute_class_weight({2: 1.5}, classes=classes, y=y)
assert len(cw) == classes_len
assert_array_almost_equal(cw, [1.5, 1.0, 1.0])
cw = compute_class_weight({2: 1.5, 4: 0.5}, classes=classes, y=y)
assert len(cw) == classes_len
assert_array_almost_equal(cw, [1.5, 1.0, 0.5])
def test_compute_sample_weight():
# Test (and demo) compute_sample_weight.
# Test with balanced classes
y = np.asarray([1, 1, 1, 2, 2, 2])
sample_weight = compute_sample_weight("balanced", y)
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
# Test with user-defined weights
sample_weight = compute_sample_weight({1: 2, 2: 1}, y)
assert_array_almost_equal(sample_weight, [2.0, 2.0, 2.0, 1.0, 1.0, 1.0])
# Test with column vector of balanced classes
y = np.asarray([[1], [1], [1], [2], [2], [2]])
sample_weight = compute_sample_weight("balanced", y)
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
# Test with unbalanced classes
y = np.asarray([1, 1, 1, 2, 2, 2, 3])
sample_weight = compute_sample_weight("balanced", y)
expected_balanced = np.array(
[0.7777, 0.7777, 0.7777, 0.7777, 0.7777, 0.7777, 2.3333]
)
assert_array_almost_equal(sample_weight, expected_balanced, decimal=4)
# Test with `None` weights
sample_weight = compute_sample_weight(None, y)
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
# Test with multi-output of balanced classes
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
sample_weight = compute_sample_weight("balanced", y)
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
# Test with multi-output with user-defined weights
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
sample_weight = compute_sample_weight([{1: 2, 2: 1}, {0: 1, 1: 2}], y)
assert_array_almost_equal(sample_weight, [2.0, 2.0, 2.0, 2.0, 2.0, 2.0])
# Test with multi-output of unbalanced classes
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [3, -1]])
sample_weight = compute_sample_weight("balanced", y)
assert_array_almost_equal(sample_weight, expected_balanced**2, decimal=3)
def test_compute_sample_weight_with_subsample():
# Test compute_sample_weight with subsamples specified.
# Test with balanced classes and all samples present
y = np.asarray([1, 1, 1, 2, 2, 2])
sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
# Test with column vector of balanced classes and all samples present
y = np.asarray([[1], [1], [1], [2], [2], [2]])
sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
# Test with a subsample
y = np.asarray([1, 1, 1, 2, 2, 2])
sample_weight = compute_sample_weight("balanced", y, indices=range(4))
assert_array_almost_equal(sample_weight, [2.0 / 3, 2.0 / 3, 2.0 / 3, 2.0, 2.0, 2.0])
# Test with a bootstrap subsample
y = np.asarray([1, 1, 1, 2, 2, 2])
sample_weight = compute_sample_weight("balanced", y, indices=[0, 1, 1, 2, 2, 3])
expected_balanced = np.asarray([0.6, 0.6, 0.6, 3.0, 3.0, 3.0])
assert_array_almost_equal(sample_weight, expected_balanced)
# Test with a bootstrap subsample for multi-output
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
sample_weight = compute_sample_weight("balanced", y, indices=[0, 1, 1, 2, 2, 3])
assert_array_almost_equal(sample_weight, expected_balanced**2)
# Test with a missing class
y = np.asarray([1, 1, 1, 2, 2, 2, 3])
sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0])
# Test with a missing class for multi-output
y = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1], [2, 2]])
sample_weight = compute_sample_weight("balanced", y, indices=range(6))
assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0])
@pytest.mark.parametrize(
"y_type, class_weight, indices, err_msg",
[
(
"single-output",
{1: 2, 2: 1},
range(4),
"The only valid class_weight for subsampling is 'balanced'.",
),
(
"multi-output",
{1: 2, 2: 1},
None,
"For multi-output, class_weight should be a list of dicts, or the string",
),
(
"multi-output",
[{1: 2, 2: 1}],
None,
r"Got 1 element\(s\) while having 2 outputs",
),
],
)
def test_compute_sample_weight_errors(y_type, class_weight, indices, err_msg):
    # Test that compute_sample_weight raises the expected errors.
y_single_output = np.asarray([1, 1, 1, 2, 2, 2])
y_multi_output = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
y = y_single_output if y_type == "single-output" else y_multi_output
with pytest.raises(ValueError, match=err_msg):
compute_sample_weight(class_weight, y, indices=indices)
def test_compute_sample_weight_more_than_32():
# Non-regression smoke test for #12146
y = np.arange(50) # more than 32 distinct classes
indices = np.arange(50) # use subsampling
weight = compute_sample_weight("balanced", y, indices=indices)
assert_array_almost_equal(weight, np.ones(y.shape[0]))
def test_class_weight_does_not_contains_more_classes():
"""Check that class_weight can contain more labels than in y.
Non-regression test for #22413
"""
tree = DecisionTreeClassifier(class_weight={0: 1, 1: 10, 2: 20})
# Does not raise
tree.fit([[0, 0, 1], [1, 0, 1], [1, 2, 0]], [0, 0, 1])
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_compute_sample_weight_sparse(csc_container):
"""Check that we can compute weight for sparse `y`."""
y = csc_container(np.asarray([[0], [1], [1]]))
sample_weight = compute_sample_weight("balanced", y)
assert_allclose(sample_weight, [1.5, 0.75, 0.75])
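
The 0.7777/2.3333 values used above follow from the "balanced" heuristic, n_samples / (n_classes * bincount(y)); a worked sketch:

import numpy as np

y = np.array([1, 1, 1, 2, 2, 2, 3])
counts = np.bincount(y)[1:]                     # [3, 3, 1]
class_weight = len(y) / (len(counts) * counts)  # [0.777..., 0.777..., 2.333...]
# Each sample then receives the weight of its class, matching
# expected_balanced in test_compute_sample_weight.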


@@ -0,0 +1,234 @@
import numpy as np
import pytest
from sklearn.utils._cython_blas import (
ColMajor,
NoTrans,
RowMajor,
Trans,
_asum_memview,
_axpy_memview,
_copy_memview,
_dot_memview,
_gemm_memview,
_gemv_memview,
_ger_memview,
_nrm2_memview,
_rot_memview,
_rotg_memview,
_scal_memview,
)
from sklearn.utils._testing import assert_allclose
def _numpy_to_cython(dtype):
cython = pytest.importorskip("cython")
if dtype == np.float32:
return cython.float
elif dtype == np.float64:
return cython.double
RTOL = {np.float32: 1e-6, np.float64: 1e-12}
ORDER = {RowMajor: "C", ColMajor: "F"}
def _no_op(x):
return x
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_dot(dtype):
dot = _dot_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(10).astype(dtype, copy=False)
expected = x.dot(y)
actual = dot(x, y)
assert_allclose(actual, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_asum(dtype):
asum = _asum_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
expected = np.abs(x).sum()
actual = asum(x)
assert_allclose(actual, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_axpy(dtype):
axpy = _axpy_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(10).astype(dtype, copy=False)
alpha = 2.5
expected = alpha * x + y
axpy(alpha, x, y)
assert_allclose(y, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_nrm2(dtype):
nrm2 = _nrm2_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
expected = np.linalg.norm(x)
actual = nrm2(x)
assert_allclose(actual, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_copy(dtype):
copy = _copy_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = np.empty_like(x)
expected = x.copy()
copy(x, y)
assert_allclose(y, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_scal(dtype):
scal = _scal_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
alpha = 2.5
expected = alpha * x
scal(alpha, x)
assert_allclose(x, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_rotg(dtype):
rotg = _rotg_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
a = dtype(rng.randn())
b = dtype(rng.randn())
c, s = 0.0, 0.0
def expected_rotg(a, b):
roe = a if abs(a) > abs(b) else b
if a == 0 and b == 0:
c, s, r, z = (1, 0, 0, 0)
else:
r = np.sqrt(a**2 + b**2) * (1 if roe >= 0 else -1)
c, s = a / r, b / r
z = s if roe == a else (1 if c == 0 else 1 / c)
return r, z, c, s
expected = expected_rotg(a, b)
actual = rotg(a, b, c, s)
assert_allclose(actual, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_rot(dtype):
rot = _rot_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(10).astype(dtype, copy=False)
c = dtype(rng.randn())
s = dtype(rng.randn())
expected_x = c * x + s * y
expected_y = c * y - s * x
rot(x, y, c, s)
assert_allclose(x, expected_x)
assert_allclose(y, expected_y)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize(
"opA, transA", [(_no_op, NoTrans), (np.transpose, Trans)], ids=["NoTrans", "Trans"]
)
@pytest.mark.parametrize("order", [RowMajor, ColMajor], ids=["RowMajor", "ColMajor"])
def test_gemv(dtype, opA, transA, order):
gemv = _gemv_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
A = np.asarray(
opA(rng.random_sample((20, 10)).astype(dtype, copy=False)), order=ORDER[order]
)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(20).astype(dtype, copy=False)
alpha, beta = 2.5, -0.5
expected = alpha * opA(A).dot(x) + beta * y
gemv(transA, alpha, A, x, beta, y)
assert_allclose(y, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("order", [RowMajor, ColMajor], ids=["RowMajor", "ColMajor"])
def test_ger(dtype, order):
ger = _ger_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
x = rng.random_sample(10).astype(dtype, copy=False)
y = rng.random_sample(20).astype(dtype, copy=False)
A = np.asarray(
rng.random_sample((10, 20)).astype(dtype, copy=False), order=ORDER[order]
)
alpha = 2.5
expected = alpha * np.outer(x, y) + A
ger(alpha, x, y, A)
assert_allclose(A, expected, rtol=RTOL[dtype])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize(
"opB, transB", [(_no_op, NoTrans), (np.transpose, Trans)], ids=["NoTrans", "Trans"]
)
@pytest.mark.parametrize(
"opA, transA", [(_no_op, NoTrans), (np.transpose, Trans)], ids=["NoTrans", "Trans"]
)
@pytest.mark.parametrize("order", [RowMajor, ColMajor], ids=["RowMajor", "ColMajor"])
def test_gemm(dtype, opA, transA, opB, transB, order):
gemm = _gemm_memview[_numpy_to_cython(dtype)]
rng = np.random.RandomState(0)
A = np.asarray(
opA(rng.random_sample((30, 10)).astype(dtype, copy=False)), order=ORDER[order]
)
B = np.asarray(
opB(rng.random_sample((10, 20)).astype(dtype, copy=False)), order=ORDER[order]
)
C = np.asarray(
rng.random_sample((30, 20)).astype(dtype, copy=False), order=ORDER[order]
)
alpha, beta = 2.5, -0.5
expected = alpha * opA(A).dot(opB(B)) + beta * C
gemm(transA, transB, alpha, A, B, beta, C)
assert_allclose(C, expected, rtol=RTOL[dtype])
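
For readers unfamiliar with BLAS naming, the gemm contract checked above is C <- alpha * op(A) @ op(B) + beta * C; the same computation in plain NumPy (NoTrans/NoTrans case):

import numpy as np

rng = np.random.RandomState(0)
A, B, C = rng.rand(3, 2), rng.rand(2, 4), rng.rand(3, 4)
alpha, beta = 2.5, -0.5
expected = alpha * A @ B + beta * C  # what gemm(NoTrans, NoTrans, ...) computes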


@@ -0,0 +1,22 @@
import pathlib
import pytest
import sklearn
def test_files_generated_by_templates_are_git_ignored():
"""Check the consistence of the files generated from template files."""
gitignore_file = pathlib.Path(sklearn.__file__).parent.parent / ".gitignore"
if not gitignore_file.exists():
pytest.skip("Tests are not run from the source folder")
base_dir = pathlib.Path(sklearn.__file__).parent
ignored_files = gitignore_file.read_text().split("\n")
ignored_files = [pathlib.Path(line) for line in ignored_files]
for filename in base_dir.glob("**/*.tp"):
filename = filename.relative_to(base_dir.parent)
# From "path/to/template.p??.tp" to "path/to/template.p??"
filename_wo_tempita_suffix = filename.with_suffix("")
assert filename_wo_tempita_suffix in ignored_files


@@ -0,0 +1,88 @@
# Authors: Raghav RV <rvraghav93@gmail.com>
# License: BSD 3 clause
import pickle
import pytest
from sklearn.utils.deprecation import _is_deprecated, deprecated
@deprecated("qwerty")
class MockClass1:
pass
class MockClass2:
@deprecated("mockclass2_method")
def method(self):
pass
@deprecated("n_features_ is deprecated") # type: ignore
@property
def n_features_(self):
"""Number of input features."""
return 10
class MockClass3:
@deprecated()
def __init__(self):
pass
class MockClass4:
pass
class MockClass5(MockClass1):
"""Inherit from deprecated class but does not call super().__init__."""
def __init__(self, a):
self.a = a
@deprecated("a message")
class MockClass6:
"""A deprecated class that overrides __new__."""
def __new__(cls, *args, **kwargs):
assert len(args) > 0
return super().__new__(cls)
@deprecated()
def mock_function():
return 10
def test_deprecated():
with pytest.warns(FutureWarning, match="qwerty"):
MockClass1()
with pytest.warns(FutureWarning, match="mockclass2_method"):
MockClass2().method()
with pytest.warns(FutureWarning, match="deprecated"):
MockClass3()
with pytest.warns(FutureWarning, match="qwerty"):
MockClass5(42)
with pytest.warns(FutureWarning, match="a message"):
MockClass6(42)
with pytest.warns(FutureWarning, match="deprecated"):
val = mock_function()
assert val == 10
def test_is_deprecated():
    # Test that the _is_deprecated helper identifies wrapping via deprecated.
    # NOTE: it works only for class methods and functions.
assert _is_deprecated(MockClass1.__new__)
assert _is_deprecated(MockClass2().method)
assert _is_deprecated(MockClass3.__init__)
assert not _is_deprecated(MockClass4.__init__)
assert _is_deprecated(MockClass5.__new__)
assert _is_deprecated(mock_function)
def test_pickle():
pickle.loads(pickle.dumps(mock_function))
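
A minimal usage sketch of the decorator under test (the function name and message here are illustrative): deprecated wraps callables and classes so that invoking them emits a FutureWarning carrying the extra message.

import warnings
from sklearn.utils.deprecation import deprecated

@deprecated("use shiny_function instead")  # hypothetical message
def dusty_function():
    return 42

with warnings.catch_warnings(record=True) as rec:
    warnings.simplefilter("always")
    assert dusty_function() == 42  # the return value is preserved
assert issubclass(rec[0].category, FutureWarning)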


@@ -0,0 +1,274 @@
import pickle
import numpy as np
import pytest
from numpy.testing import assert_array_equal
from sklearn.utils._encode import _check_unknown, _encode, _get_counts, _unique
@pytest.mark.parametrize(
"values, expected",
[
(np.array([2, 1, 3, 1, 3], dtype="int64"), np.array([1, 2, 3], dtype="int64")),
(
np.array([2, 1, np.nan, 1, np.nan], dtype="float32"),
np.array([1, 2, np.nan], dtype="float32"),
),
(
np.array(["b", "a", "c", "a", "c"], dtype=object),
np.array(["a", "b", "c"], dtype=object),
),
(
np.array(["b", "a", None, "a", None], dtype=object),
np.array(["a", "b", None], dtype=object),
),
(np.array(["b", "a", "c", "a", "c"]), np.array(["a", "b", "c"])),
],
ids=["int64", "float32-nan", "object", "object-None", "str"],
)
def test_encode_util(values, expected):
uniques = _unique(values)
assert_array_equal(uniques, expected)
result, encoded = _unique(values, return_inverse=True)
assert_array_equal(result, expected)
assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
encoded = _encode(values, uniques=uniques)
assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
result, counts = _unique(values, return_counts=True)
assert_array_equal(result, expected)
assert_array_equal(counts, np.array([2, 1, 2]))
result, encoded, counts = _unique(values, return_inverse=True, return_counts=True)
assert_array_equal(result, expected)
assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
assert_array_equal(counts, np.array([2, 1, 2]))
def test_encode_with_check_unknown():
# test for the check_unknown parameter of _encode()
uniques = np.array([1, 2, 3])
values = np.array([1, 2, 3, 4])
# Default is True, raise error
with pytest.raises(ValueError, match="y contains previously unseen labels"):
_encode(values, uniques=uniques, check_unknown=True)
    # don't raise an error if False
_encode(values, uniques=uniques, check_unknown=False)
# parameter is ignored for object dtype
uniques = np.array(["a", "b", "c"], dtype=object)
values = np.array(["a", "b", "c", "d"], dtype=object)
with pytest.raises(ValueError, match="y contains previously unseen labels"):
_encode(values, uniques=uniques, check_unknown=False)
def _assert_check_unknown(values, uniques, expected_diff, expected_mask):
diff = _check_unknown(values, uniques)
assert_array_equal(diff, expected_diff)
diff, valid_mask = _check_unknown(values, uniques, return_mask=True)
assert_array_equal(diff, expected_diff)
assert_array_equal(valid_mask, expected_mask)
@pytest.mark.parametrize(
"values, uniques, expected_diff, expected_mask",
[
(np.array([1, 2, 3, 4]), np.array([1, 2, 3]), [4], [True, True, True, False]),
(np.array([2, 1, 4, 5]), np.array([2, 5, 1]), [4], [True, True, False, True]),
(np.array([2, 1, np.nan]), np.array([2, 5, 1]), [np.nan], [True, True, False]),
(
np.array([2, 1, 4, np.nan]),
np.array([2, 5, 1, np.nan]),
[4],
[True, True, False, True],
),
(
np.array([2, 1, 4, np.nan]),
np.array([2, 5, 1]),
[4, np.nan],
[True, True, False, False],
),
(
np.array([2, 1, 4, 5]),
np.array([2, 5, 1, np.nan]),
[4],
[True, True, False, True],
),
(
np.array(["a", "b", "c", "d"], dtype=object),
np.array(["a", "b", "c"], dtype=object),
np.array(["d"], dtype=object),
[True, True, True, False],
),
(
np.array(["d", "c", "a", "b"], dtype=object),
np.array(["a", "c", "b"], dtype=object),
np.array(["d"], dtype=object),
[False, True, True, True],
),
(
np.array(["a", "b", "c", "d"]),
np.array(["a", "b", "c"]),
np.array(["d"]),
[True, True, True, False],
),
(
np.array(["d", "c", "a", "b"]),
np.array(["a", "c", "b"]),
np.array(["d"]),
[False, True, True, True],
),
],
)
def test_check_unknown(values, uniques, expected_diff, expected_mask):
_assert_check_unknown(values, uniques, expected_diff, expected_mask)
@pytest.mark.parametrize("missing_value", [None, np.nan, float("nan")])
@pytest.mark.parametrize("pickle_uniques", [True, False])
def test_check_unknown_missing_values(missing_value, pickle_uniques):
    # check _check_unknown with missing values and object dtype
values = np.array(["d", "c", "a", "b", missing_value], dtype=object)
uniques = np.array(["c", "a", "b", missing_value], dtype=object)
if pickle_uniques:
uniques = pickle.loads(pickle.dumps(uniques))
expected_diff = ["d"]
expected_mask = [False, True, True, True, True]
_assert_check_unknown(values, uniques, expected_diff, expected_mask)
values = np.array(["d", "c", "a", "b", missing_value], dtype=object)
uniques = np.array(["c", "a", "b"], dtype=object)
if pickle_uniques:
uniques = pickle.loads(pickle.dumps(uniques))
expected_diff = ["d", missing_value]
expected_mask = [False, True, True, True, False]
_assert_check_unknown(values, uniques, expected_diff, expected_mask)
values = np.array(["a", missing_value], dtype=object)
uniques = np.array(["a", "b", "z"], dtype=object)
if pickle_uniques:
uniques = pickle.loads(pickle.dumps(uniques))
expected_diff = [missing_value]
expected_mask = [True, False]
_assert_check_unknown(values, uniques, expected_diff, expected_mask)
@pytest.mark.parametrize("missing_value", [np.nan, None, float("nan")])
@pytest.mark.parametrize("pickle_uniques", [True, False])
def test_unique_util_missing_values_objects(missing_value, pickle_uniques):
    # check _unique and _encode with missing values and object dtype
values = np.array(["a", "c", "c", missing_value, "b"], dtype=object)
expected_uniques = np.array(["a", "b", "c", missing_value], dtype=object)
uniques = _unique(values)
if missing_value is None:
assert_array_equal(uniques, expected_uniques)
else: # missing_value == np.nan
assert_array_equal(uniques[:-1], expected_uniques[:-1])
assert np.isnan(uniques[-1])
if pickle_uniques:
uniques = pickle.loads(pickle.dumps(uniques))
encoded = _encode(values, uniques=uniques)
assert_array_equal(encoded, np.array([0, 2, 2, 3, 1]))
def test_unique_util_missing_values_numeric():
    # Check missing values in numeric arrays
values = np.array([3, 1, np.nan, 5, 3, np.nan], dtype=float)
expected_uniques = np.array([1, 3, 5, np.nan], dtype=float)
expected_inverse = np.array([1, 0, 3, 2, 1, 3])
uniques = _unique(values)
assert_array_equal(uniques, expected_uniques)
uniques, inverse = _unique(values, return_inverse=True)
assert_array_equal(uniques, expected_uniques)
assert_array_equal(inverse, expected_inverse)
encoded = _encode(values, uniques=uniques)
assert_array_equal(encoded, expected_inverse)
def test_unique_util_with_all_missing_values():
# test for all types of missing values for object dtype
values = np.array([np.nan, "a", "c", "c", None, float("nan"), None], dtype=object)
uniques = _unique(values)
assert_array_equal(uniques[:-1], ["a", "c", None])
# last value is nan
assert np.isnan(uniques[-1])
expected_inverse = [3, 0, 1, 1, 2, 3, 2]
_, inverse = _unique(values, return_inverse=True)
assert_array_equal(inverse, expected_inverse)
def test_check_unknown_with_both_missing_values():
# test for both types of missing values for object dtype
values = np.array([np.nan, "a", "c", "c", None, np.nan, None], dtype=object)
diff = _check_unknown(values, known_values=np.array(["a", "c"], dtype=object))
assert diff[0] is None
assert np.isnan(diff[1])
diff, valid_mask = _check_unknown(
values, known_values=np.array(["a", "c"], dtype=object), return_mask=True
)
assert diff[0] is None
assert np.isnan(diff[1])
assert_array_equal(valid_mask, [False, True, True, True, False, False, False])
@pytest.mark.parametrize(
"values, uniques, expected_counts",
[
(np.array([1] * 10 + [2] * 4 + [3] * 15), np.array([1, 2, 3]), [10, 4, 15]),
(
np.array([1] * 10 + [2] * 4 + [3] * 15),
np.array([1, 2, 3, 5]),
[10, 4, 15, 0],
),
(
np.array([np.nan] * 10 + [2] * 4 + [3] * 15),
np.array([2, 3, np.nan]),
[4, 15, 10],
),
(
np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
["a", "b", "c"],
[16, 4, 20],
),
(
np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
["c", "b", "a"],
[20, 4, 16],
),
(
np.array([np.nan] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
["c", np.nan, "a"],
[20, 4, 16],
),
(
np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
["a", "b", "c", "e"],
[16, 4, 20, 0],
),
],
)
def test_get_counts(values, uniques, expected_counts):
counts = _get_counts(values, uniques)
assert_array_equal(counts, expected_counts)
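
To summarize the encoding utilities exercised above, a small round-trip sketch: _unique returns the sorted unique values and _encode maps each value to its index in uniques.

import numpy as np
from sklearn.utils._encode import _encode, _unique

values = np.array(["b", "a", "c", "a"], dtype=object)
uniques = _unique(values)                # array(['a', 'b', 'c'], dtype=object)
codes = _encode(values, uniques=uniques) # array([1, 0, 2, 0])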

File diff suppressed because it is too large.


@@ -0,0 +1,566 @@
import html
import locale
import re
import types
from contextlib import closing
from io import StringIO
from unittest.mock import patch
import pytest
from sklearn import config_context
from sklearn.base import BaseEstimator
from sklearn.cluster import AgglomerativeClustering, Birch
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import StackingClassifier, StackingRegressor, VotingClassifier
from sklearn.feature_selection import SelectPercentile
from sklearn.gaussian_process.kernels import ExpSineSquared
from sklearn.impute import SimpleImputer
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multiclass import OneVsOneClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils._estimator_html_repr import (
_get_css_style,
_get_visual_block,
_HTMLDocumentationLinkMixin,
_write_label_html,
estimator_html_repr,
)
from sklearn.utils.fixes import parse_version
@pytest.mark.parametrize("checked", [True, False])
def test_write_label_html(checked):
# Test checking logic and labeling
name = "LogisticRegression"
tool_tip = "hello-world"
with closing(StringIO()) as out:
_write_label_html(out, name, tool_tip, checked=checked)
html_label = out.getvalue()
p = (
r'<label for="sk-estimator-id-[0-9]*"'
r' class="sk-toggleable__label (fitted)? sk-toggleable__label-arrow ">'
r"LogisticRegression"
)
re_compiled = re.compile(p)
assert re_compiled.search(html_label)
assert html_label.startswith('<div class="sk-label-container">')
assert "<pre>hello-world</pre>" in html_label
if checked:
assert "checked>" in html_label
@pytest.mark.parametrize("est", ["passthrough", "drop", None])
def test_get_visual_block_single_str_none(est):
# Test estimators that are represented by strings
est_html_info = _get_visual_block(est)
assert est_html_info.kind == "single"
assert est_html_info.estimators == est
assert est_html_info.names == str(est)
assert est_html_info.name_details == str(est)
def test_get_visual_block_single_estimator():
est = LogisticRegression(C=10.0)
est_html_info = _get_visual_block(est)
assert est_html_info.kind == "single"
assert est_html_info.estimators == est
assert est_html_info.names == est.__class__.__name__
assert est_html_info.name_details == str(est)
def test_get_visual_block_pipeline():
pipe = Pipeline(
[
("imputer", SimpleImputer()),
("do_nothing", "passthrough"),
("do_nothing_more", None),
("classifier", LogisticRegression()),
]
)
est_html_info = _get_visual_block(pipe)
assert est_html_info.kind == "serial"
assert est_html_info.estimators == tuple(step[1] for step in pipe.steps)
assert est_html_info.names == [
"imputer: SimpleImputer",
"do_nothing: passthrough",
"do_nothing_more: passthrough",
"classifier: LogisticRegression",
]
assert est_html_info.name_details == [str(est) for _, est in pipe.steps]
def test_get_visual_block_feature_union():
f_union = FeatureUnion([("pca", PCA()), ("svd", TruncatedSVD())])
est_html_info = _get_visual_block(f_union)
assert est_html_info.kind == "parallel"
assert est_html_info.names == ("pca", "svd")
assert est_html_info.estimators == tuple(
trans[1] for trans in f_union.transformer_list
)
assert est_html_info.name_details == (None, None)
def test_get_visual_block_voting():
clf = VotingClassifier(
[("log_reg", LogisticRegression()), ("mlp", MLPClassifier())]
)
est_html_info = _get_visual_block(clf)
assert est_html_info.kind == "parallel"
assert est_html_info.estimators == tuple(trans[1] for trans in clf.estimators)
assert est_html_info.names == ("log_reg", "mlp")
assert est_html_info.name_details == (None, None)
def test_get_visual_block_column_transformer():
ct = ColumnTransformer(
[("pca", PCA(), ["num1", "num2"]), ("svd", TruncatedSVD, [0, 3])]
)
est_html_info = _get_visual_block(ct)
assert est_html_info.kind == "parallel"
assert est_html_info.estimators == tuple(trans[1] for trans in ct.transformers)
assert est_html_info.names == ("pca", "svd")
assert est_html_info.name_details == (["num1", "num2"], [0, 3])
def test_estimator_html_repr_pipeline():
num_trans = Pipeline(
steps=[("pass", "passthrough"), ("imputer", SimpleImputer(strategy="median"))]
)
cat_trans = Pipeline(
steps=[
("imputer", SimpleImputer(strategy="constant", missing_values="empty")),
("one-hot", OneHotEncoder(drop="first")),
]
)
preprocess = ColumnTransformer(
[
("num", num_trans, ["a", "b", "c", "d", "e"]),
("cat", cat_trans, [0, 1, 2, 3]),
]
)
feat_u = FeatureUnion(
[
("pca", PCA(n_components=1)),
(
"tsvd",
Pipeline(
[
("first", TruncatedSVD(n_components=3)),
("select", SelectPercentile()),
]
),
),
]
)
clf = VotingClassifier(
[
("lr", LogisticRegression(solver="lbfgs", random_state=1)),
("mlp", MLPClassifier(alpha=0.001)),
]
)
pipe = Pipeline(
[("preprocessor", preprocess), ("feat_u", feat_u), ("classifier", clf)]
)
html_output = estimator_html_repr(pipe)
    # top-level estimators show their repr with changes
assert html.escape(str(pipe)) in html_output
for _, est in pipe.steps:
assert (
'<div class="sk-toggleable__content "><pre>' + html.escape(str(est))
) in html_output
    # low-level estimators do not show changes
with config_context(print_changed_only=True):
assert html.escape(str(num_trans["pass"])) in html_output
assert "passthrough</label>" in html_output
assert html.escape(str(num_trans["imputer"])) in html_output
for _, _, cols in preprocess.transformers:
assert f"<pre>{html.escape(str(cols))}</pre>" in html_output
# feature union
for name, _ in feat_u.transformer_list:
assert f"<label>{html.escape(name)}</label>" in html_output
pca = feat_u.transformer_list[0][1]
assert f"<pre>{html.escape(str(pca))}</pre>" in html_output
tsvd = feat_u.transformer_list[1][1]
first = tsvd["first"]
select = tsvd["select"]
assert f"<pre>{html.escape(str(first))}</pre>" in html_output
assert f"<pre>{html.escape(str(select))}</pre>" in html_output
# voting classifier
for name, est in clf.estimators:
assert f"<label>{html.escape(name)}</label>" in html_output
assert f"<pre>{html.escape(str(est))}</pre>" in html_output
# verify that prefers-color-scheme is implemented
assert "prefers-color-scheme" in html_output
@pytest.mark.parametrize("final_estimator", [None, LinearSVC()])
def test_stacking_classifier(final_estimator):
estimators = [
("mlp", MLPClassifier(alpha=0.001)),
("tree", DecisionTreeClassifier()),
]
clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator)
html_output = estimator_html_repr(clf)
assert html.escape(str(clf)) in html_output
# If final_estimator's default changes from LogisticRegression
# this should be updated
if final_estimator is None:
assert "LogisticRegression(" in html_output
else:
assert final_estimator.__class__.__name__ in html_output
@pytest.mark.parametrize("final_estimator", [None, LinearSVR()])
def test_stacking_regressor(final_estimator):
reg = StackingRegressor(
estimators=[("svr", LinearSVR())], final_estimator=final_estimator
)
html_output = estimator_html_repr(reg)
assert html.escape(str(reg.estimators[0][0])) in html_output
p = (
r'<label for="sk-estimator-id-[0-9]*"'
r' class="sk-toggleable__label (fitted)? sk-toggleable__label-arrow ">'
r"&nbsp;LinearSVR"
)
re_compiled = re.compile(p)
assert re_compiled.search(html_output)
if final_estimator is None:
p = (
r'<label for="sk-estimator-id-[0-9]*"'
r' class="sk-toggleable__label (fitted)? sk-toggleable__label-arrow ">'
r"&nbsp;RidgeCV"
)
re_compiled = re.compile(p)
assert re_compiled.search(html_output)
else:
assert html.escape(final_estimator.__class__.__name__) in html_output
def test_birch_duck_typing_meta():
# Test duck typing meta estimators with Birch
birch = Birch(n_clusters=AgglomerativeClustering(n_clusters=3))
html_output = estimator_html_repr(birch)
# inner estimators do not show changes
with config_context(print_changed_only=True):
assert f"<pre>{html.escape(str(birch.n_clusters))}" in html_output
assert "AgglomerativeClustering</label>" in html_output
# outer estimator contains all changes
assert f"<pre>{html.escape(str(birch))}" in html_output
def test_ovo_classifier_duck_typing_meta():
# Test duck typing metaestimators with OVO
ovo = OneVsOneClassifier(LinearSVC(penalty="l1"))
html_output = estimator_html_repr(ovo)
# inner estimators do not show changes
with config_context(print_changed_only=True):
assert f"<pre>{html.escape(str(ovo.estimator))}" in html_output
# regex to match the start of the tag
p = (
r'<label for="sk-estimator-id-[0-9]*" '
r'class="sk-toggleable__label sk-toggleable__label-arrow ">&nbsp;LinearSVC'
)
re_compiled = re.compile(p)
assert re_compiled.search(html_output)
# outer estimator
assert f"<pre>{html.escape(str(ovo))}" in html_output
def test_duck_typing_nested_estimator():
# Test duck typing metaestimators with random search
kernel_ridge = KernelRidge(kernel=ExpSineSquared())
param_distributions = {"alpha": [1, 2]}
kernel_ridge_tuned = RandomizedSearchCV(
kernel_ridge,
param_distributions=param_distributions,
)
html_output = estimator_html_repr(kernel_ridge_tuned)
assert "estimator: KernelRidge</label>" in html_output
@pytest.mark.parametrize("print_changed_only", [True, False])
def test_one_estimator_print_change_only(print_changed_only):
pca = PCA(n_components=10)
with config_context(print_changed_only=print_changed_only):
pca_repr = html.escape(str(pca))
html_output = estimator_html_repr(pca)
assert pca_repr in html_output
def test_fallback_exists():
"""Check that repr fallback is in the HTML."""
pca = PCA(n_components=10)
html_output = estimator_html_repr(pca)
assert (
f'<div class="sk-text-repr-fallback"><pre>{html.escape(str(pca))}'
in html_output
)
def test_show_arrow_pipeline():
"""Show arrow in pipeline for top level in pipeline"""
pipe = Pipeline([("scale", StandardScaler()), ("log_Reg", LogisticRegression())])
html_output = estimator_html_repr(pipe)
assert (
'class="sk-toggleable__label sk-toggleable__label-arrow ">&nbsp;&nbsp;Pipeline'
in html_output
)
def test_invalid_parameters_in_stacking():
"""Invalidate stacking configuration uses default repr.
Non-regression test for #24009.
"""
stacker = StackingClassifier(estimators=[])
html_output = estimator_html_repr(stacker)
assert html.escape(str(stacker)) in html_output
def test_estimator_get_params_return_cls():
"""Check HTML repr works where a value in get_params is a class."""
class MyEstimator:
def get_params(self, deep=False):
return {"inner_cls": LogisticRegression}
est = MyEstimator()
assert "MyEstimator" in estimator_html_repr(est)
def test_estimator_html_repr_unfitted_vs_fitted():
"""Check that we have the information that the estimator is fitted or not in the
HTML representation.
"""
class MyEstimator(BaseEstimator):
def fit(self, X, y):
self.fitted_ = True
return self
X, y = load_iris(return_X_y=True)
estimator = MyEstimator()
assert "<span>Not fitted</span>" in estimator_html_repr(estimator)
estimator.fit(X, y)
assert "<span>Fitted</span>" in estimator_html_repr(estimator)
@pytest.mark.parametrize(
"estimator",
[
LogisticRegression(),
make_pipeline(StandardScaler(), LogisticRegression()),
make_pipeline(
make_column_transformer((StandardScaler(), slice(0, 3))),
LogisticRegression(),
),
],
)
def test_estimator_html_repr_fitted_icon(estimator):
"""Check that we are showing the fitted status icon only once."""
pattern = '<span class="sk-estimator-doc-link ">i<span>Not fitted</span></span>'
assert estimator_html_repr(estimator).count(pattern) == 1
X, y = load_iris(return_X_y=True)
estimator.fit(X, y)
pattern = '<span class="sk-estimator-doc-link fitted">i<span>Fitted</span></span>'
assert estimator_html_repr(estimator).count(pattern) == 1
@pytest.mark.parametrize("mock_version", ["1.3.0.dev0", "1.3.0"])
def test_html_documentation_link_mixin_sklearn(mock_version):
"""Check the behaviour of the `_HTMLDocumentationLinkMixin` class for scikit-learn
default.
"""
# mock the `__version__` where the mixin is located
with patch("sklearn.utils._estimator_html_repr.__version__", mock_version):
mixin = _HTMLDocumentationLinkMixin()
assert mixin._doc_link_module == "sklearn"
sklearn_version = parse_version(mock_version)
        # we need to parse the version manually to make sure that this test passes on
        # branches other than `main` (which is "dev").
if sklearn_version.dev is None:
version = f"{sklearn_version.major}.{sklearn_version.minor}"
else:
version = "dev"
assert (
mixin._doc_link_template
== f"https://scikit-learn.org/{version}/modules/generated/"
"{estimator_module}.{estimator_name}.html"
)
assert (
mixin._get_doc_link()
== f"https://scikit-learn.org/{version}/modules/generated/"
"sklearn.utils._HTMLDocumentationLinkMixin.html"
)
@pytest.mark.parametrize(
"module_path,expected_module",
[
("prefix.mymodule", "prefix.mymodule"),
("prefix._mymodule", "prefix"),
("prefix.mypackage._mymodule", "prefix.mypackage"),
("prefix.mypackage._mymodule.submodule", "prefix.mypackage"),
("prefix.mypackage.mymodule.submodule", "prefix.mypackage.mymodule.submodule"),
],
)
def test_html_documentation_link_mixin_get_doc_link_instance(
module_path, expected_module
):
"""Check the behaviour of the `_get_doc_link` with various parameter."""
class FooBar(_HTMLDocumentationLinkMixin):
pass
FooBar.__module__ = module_path
est = FooBar()
    # if we set `_doc_link_module`, then we expect a module and name to be inferred for the estimator
est._doc_link_module = "prefix"
est._doc_link_template = (
"https://website.com/{estimator_module}.{estimator_name}.html"
)
assert est._get_doc_link() == f"https://website.com/{expected_module}.FooBar.html"
@pytest.mark.parametrize(
"module_path,expected_module",
[
("prefix.mymodule", "prefix.mymodule"),
("prefix._mymodule", "prefix"),
("prefix.mypackage._mymodule", "prefix.mypackage"),
("prefix.mypackage._mymodule.submodule", "prefix.mypackage"),
("prefix.mypackage.mymodule.submodule", "prefix.mypackage.mymodule.submodule"),
],
)
def test_html_documentation_link_mixin_get_doc_link_class(module_path, expected_module):
"""Check the behaviour of the `_get_doc_link` when `_doc_link_module` and
`_doc_link_template` are defined at the class level and not at the instance
level."""
class FooBar(_HTMLDocumentationLinkMixin):
_doc_link_module = "prefix"
_doc_link_template = (
"https://website.com/{estimator_module}.{estimator_name}.html"
)
FooBar.__module__ = module_path
est = FooBar()
assert est._get_doc_link() == f"https://website.com/{expected_module}.FooBar.html"
def test_html_documentation_link_mixin_get_doc_link_out_of_library():
"""Check the behaviour of the `_get_doc_link` with various parameter."""
mixin = _HTMLDocumentationLinkMixin()
# if the `_doc_link_module` does not refer to the root module of the estimator
# (here the mixin), then we should return an empty string.
mixin._doc_link_module = "xxx"
assert mixin._get_doc_link() == ""
def test_html_documentation_link_mixin_doc_link_url_param_generator_instance():
mixin = _HTMLDocumentationLinkMixin()
# we can bypass the generation by providing our own callable
mixin._doc_link_template = (
"https://website.com/{my_own_variable}.{another_variable}.html"
)
def url_param_generator(estimator):
return {
"my_own_variable": "value_1",
"another_variable": "value_2",
}
mixin._doc_link_url_param_generator = types.MethodType(url_param_generator, mixin)
assert mixin._get_doc_link() == "https://website.com/value_1.value_2.html"
def test_html_documentation_link_mixin_doc_link_url_param_generator_class():
# we can bypass the generation by providing our own callable
def url_param_generator(estimator):
return {
"my_own_variable": "value_1",
"another_variable": "value_2",
}
class FooBar(_HTMLDocumentationLinkMixin):
_doc_link_template = (
"https://website.com/{my_own_variable}.{another_variable}.html"
)
_doc_link_url_param_generator = url_param_generator
estimator = FooBar()
assert estimator._get_doc_link() == "https://website.com/value_1.value_2.html"
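# A minimal sketch (not part of the original tests) of the templating
# convention exercised above: `_get_doc_link` fills `_doc_link_template` with
# the estimator's public module path and class name, so a third-party library
# only needs to define two attributes. All names below are hypothetical.
def example_third_party_doc_link():
    class MyEstimator(_HTMLDocumentationLinkMixin):
        _doc_link_module = "mylib"
        _doc_link_template = (
            "https://mylib.example.com/api/{estimator_module}.{estimator_name}.html"
        )

    MyEstimator.__module__ = "mylib.models"
    # -> "https://mylib.example.com/api/mylib.models.MyEstimator.html"
    return MyEstimator()._get_doc_link()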
@pytest.fixture
def set_non_utf8_locale():
"""Pytest fixture to set non utf-8 locale during the test.
The locale is set to the original one after the test has run.
"""
try:
locale.setlocale(locale.LC_CTYPE, "C")
except locale.Error:
pytest.skip("'C' locale is not available on this OS")
yield
    # Resets the locale to the original one. Python calls setlocale(LC_CTYPE, "")
# at startup according to
# https://docs.python.org/3/library/locale.html#background-details-hints-tips-and-caveats.
# This assumes that no other locale changes have been made. For some reason,
# on some platforms, trying to restore locale with something like
# locale.setlocale(locale.LC_CTYPE, locale.getlocale()) raises a
# locale.Error: unsupported locale setting
locale.setlocale(locale.LC_CTYPE, "")
def test_non_utf8_locale(set_non_utf8_locale):
"""Checks that utf8 encoding is used when reading the CSS file.
Non-regression test for https://github.com/scikit-learn/scikit-learn/issues/27725
"""
_get_css_style()
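# A minimal sketch (not part of the original tests) of the pattern verified
# above: read bundled text assets with an explicit encoding so that a "C"
# locale cannot change the default codec and garble non-ASCII characters.
# `read_text_asset` is a hypothetical helper, not sklearn API.
def read_text_asset(path):
    from pathlib import Path

    return Path(path).read_text(encoding="utf-8")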

File diff suppressed because it is too large


@@ -0,0 +1,47 @@
"""Test fast_dict."""
import numpy as np
from numpy.testing import assert_allclose, assert_array_equal
from sklearn.utils._fast_dict import IntFloatDict, argmin
def test_int_float_dict():
rng = np.random.RandomState(0)
keys = np.unique(rng.randint(100, size=10).astype(np.intp))
values = rng.rand(len(keys))
d = IntFloatDict(keys, values)
for key, value in zip(keys, values):
assert d[key] == value
assert len(d) == len(keys)
d.append(120, 3.0)
assert d[120] == 3.0
assert len(d) == len(keys) + 1
for i in range(2000):
d.append(i + 1000, 4.0)
assert d[1100] == 4.0
def test_int_float_dict_argmin():
# Test the argmin implementation on the IntFloatDict
keys = np.arange(100, dtype=np.intp)
values = np.arange(100, dtype=np.float64)
d = IntFloatDict(keys, values)
assert argmin(d) == (0, 0)
def test_to_arrays():
# Test that an IntFloatDict is converted into arrays
# of keys and values correctly
keys_in = np.array([1, 2, 3], dtype=np.intp)
values_in = np.array([4, 5, 6], dtype=np.float64)
d = IntFloatDict(keys_in, values_in)
keys_out, values_out = d.to_arrays()
assert keys_out.dtype == keys_in.dtype
assert values_in.dtype == values_out.dtype
assert_array_equal(keys_out, keys_in)
assert_allclose(values_out, values_in)
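# A short usage sketch (not part of the original tests): IntFloatDict is a
# Cython int -> float mapping used internally by scikit-learn; keys must be
# np.intp, and `argmin` returns the (key, value) pair with the smallest value.
def example_int_float_dict():
    keys = np.array([3, 7], dtype=np.intp)
    values = np.array([0.5, 0.1], dtype=np.float64)
    d = IntFloatDict(keys, values)
    d.append(9, 2.0)
    assert argmin(d) == (7, 0.1)
    return d.to_arrays()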


@@ -0,0 +1,162 @@
# Authors: Gael Varoquaux <gael.varoquaux@normalesup.org>
# Justin Vincent
# Lars Buitinck
# License: BSD 3 clause
import numpy as np
import pytest
from sklearn.utils._testing import assert_array_equal
from sklearn.utils.fixes import _object_dtype_isnan, _smallest_admissible_index_dtype
@pytest.mark.parametrize("dtype, val", ([object, 1], [object, "a"], [float, 1]))
def test_object_dtype_isnan(dtype, val):
X = np.array([[val, np.nan], [np.nan, val]], dtype=dtype)
expected_mask = np.array([[False, True], [True, False]])
mask = _object_dtype_isnan(X)
assert_array_equal(mask, expected_mask)
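# A one-line sketch (not part of the original tests) of the trick under test:
# NaN is the only value for which `x != x` holds, and the comparison works
# elementwise on object arrays, so no float cast is needed.
def example_object_isnan():
    X = np.array([["a", np.nan], [np.nan, 1]], dtype=object)
    return X != X  # [[False, True], [True, False]]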
@pytest.mark.parametrize(
"params, expected_dtype",
[
({}, np.int32), # default behaviour
({"maxval": np.iinfo(np.int32).max}, np.int32),
({"maxval": np.iinfo(np.int32).max + 1}, np.int64),
],
)
def test_smallest_admissible_index_dtype_max_val(params, expected_dtype):
"""Check the behaviour of `smallest_admissible_index_dtype` depending only on the
`max_val` parameter.
"""
assert _smallest_admissible_index_dtype(**params) == expected_dtype
@pytest.mark.parametrize(
"params, expected_dtype",
[
        # Arrays dtype is int64 and thus should not be downcast to int32 without
        # checking the contents or providing maxval.
({"arrays": np.array([1, 2], dtype=np.int64)}, np.int64),
        # One of the arrays is int64 and should not be downcast to int32
        # for the same reason.
(
{
"arrays": (
np.array([1, 2], dtype=np.int32),
np.array([1, 2], dtype=np.int64),
)
},
np.int64,
),
# Both arrays are already int32: we can just keep this dtype.
(
{
"arrays": (
np.array([1, 2], dtype=np.int32),
np.array([1, 2], dtype=np.int32),
)
},
np.int32,
),
# Arrays should be upcasted to at least int32 precision.
({"arrays": np.array([1, 2], dtype=np.int8)}, np.int32),
# Check that `maxval` takes precedence over the arrays and thus upcast to
# int64.
(
{
"arrays": np.array([1, 2], dtype=np.int32),
"maxval": np.iinfo(np.int32).max + 1,
},
np.int64,
),
],
)
def test_smallest_admissible_index_dtype_without_checking_contents(
params, expected_dtype
):
"""Check the behaviour of `smallest_admissible_index_dtype` using the passed
arrays but without checking the contents of the arrays.
"""
assert _smallest_admissible_index_dtype(**params) == expected_dtype
@pytest.mark.parametrize(
"params, expected_dtype",
[
# empty arrays should always be converted to int32 indices
(
{
"arrays": (np.array([], dtype=np.int64), np.array([], dtype=np.int64)),
"check_contents": True,
},
np.int32,
),
# arrays respecting np.iinfo(np.int32).min < x < np.iinfo(np.int32).max should
# be converted to int32,
(
{"arrays": np.array([1], dtype=np.int64), "check_contents": True},
np.int32,
),
        # otherwise, it should be converted to int64. We need to create a uint32
        # array to accommodate a value > np.iinfo(np.int32).max
(
{
"arrays": np.array([np.iinfo(np.int32).max + 1], dtype=np.uint32),
"check_contents": True,
},
np.int64,
),
# maxval should take precedence over the arrays contents and thus upcast to
# int64.
(
{
"arrays": np.array([1], dtype=np.int32),
"check_contents": True,
"maxval": np.iinfo(np.int32).max + 1,
},
np.int64,
),
# when maxval is small, but check_contents is True and the contents
# require np.int64, we still require np.int64 indexing in the end.
(
{
"arrays": np.array([np.iinfo(np.int32).max + 1], dtype=np.uint32),
"check_contents": True,
"maxval": 1,
},
np.int64,
),
],
)
def test_smallest_admissible_index_dtype_by_checking_contents(params, expected_dtype):
"""Check the behaviour of `smallest_admissible_index_dtype` using the dtype of the
arrays but as well the contents.
"""
assert _smallest_admissible_index_dtype(**params) == expected_dtype
@pytest.mark.parametrize(
"params, err_type, err_msg",
[
(
{"maxval": np.iinfo(np.int64).max + 1},
ValueError,
"is to large to be represented as np.int64",
),
(
{"arrays": np.array([1, 2], dtype=np.float64)},
ValueError,
"Array dtype float64 is not supported",
),
({"arrays": [1, 2]}, TypeError, "Arrays should be of type np.ndarray"),
],
)
def test_smallest_admissible_index_dtype_error(params, err_type, err_msg):
"""Check that we raise the proper error message."""
with pytest.raises(err_type, match=err_msg):
_smallest_admissible_index_dtype(**params)
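# An illustrative recap (not sklearn code) of the decision rule tested above:
# indices stay int32 when every array dtype, every checked value and `maxval`
# fit into the int32 range, and are upcast to int64 as soon as any does not.
def fits_in_int32(value):
    info = np.iinfo(np.int32)
    return info.min <= value <= info.max
assert fits_in_int32(np.iinfo(np.int32).max)          # -> int32 indices
assert not fits_in_int32(np.iinfo(np.int32).max + 1)  # -> int64 indices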


@@ -0,0 +1,80 @@
import numpy as np
import pytest
from scipy.sparse.csgraph import connected_components
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.neighbors import kneighbors_graph
from sklearn.utils.graph import _fix_connected_components
def test_fix_connected_components():
    # Test that _fix_connected_components reduces the number of components to 1.
X = np.array([0, 1, 2, 5, 6, 7])[:, None]
graph = kneighbors_graph(X, n_neighbors=2, mode="distance")
n_connected_components, labels = connected_components(graph)
assert n_connected_components > 1
graph = _fix_connected_components(X, graph, n_connected_components, labels)
n_connected_components, labels = connected_components(graph)
assert n_connected_components == 1
def test_fix_connected_components_precomputed():
# Test that _fix_connected_components accepts precomputed distance matrix.
X = np.array([0, 1, 2, 5, 6, 7])[:, None]
graph = kneighbors_graph(X, n_neighbors=2, mode="distance")
n_connected_components, labels = connected_components(graph)
assert n_connected_components > 1
distances = pairwise_distances(X)
graph = _fix_connected_components(
distances, graph, n_connected_components, labels, metric="precomputed"
)
n_connected_components, labels = connected_components(graph)
assert n_connected_components == 1
# but it does not work with precomputed neighbors graph
with pytest.raises(RuntimeError, match="does not work with a sparse"):
_fix_connected_components(
graph, graph, n_connected_components, labels, metric="precomputed"
)
def test_fix_connected_components_wrong_mode():
    # Test that an error is raised if the mode string is incorrect.
X = np.array([0, 1, 2, 5, 6, 7])[:, None]
graph = kneighbors_graph(X, n_neighbors=2, mode="distance")
n_connected_components, labels = connected_components(graph)
with pytest.raises(ValueError, match="Unknown mode"):
graph = _fix_connected_components(
X, graph, n_connected_components, labels, mode="foo"
)
def test_fix_connected_components_connectivity_mode():
    # Test that the connectivity mode fills new connections with ones.
X = np.array([0, 1, 6, 7])[:, None]
graph = kneighbors_graph(X, n_neighbors=1, mode="connectivity")
n_connected_components, labels = connected_components(graph)
graph = _fix_connected_components(
X, graph, n_connected_components, labels, mode="connectivity"
)
assert np.all(graph.data == 1)
def test_fix_connected_components_distance_mode():
# Test that the distance mode does not fill new connections with ones.
X = np.array([0, 1, 6, 7])[:, None]
graph = kneighbors_graph(X, n_neighbors=1, mode="distance")
assert np.all(graph.data == 1)
n_connected_components, labels = connected_components(graph)
graph = _fix_connected_components(
X, graph, n_connected_components, labels, mode="distance"
)
assert not np.all(graph.data == 1)
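# A usage sketch (not part of the original tests): stitching a disconnected
# k-NN graph into a single component by adding the shortest edges between
# components, mirroring the calls exercised above.
def example_fix_connected_components():
    X = np.array([0, 1, 2, 10, 11, 12])[:, None]
    graph = kneighbors_graph(X, n_neighbors=2, mode="distance")
    n_components, labels = connected_components(graph)
    graph = _fix_connected_components(X, graph, n_components, labels)
    return connected_components(graph)[0]  # 1: a single connected component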


@@ -0,0 +1,594 @@
import warnings
from copy import copy
from unittest import SkipTest
import numpy as np
import pytest
import sklearn
from sklearn.externals._packaging.version import parse as parse_version
from sklearn.utils import _safe_indexing, resample, shuffle
from sklearn.utils._array_api import yield_namespace_device_dtype_combinations
from sklearn.utils._indexing import (
_determine_key_type,
_get_column_indices,
_safe_assign,
)
from sklearn.utils._mocking import MockDataFrame
from sklearn.utils._testing import (
_array_api_for_tests,
_convert_container,
assert_allclose_dense_sparse,
assert_array_equal,
skip_if_array_api_compat_not_configured,
)
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS
# toy array
X_toy = np.arange(9).reshape((3, 3))
def test_polars_indexing():
"""Check _safe_indexing for polars as expected."""
pl = pytest.importorskip("polars", minversion="0.18.2")
df = pl.DataFrame(
{"a": [1, 2, 3, 4], "b": [4, 5, 6, 8], "c": [1, 4, 1, 10]}, orient="row"
)
from polars.testing import assert_frame_equal
str_keys = [["b"], ["a", "b"], ["b", "a", "c"], ["c"], ["a"]]
for key in str_keys:
out = _safe_indexing(df, key, axis=1)
assert_frame_equal(df[key], out)
bool_keys = [([True, False, True], ["a", "c"]), ([False, False, True], ["c"])]
for bool_key, str_key in bool_keys:
out = _safe_indexing(df, bool_key, axis=1)
assert_frame_equal(df[:, str_key], out)
int_keys = [([0, 1], ["a", "b"]), ([2], ["c"])]
for int_key, str_key in int_keys:
out = _safe_indexing(df, int_key, axis=1)
assert_frame_equal(df[:, str_key], out)
axis_0_keys = [[0, 1], [1, 3], [3, 2]]
for key in axis_0_keys:
out = _safe_indexing(df, key, axis=0)
assert_frame_equal(df[key], out)
@pytest.mark.parametrize(
"key, dtype",
[
(0, "int"),
("0", "str"),
(True, "bool"),
(np.bool_(True), "bool"),
([0, 1, 2], "int"),
(["0", "1", "2"], "str"),
((0, 1, 2), "int"),
(("0", "1", "2"), "str"),
(slice(None, None), None),
(slice(0, 2), "int"),
(np.array([0, 1, 2], dtype=np.int32), "int"),
(np.array([0, 1, 2], dtype=np.int64), "int"),
(np.array([0, 1, 2], dtype=np.uint8), "int"),
([True, False], "bool"),
((True, False), "bool"),
(np.array([True, False]), "bool"),
("col_0", "str"),
(["col_0", "col_1", "col_2"], "str"),
(("col_0", "col_1", "col_2"), "str"),
(slice("begin", "end"), "str"),
(np.array(["col_0", "col_1", "col_2"]), "str"),
(np.array(["col_0", "col_1", "col_2"], dtype=object), "str"),
],
)
def test_determine_key_type(key, dtype):
assert _determine_key_type(key) == dtype
def test_determine_key_type_error():
with pytest.raises(ValueError, match="No valid specification of the"):
_determine_key_type(1.0)
def test_determine_key_type_slice_error():
with pytest.raises(TypeError, match="Only array-like or scalar are"):
_determine_key_type(slice(0, 2, 1), accept_slice=False)
@skip_if_array_api_compat_not_configured
@pytest.mark.parametrize(
"array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations()
)
def test_determine_key_type_array_api(array_namespace, device, dtype_name):
xp = _array_api_for_tests(array_namespace, device)
with sklearn.config_context(array_api_dispatch=True):
int_array_key = xp.asarray([1, 2, 3])
assert _determine_key_type(int_array_key) == "int"
bool_array_key = xp.asarray([True, False, True])
assert _determine_key_type(bool_array_key) == "bool"
try:
complex_array_key = xp.asarray([1 + 1j, 2 + 2j, 3 + 3j])
except TypeError:
# Complex numbers are not supported by all Array API libraries.
complex_array_key = None
if complex_array_key is not None:
with pytest.raises(ValueError, match="No valid specification of the"):
_determine_key_type(complex_array_key)
@pytest.mark.parametrize(
"array_type", ["list", "array", "sparse", "dataframe", "polars"]
)
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series", "slice"])
def test_safe_indexing_2d_container_axis_0(array_type, indices_type):
indices = [1, 2]
if indices_type == "slice" and isinstance(indices[1], int):
indices[1] += 1
array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
indices = _convert_container(indices, indices_type)
subset = _safe_indexing(array, indices, axis=0)
assert_allclose_dense_sparse(
subset, _convert_container([[4, 5, 6], [7, 8, 9]], array_type)
)
@pytest.mark.parametrize("array_type", ["list", "array", "series", "polars_series"])
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series", "slice"])
def test_safe_indexing_1d_container(array_type, indices_type):
indices = [1, 2]
if indices_type == "slice" and isinstance(indices[1], int):
indices[1] += 1
array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
indices = _convert_container(indices, indices_type)
subset = _safe_indexing(array, indices, axis=0)
assert_allclose_dense_sparse(subset, _convert_container([2, 3], array_type))
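# A quick usage sketch (not part of the original tests): _safe_indexing
# applies the same row or column selection across lists, NumPy arrays, sparse
# matrices and dataframes, returning a container of the same family as the
# input.
def example_safe_indexing():
    X = np.arange(9).reshape(3, 3)
    rows = _safe_indexing(X, [1, 2], axis=0)     # rows 1 and 2
    cols = _safe_indexing(X, [0, 2], axis=1)     # columns 0 and 2
    head = _safe_indexing([10, 20, 30], [0, 1])  # plain lists work too
    return rows, cols, head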
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe", "polars"])
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series", "slice"])
@pytest.mark.parametrize("indices", [[1, 2], ["col_1", "col_2"]])
def test_safe_indexing_2d_container_axis_1(array_type, indices_type, indices):
# validation of the indices
# we make a copy because indices is mutable and shared between tests
indices_converted = copy(indices)
if indices_type == "slice" and isinstance(indices[1], int):
indices_converted[1] += 1
columns_name = ["col_0", "col_1", "col_2"]
array = _convert_container(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
)
indices_converted = _convert_container(indices_converted, indices_type)
if isinstance(indices[0], str) and array_type not in ("dataframe", "polars"):
err_msg = (
"Specifying the columns using strings is only supported for dataframes"
)
with pytest.raises(ValueError, match=err_msg):
_safe_indexing(array, indices_converted, axis=1)
else:
subset = _safe_indexing(array, indices_converted, axis=1)
assert_allclose_dense_sparse(
subset, _convert_container([[2, 3], [5, 6], [8, 9]], array_type)
)
@pytest.mark.parametrize("array_read_only", [True, False])
@pytest.mark.parametrize("indices_read_only", [True, False])
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe", "polars"])
@pytest.mark.parametrize("indices_type", ["array", "series"])
@pytest.mark.parametrize(
"axis, expected_array", [(0, [[4, 5, 6], [7, 8, 9]]), (1, [[2, 3], [5, 6], [8, 9]])]
)
def test_safe_indexing_2d_read_only_axis_1(
array_read_only, indices_read_only, array_type, indices_type, axis, expected_array
):
array = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
if array_read_only:
array.setflags(write=False)
array = _convert_container(array, array_type)
indices = np.array([1, 2])
if indices_read_only:
indices.setflags(write=False)
indices = _convert_container(indices, indices_type)
subset = _safe_indexing(array, indices, axis=axis)
assert_allclose_dense_sparse(subset, _convert_container(expected_array, array_type))
@pytest.mark.parametrize("array_type", ["list", "array", "series", "polars_series"])
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"])
def test_safe_indexing_1d_container_mask(array_type, indices_type):
indices = [False] + [True] * 2 + [False] * 6
array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
indices = _convert_container(indices, indices_type)
subset = _safe_indexing(array, indices, axis=0)
assert_allclose_dense_sparse(subset, _convert_container([2, 3], array_type))
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe", "polars"])
@pytest.mark.parametrize("indices_type", ["list", "tuple", "array", "series"])
@pytest.mark.parametrize(
"axis, expected_subset",
[(0, [[4, 5, 6], [7, 8, 9]]), (1, [[2, 3], [5, 6], [8, 9]])],
)
def test_safe_indexing_2d_mask(array_type, indices_type, axis, expected_subset):
columns_name = ["col_0", "col_1", "col_2"]
array = _convert_container(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
)
indices = [False, True, True]
indices = _convert_container(indices, indices_type)
subset = _safe_indexing(array, indices, axis=axis)
assert_allclose_dense_sparse(
subset, _convert_container(expected_subset, array_type)
)
@pytest.mark.parametrize(
"array_type, expected_output_type",
[
("list", "list"),
("array", "array"),
("sparse", "sparse"),
("dataframe", "series"),
("polars", "polars_series"),
],
)
def test_safe_indexing_2d_scalar_axis_0(array_type, expected_output_type):
array = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
indices = 2
subset = _safe_indexing(array, indices, axis=0)
expected_array = _convert_container([7, 8, 9], expected_output_type)
assert_allclose_dense_sparse(subset, expected_array)
@pytest.mark.parametrize("array_type", ["list", "array", "series", "polars_series"])
def test_safe_indexing_1d_scalar(array_type):
array = _convert_container([1, 2, 3, 4, 5, 6, 7, 8, 9], array_type)
indices = 2
subset = _safe_indexing(array, indices, axis=0)
assert subset == 3
@pytest.mark.parametrize(
"array_type, expected_output_type",
[
("array", "array"),
("sparse", "sparse"),
("dataframe", "series"),
("polars", "polars_series"),
],
)
@pytest.mark.parametrize("indices", [2, "col_2"])
def test_safe_indexing_2d_scalar_axis_1(array_type, expected_output_type, indices):
columns_name = ["col_0", "col_1", "col_2"]
array = _convert_container(
[[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type, columns_name
)
if isinstance(indices, str) and array_type not in ("dataframe", "polars"):
err_msg = (
"Specifying the columns using strings is only supported for dataframes"
)
with pytest.raises(ValueError, match=err_msg):
_safe_indexing(array, indices, axis=1)
else:
subset = _safe_indexing(array, indices, axis=1)
expected_output = [3, 6, 9]
if expected_output_type == "sparse":
            # sparse matrices keep the 2D shape
expected_output = [[3], [6], [9]]
expected_array = _convert_container(expected_output, expected_output_type)
assert_allclose_dense_sparse(subset, expected_array)
@pytest.mark.parametrize("array_type", ["list", "array", "sparse"])
def test_safe_indexing_None_axis_0(array_type):
X = _convert_container([[1, 2, 3], [4, 5, 6], [7, 8, 9]], array_type)
X_subset = _safe_indexing(X, None, axis=0)
assert_allclose_dense_sparse(X_subset, X)
def test_safe_indexing_pandas_no_matching_cols_error():
pd = pytest.importorskip("pandas")
err_msg = "No valid specification of the columns."
X = pd.DataFrame(X_toy)
with pytest.raises(ValueError, match=err_msg):
_safe_indexing(X, [1.0], axis=1)
@pytest.mark.parametrize("axis", [None, 3])
def test_safe_indexing_error_axis(axis):
with pytest.raises(ValueError, match="'axis' should be either 0"):
_safe_indexing(X_toy, [0, 1], axis=axis)
@pytest.mark.parametrize("X_constructor", ["array", "series", "polars_series"])
def test_safe_indexing_1d_array_error(X_constructor):
# check that we are raising an error if the array-like passed is 1D and
# we try to index on the 2nd dimension
X = list(range(5))
if X_constructor == "array":
X_constructor = np.asarray(X)
elif X_constructor == "series":
pd = pytest.importorskip("pandas")
X_constructor = pd.Series(X)
elif X_constructor == "polars_series":
pl = pytest.importorskip("polars")
X_constructor = pl.Series(values=X)
err_msg = "'X' should be a 2D NumPy array, 2D sparse matrix or dataframe"
with pytest.raises(ValueError, match=err_msg):
_safe_indexing(X_constructor, [0, 1], axis=1)
def test_safe_indexing_container_axis_0_unsupported_type():
indices = ["col_1", "col_2"]
array = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
err_msg = "String indexing is not supported with 'axis=0'"
with pytest.raises(ValueError, match=err_msg):
_safe_indexing(array, indices, axis=0)
def test_safe_indexing_pandas_no_settingwithcopy_warning():
# Using safe_indexing with an array-like indexer gives a copy of the
# DataFrame -> ensure it doesn't raise a warning if modified
pd = pytest.importorskip("pandas")
pd_version = parse_version(pd.__version__)
pd_base_version = parse_version(pd_version.base_version)
if pd_base_version >= parse_version("3"):
raise SkipTest("SettingWithCopyWarning has been removed in pandas 3.0.0.dev")
X = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
subset = _safe_indexing(X, [0, 1], axis=0)
if hasattr(pd.errors, "SettingWithCopyWarning"):
SettingWithCopyWarning = pd.errors.SettingWithCopyWarning
else:
# backward compatibility for pandas < 1.5
SettingWithCopyWarning = pd.core.common.SettingWithCopyWarning
with warnings.catch_warnings():
warnings.simplefilter("error", SettingWithCopyWarning)
subset.iloc[0, 0] = 10
# The original dataframe is unaffected by the assignment on the subset:
assert X.iloc[0, 0] == 1
@pytest.mark.parametrize("indices", [0, [0, 1], slice(0, 2), np.array([0, 1])])
def test_safe_indexing_list_axis_1_unsupported(indices):
"""Check that we raise a ValueError when axis=1 with input as list."""
X = [[1, 2], [4, 5], [7, 8]]
err_msg = "axis=1 is not supported for lists"
with pytest.raises(ValueError, match=err_msg):
_safe_indexing(X, indices, axis=1)
@pytest.mark.parametrize("array_type", ["array", "sparse", "dataframe"])
def test_safe_assign(array_type):
"""Check that `_safe_assign` works as expected."""
rng = np.random.RandomState(0)
X_array = rng.randn(10, 5)
row_indexer = [1, 2]
values = rng.randn(len(row_indexer), X_array.shape[1])
X = _convert_container(X_array, array_type)
_safe_assign(X, values, row_indexer=row_indexer)
assigned_portion = _safe_indexing(X, row_indexer, axis=0)
assert_allclose_dense_sparse(
assigned_portion, _convert_container(values, array_type)
)
column_indexer = [1, 2]
values = rng.randn(X_array.shape[0], len(column_indexer))
X = _convert_container(X_array, array_type)
_safe_assign(X, values, column_indexer=column_indexer)
assigned_portion = _safe_indexing(X, column_indexer, axis=1)
assert_allclose_dense_sparse(
assigned_portion, _convert_container(values, array_type)
)
row_indexer, column_indexer = None, None
values = rng.randn(*X.shape)
X = _convert_container(X_array, array_type)
_safe_assign(X, values, column_indexer=column_indexer)
assert_allclose_dense_sparse(X, _convert_container(values, array_type))
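# A usage sketch (not part of the original tests): _safe_assign is the
# write-side counterpart of _safe_indexing, assigning a block of values into
# the selected rows and/or columns of an array-like in place.
def example_safe_assign():
    X = np.zeros((3, 3))
    _safe_assign(X, np.ones((2, 3)), row_indexer=[0, 2])
    return X  # rows 0 and 2 are now all ones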
@pytest.mark.parametrize(
"key, err_msg",
[
(10, r"all features must be in \[0, 2\]"),
("whatever", "A given column is not a column of the dataframe"),
(object(), "No valid specification of the columns"),
],
)
def test_get_column_indices_error(key, err_msg):
pd = pytest.importorskip("pandas")
X_df = pd.DataFrame(X_toy, columns=["col_0", "col_1", "col_2"])
with pytest.raises(ValueError, match=err_msg):
_get_column_indices(X_df, key)
@pytest.mark.parametrize(
"key", [["col1"], ["col2"], ["col1", "col2"], ["col1", "col3"], ["col2", "col3"]]
)
def test_get_column_indices_pandas_nonunique_columns_error(key):
pd = pytest.importorskip("pandas")
toy = np.zeros((1, 5), dtype=int)
columns = ["col1", "col1", "col2", "col3", "col2"]
X = pd.DataFrame(toy, columns=columns)
err_msg = "Selected columns, {}, are not unique in dataframe".format(key)
with pytest.raises(ValueError) as exc_info:
_get_column_indices(X, key)
assert str(exc_info.value) == err_msg
def test_get_column_indices_interchange():
"""Check _get_column_indices for edge cases with the interchange"""
pd = pytest.importorskip("pandas", minversion="1.5")
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"])
# Hide the fact that this is a pandas dataframe to trigger the dataframe protocol
# code path.
class MockDataFrame:
def __init__(self, df):
self._df = df
def __getattr__(self, name):
return getattr(self._df, name)
df_mocked = MockDataFrame(df)
key_results = [
(slice(1, None), [1, 2]),
(slice(None, 2), [0, 1]),
(slice(1, 2), [1]),
(["b", "c"], [1, 2]),
(slice("a", "b"), [0, 1]),
(slice("a", None), [0, 1, 2]),
(slice(None, "a"), [0]),
(["c", "a"], [2, 0]),
([], []),
]
for key, result in key_results:
assert _get_column_indices(df_mocked, key) == result
msg = "A given column is not a column of the dataframe"
with pytest.raises(ValueError, match=msg):
_get_column_indices(df_mocked, ["not_a_column"])
msg = "key.step must be 1 or None"
with pytest.raises(NotImplementedError, match=msg):
_get_column_indices(df_mocked, slice("a", None, 2))
def test_resample():
# Border case not worth mentioning in doctests
assert resample() is None
# Check that invalid arguments yield ValueError
with pytest.raises(ValueError):
resample([0], [0, 1])
with pytest.raises(ValueError):
resample([0, 1], [0, 1], replace=False, n_samples=3)
# Issue:6581, n_samples can be more when replace is True (default).
assert len(resample([1, 2], n_samples=5)) == 5
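# A usage sketch (not part of the original tests): resample draws a bootstrap
# sample (with replacement by default) using the same row selection for every
# array passed in.
def example_resample():
    X = np.arange(10).reshape(5, 2)
    y = np.array([0, 0, 1, 1, 1])
    X_bs, y_bs = resample(X, y, n_samples=4, random_state=0)
    return X_bs.shape, y_bs.shape  # (4, 2), (4,)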
def test_resample_stratified():
# Make sure resample can stratify
rng = np.random.RandomState(0)
n_samples = 100
p = 0.9
X = rng.normal(size=(n_samples, 1))
y = rng.binomial(1, p, size=n_samples)
_, y_not_stratified = resample(X, y, n_samples=10, random_state=0, stratify=None)
assert np.all(y_not_stratified == 1)
_, y_stratified = resample(X, y, n_samples=10, random_state=0, stratify=y)
assert not np.all(y_stratified == 1)
    assert np.sum(y_stratified) == 9  # nine 1s and one 0
def test_resample_stratified_replace():
# Make sure stratified resampling supports the replace parameter
rng = np.random.RandomState(0)
n_samples = 100
X = rng.normal(size=(n_samples, 1))
y = rng.randint(0, 2, size=n_samples)
X_replace, _ = resample(
X, y, replace=True, n_samples=50, random_state=rng, stratify=y
)
X_no_replace, _ = resample(
X, y, replace=False, n_samples=50, random_state=rng, stratify=y
)
assert np.unique(X_replace).shape[0] < 50
assert np.unique(X_no_replace).shape[0] == 50
# make sure n_samples can be greater than X.shape[0] if we sample with
# replacement
X_replace, _ = resample(
X, y, replace=True, n_samples=1000, random_state=rng, stratify=y
)
assert X_replace.shape[0] == 1000
assert np.unique(X_replace).shape[0] == 100
def test_resample_stratify_2dy():
# Make sure y can be 2d when stratifying
rng = np.random.RandomState(0)
n_samples = 100
X = rng.normal(size=(n_samples, 1))
y = rng.randint(0, 2, size=(n_samples, 2))
X, y = resample(X, y, n_samples=50, random_state=rng, stratify=y)
assert y.ndim == 2
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_resample_stratify_sparse_error(csr_container):
    # the stratify argument must be an ndarray, not a sparse matrix
rng = np.random.RandomState(0)
n_samples = 100
X = rng.normal(size=(n_samples, 2))
y = rng.randint(0, 2, size=n_samples)
stratify = csr_container(y.reshape(-1, 1))
with pytest.raises(TypeError, match="Sparse data was passed"):
X, y = resample(X, y, n_samples=50, random_state=rng, stratify=stratify)
def test_shuffle_on_ndim_equals_three():
def to_tuple(A): # to make the inner arrays hashable
return tuple(tuple(tuple(C) for C in B) for B in A)
A = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) # A.shape = (2,2,2)
S = set(to_tuple(A))
shuffle(A) # shouldn't raise a ValueError for dim = 3
assert set(to_tuple(A)) == S
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_shuffle_dont_convert_to_array(csc_container):
    # Check that shuffle does not try to convert inputs to numpy arrays with
    # float dtypes and lets any indexable data structure pass through.
a = ["a", "b", "c"]
b = np.array(["a", "b", "c"], dtype=object)
c = [1, 2, 3]
d = MockDataFrame(np.array([["a", 0], ["b", 1], ["c", 2]], dtype=object))
e = csc_container(np.arange(6).reshape(3, 2))
a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0)
assert a_s == ["c", "b", "a"]
assert type(a_s) == list # noqa: E721
assert_array_equal(b_s, ["c", "b", "a"])
assert b_s.dtype == object
assert c_s == [3, 2, 1]
assert type(c_s) == list # noqa: E721
assert_array_equal(d_s, np.array([["c", 2], ["b", 1], ["a", 0]], dtype=object))
assert type(d_s) == MockDataFrame # noqa: E721
assert_array_equal(e_s.toarray(), np.array([[4, 5], [2, 3], [0, 1]]))
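# A short sketch (not part of the original tests): shuffle applies one
# consistent permutation to all inputs and, as asserted above, preserves each
# container's type instead of coercing everything to an ndarray.
def example_shuffle():
    letters, numbers = shuffle(["a", "b", "c"], [1, 2, 3], random_state=0)
    return letters, numbers  # same permutation applied to both lists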


@@ -0,0 +1,19 @@
import pytest
from sklearn.utils._mask import safe_mask
from sklearn.utils.fixes import CSR_CONTAINERS
from sklearn.utils.validation import check_random_state
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_safe_mask(csr_container):
random_state = check_random_state(0)
X = random_state.rand(5, 4)
X_csr = csr_container(X)
mask = [False, False, True, True, True]
mask = safe_mask(X, mask)
assert X[mask].shape[0] == 3
mask = safe_mask(X_csr, mask)
assert X_csr[mask].shape[0] == 3
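# A usage sketch (not part of the original tests): safe_mask returns the
# boolean mask unchanged for dense arrays but converts it to integer indices
# for sparse inputs, which cannot always be indexed by a boolean mask.
def example_safe_mask(csr_container):
    import numpy as np  # local import keeps the sketch self-contained

    mask = np.array([False, True, False, True, True])
    X = check_random_state(0).rand(5, 4)
    return safe_mask(X, mask), safe_mask(csr_container(X), mask)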


@@ -0,0 +1,63 @@
import pickle
import pytest
from sklearn.utils.metaestimators import available_if
class AvailableParameterEstimator:
"""This estimator's `available` parameter toggles the presence of a method"""
def __init__(self, available=True, return_value=1):
self.available = available
self.return_value = return_value
@available_if(lambda est: est.available)
def available_func(self):
"""This is a mock available_if function"""
return self.return_value
def test_available_if_docstring():
assert "This is a mock available_if function" in str(
AvailableParameterEstimator.__dict__["available_func"].__doc__
)
assert "This is a mock available_if function" in str(
AvailableParameterEstimator.available_func.__doc__
)
assert "This is a mock available_if function" in str(
AvailableParameterEstimator().available_func.__doc__
)
def test_available_if():
assert hasattr(AvailableParameterEstimator(), "available_func")
assert not hasattr(AvailableParameterEstimator(available=False), "available_func")
def test_available_if_unbound_method():
# This is a non regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/20614
# to make sure that decorated functions can be used as an unbound method,
# for instance when monkeypatching.
est = AvailableParameterEstimator()
AvailableParameterEstimator.available_func(est)
est = AvailableParameterEstimator(available=False)
with pytest.raises(
AttributeError,
match="This 'AvailableParameterEstimator' has no attribute 'available_func'",
):
AvailableParameterEstimator.available_func(est)
def test_available_if_methods_can_be_pickled():
"""Check that available_if methods can be pickled.
Non-regression test for #21344.
"""
return_value = 10
est = AvailableParameterEstimator(available=True, return_value=return_value)
pickled_bytes = pickle.dumps(est.available_func)
unpickled_func = pickle.loads(pickled_bytes)
assert unpickled_func() == return_value
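# A usage sketch (not part of the original tests): available_if is how
# meta-estimators expose a method only when the wrapped estimator supports it,
# so plain hasattr() checks can be used for capability detection.
class DelegatingWrapper:
    def __init__(self, inner):
        self.inner = inner

    @available_if(lambda self: hasattr(self.inner, "predict_proba"))
    def predict_proba(self, X):
        return self.inner.predict_proba(X)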


@@ -0,0 +1,27 @@
import numpy as np
import pytest
from sklearn.utils._missing import is_scalar_nan
@pytest.mark.parametrize(
"value, result",
[
(float("nan"), True),
(np.nan, True),
(float(np.nan), True),
(np.float32(np.nan), True),
(np.float64(np.nan), True),
(0, False),
(0.0, False),
(None, False),
("", False),
("nan", False),
([np.nan], False),
(9867966753463435747313673, False), # Python int that overflows with C type
],
)
def test_is_scalar_nan(value, result):
assert is_scalar_nan(value) is result
# make sure that we are returning a Python bool
assert isinstance(is_scalar_nan(value), bool)
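# A recap sketch (not part of the original tests): the helper behaves roughly
# like the predicate below, a non-integral real number whose value is NaN,
# which is why None, strings, large Python ints and [np.nan] all map to False.
def example_is_scalar_nan(value):
    import math
    import numbers

    return (
        not isinstance(value, numbers.Integral)
        and isinstance(value, numbers.Real)
        and math.isnan(value)
    )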


@@ -0,0 +1,205 @@
import numpy as np
import pytest
from numpy.testing import assert_array_equal
from scipy import sparse
from sklearn.datasets import load_iris
from sklearn.utils import _safe_indexing, check_array
from sklearn.utils._mocking import (
CheckingClassifier,
_MockEstimatorOnOffPrediction,
)
from sklearn.utils._testing import _convert_container
from sklearn.utils.fixes import CSR_CONTAINERS
@pytest.fixture
def iris():
return load_iris(return_X_y=True)
def _success(x):
return True
def _fail(x):
return False
@pytest.mark.parametrize(
"kwargs",
[
{},
{"check_X": _success},
{"check_y": _success},
{"check_X": _success, "check_y": _success},
],
)
def test_check_on_fit_success(iris, kwargs):
X, y = iris
CheckingClassifier(**kwargs).fit(X, y)
@pytest.mark.parametrize(
"kwargs",
[
{"check_X": _fail},
{"check_y": _fail},
{"check_X": _success, "check_y": _fail},
{"check_X": _fail, "check_y": _success},
{"check_X": _fail, "check_y": _fail},
],
)
def test_check_on_fit_fail(iris, kwargs):
X, y = iris
clf = CheckingClassifier(**kwargs)
with pytest.raises(AssertionError):
clf.fit(X, y)
@pytest.mark.parametrize(
"pred_func", ["predict", "predict_proba", "decision_function", "score"]
)
def test_check_X_on_predict_success(iris, pred_func):
X, y = iris
clf = CheckingClassifier(check_X=_success).fit(X, y)
getattr(clf, pred_func)(X)
@pytest.mark.parametrize(
"pred_func", ["predict", "predict_proba", "decision_function", "score"]
)
def test_check_X_on_predict_fail(iris, pred_func):
X, y = iris
clf = CheckingClassifier(check_X=_success).fit(X, y)
clf.set_params(check_X=_fail)
with pytest.raises(AssertionError):
getattr(clf, pred_func)(X)
@pytest.mark.parametrize("input_type", ["list", "array", "sparse", "dataframe"])
def test_checking_classifier(iris, input_type):
# Check that the CheckingClassifier outputs what we expect
X, y = iris
X = _convert_container(X, input_type)
clf = CheckingClassifier()
clf.fit(X, y)
assert_array_equal(clf.classes_, np.unique(y))
assert len(clf.classes_) == 3
assert clf.n_features_in_ == 4
y_pred = clf.predict(X)
assert all(pred in clf.classes_ for pred in y_pred)
assert clf.score(X) == pytest.approx(0)
clf.set_params(foo_param=10)
assert clf.fit(X, y).score(X) == pytest.approx(1)
y_proba = clf.predict_proba(X)
assert y_proba.shape == (150, 3)
assert np.logical_and(y_proba >= 0, y_proba <= 1).all()
y_decision = clf.decision_function(X)
assert y_decision.shape == (150, 3)
# check the shape in case of binary classification
first_2_classes = np.logical_or(y == 0, y == 1)
X = _safe_indexing(X, first_2_classes)
y = _safe_indexing(y, first_2_classes)
clf.fit(X, y)
y_proba = clf.predict_proba(X)
assert y_proba.shape == (100, 2)
assert np.logical_and(y_proba >= 0, y_proba <= 1).all()
y_decision = clf.decision_function(X)
assert y_decision.shape == (100,)
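# A usage sketch (not part of the original tests): CheckingClassifier runs
# user-supplied callbacks on whatever data reaches fit/predict, which is how
# these tests observe what meta-estimators pass down to their sub-estimators.
def example_checking_classifier():
    clf = CheckingClassifier(check_X=lambda X: len(X) > 0)
    clf.fit([[0], [1]], [0, 1])  # check_X returned True, so fit succeeds
    return clf.predict([[0]])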
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_checking_classifier_with_params(iris, csr_container):
X, y = iris
X_sparse = csr_container(X)
clf = CheckingClassifier(check_X=sparse.issparse)
with pytest.raises(AssertionError):
clf.fit(X, y)
clf.fit(X_sparse, y)
clf = CheckingClassifier(
check_X=check_array, check_X_params={"accept_sparse": False}
)
clf.fit(X, y)
with pytest.raises(TypeError, match="Sparse data was passed"):
clf.fit(X_sparse, y)
def test_checking_classifier_fit_params(iris):
# check the error raised when the number of samples is not the one expected
X, y = iris
clf = CheckingClassifier(expected_sample_weight=True)
sample_weight = np.ones(len(X) // 2)
msg = f"sample_weight.shape == ({len(X) // 2},), expected ({len(X)},)!"
with pytest.raises(ValueError) as exc:
clf.fit(X, y, sample_weight=sample_weight)
assert exc.value.args[0] == msg
def test_checking_classifier_missing_fit_params(iris):
X, y = iris
clf = CheckingClassifier(expected_sample_weight=True)
err_msg = "Expected sample_weight to be passed"
with pytest.raises(AssertionError, match=err_msg):
clf.fit(X, y)
@pytest.mark.parametrize(
"methods_to_check",
[["predict"], ["predict", "predict_proba"]],
)
@pytest.mark.parametrize(
"predict_method", ["predict", "predict_proba", "decision_function", "score"]
)
def test_checking_classifier_methods_to_check(iris, methods_to_check, predict_method):
    # check that methods_to_check allows bypassing the checks
X, y = iris
clf = CheckingClassifier(
check_X=sparse.issparse,
methods_to_check=methods_to_check,
)
clf.fit(X, y)
if predict_method in methods_to_check:
with pytest.raises(AssertionError):
getattr(clf, predict_method)(X)
else:
getattr(clf, predict_method)(X)
@pytest.mark.parametrize(
"response_methods",
[
["predict"],
["predict", "predict_proba"],
["predict", "decision_function"],
["predict", "predict_proba", "decision_function"],
],
)
def test_mock_estimator_on_off_prediction(iris, response_methods):
X, y = iris
estimator = _MockEstimatorOnOffPrediction(response_methods=response_methods)
estimator.fit(X, y)
assert hasattr(estimator, "classes_")
assert_array_equal(estimator.classes_, np.unique(y))
possible_responses = ["predict", "predict_proba", "decision_function"]
for response in possible_responses:
if response in response_methods:
assert hasattr(estimator, response)
assert getattr(estimator, response)(X) == response
else:
assert not hasattr(estimator, response)


@@ -0,0 +1,613 @@
from itertools import product
import numpy as np
import pytest
from scipy.sparse import issparse
from sklearn import config_context, datasets
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVC
from sklearn.utils._array_api import yield_namespace_device_dtype_combinations
from sklearn.utils._testing import (
_array_api_for_tests,
_convert_container,
assert_allclose,
assert_array_almost_equal,
assert_array_equal,
)
from sklearn.utils.estimator_checks import _NotAnArray
from sklearn.utils.fixes import (
COO_CONTAINERS,
CSC_CONTAINERS,
CSR_CONTAINERS,
DOK_CONTAINERS,
LIL_CONTAINERS,
)
from sklearn.utils.metaestimators import _safe_split
from sklearn.utils.multiclass import (
_ovr_decision_function,
check_classification_targets,
class_distribution,
is_multilabel,
type_of_target,
unique_labels,
)
multilabel_explicit_zero = np.array([[0, 1], [1, 0]])
multilabel_explicit_zero[:, 0] = 0
def _generate_sparse(
data,
sparse_containers=tuple(
COO_CONTAINERS
+ CSC_CONTAINERS
+ CSR_CONTAINERS
+ DOK_CONTAINERS
+ LIL_CONTAINERS
),
dtypes=(bool, int, np.int8, np.uint8, float, np.float32),
):
return [
sparse_container(data, dtype=dtype)
for sparse_container in sparse_containers
for dtype in dtypes
]
EXAMPLES = {
"multilabel-indicator": [
# valid when the data is formatted as sparse or dense, identified
# by CSR format when the testing takes place
*_generate_sparse(
np.random.RandomState(42).randint(2, size=(10, 10)),
sparse_containers=CSR_CONTAINERS,
dtypes=(int,),
),
[[0, 1], [1, 0]],
[[0, 1]],
*_generate_sparse(
multilabel_explicit_zero, sparse_containers=CSC_CONTAINERS, dtypes=(int,)
),
*_generate_sparse([[0, 1], [1, 0]]),
*_generate_sparse([[0, 0], [0, 0]]),
*_generate_sparse([[0, 1]]),
# Only valid when data is dense
[[-1, 1], [1, -1]],
np.array([[-1, 1], [1, -1]]),
np.array([[-3, 3], [3, -3]]),
_NotAnArray(np.array([[-3, 3], [3, -3]])),
],
"multiclass": [
[1, 0, 2, 2, 1, 4, 2, 4, 4, 4],
np.array([1, 0, 2]),
np.array([1, 0, 2], dtype=np.int8),
np.array([1, 0, 2], dtype=np.uint8),
np.array([1, 0, 2], dtype=float),
np.array([1, 0, 2], dtype=np.float32),
np.array([[1], [0], [2]]),
_NotAnArray(np.array([1, 0, 2])),
[0, 1, 2],
["a", "b", "c"],
np.array(["a", "b", "c"]),
np.array(["a", "b", "c"], dtype=object),
np.array(["a", "b", "c"], dtype=object),
],
"multiclass-multioutput": [
[[1, 0, 2, 2], [1, 4, 2, 4]],
[["a", "b"], ["c", "d"]],
np.array([[1, 0, 2, 2], [1, 4, 2, 4]]),
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8),
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8),
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float),
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32),
*_generate_sparse(
[[1, 0, 2, 2], [1, 4, 2, 4]],
sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS,
dtypes=(int, np.int8, np.uint8, float, np.float32),
),
np.array([["a", "b"], ["c", "d"]]),
np.array([["a", "b"], ["c", "d"]]),
np.array([["a", "b"], ["c", "d"]], dtype=object),
np.array([[1, 0, 2]]),
_NotAnArray(np.array([[1, 0, 2]])),
],
"binary": [
[0, 1],
[1, 1],
[],
[0],
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=bool),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=float),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32),
np.array([[0], [1]]),
_NotAnArray(np.array([[0], [1]])),
[1, -1],
[3, 5],
["a"],
["a", "b"],
["abc", "def"],
np.array(["abc", "def"]),
["a", "b"],
np.array(["abc", "def"], dtype=object),
],
"continuous": [
[1e-5],
[0, 0.5],
np.array([[0], [0.5]]),
np.array([[0], [0.5]], dtype=np.float32),
],
"continuous-multioutput": [
np.array([[0, 0.5], [0.5, 0]]),
np.array([[0, 0.5], [0.5, 0]], dtype=np.float32),
np.array([[0, 0.5]]),
*_generate_sparse(
[[0, 0.5], [0.5, 0]],
sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS,
dtypes=(float, np.float32),
),
*_generate_sparse(
[[0, 0.5]],
sparse_containers=CSC_CONTAINERS + CSR_CONTAINERS,
dtypes=(float, np.float32),
),
],
"unknown": [
[[]],
np.array([[]], dtype=object),
[()],
# sequence of sequences that weren't supported even before deprecation
np.array([np.array([]), np.array([1, 2, 3])], dtype=object),
[np.array([]), np.array([1, 2, 3])],
[{1, 2, 3}, {1, 2}],
[frozenset([1, 2, 3]), frozenset([1, 2])],
# and also confusable as sequences of sequences
[{0: "a", 1: "b"}, {0: "a"}],
# ndim 0
np.array(0),
# empty second dimension
np.array([[], []]),
# 3d
np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]),
],
}
ARRAY_API_EXAMPLES = {
"multilabel-indicator": [
np.random.RandomState(42).randint(2, size=(10, 10)),
[[0, 1], [1, 0]],
[[0, 1]],
multilabel_explicit_zero,
[[0, 0], [0, 0]],
[[-1, 1], [1, -1]],
np.array([[-1, 1], [1, -1]]),
np.array([[-3, 3], [3, -3]]),
_NotAnArray(np.array([[-3, 3], [3, -3]])),
],
"multiclass": [
[1, 0, 2, 2, 1, 4, 2, 4, 4, 4],
np.array([1, 0, 2]),
np.array([1, 0, 2], dtype=np.int8),
np.array([1, 0, 2], dtype=np.uint8),
np.array([1, 0, 2], dtype=float),
np.array([1, 0, 2], dtype=np.float32),
np.array([[1], [0], [2]]),
_NotAnArray(np.array([1, 0, 2])),
[0, 1, 2],
],
"multiclass-multioutput": [
[[1, 0, 2, 2], [1, 4, 2, 4]],
np.array([[1, 0, 2, 2], [1, 4, 2, 4]]),
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.int8),
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.uint8),
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=float),
np.array([[1, 0, 2, 2], [1, 4, 2, 4]], dtype=np.float32),
np.array([[1, 0, 2]]),
_NotAnArray(np.array([[1, 0, 2]])),
],
"binary": [
[0, 1],
[1, 1],
[],
[0],
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1]),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=bool),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.int8),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.uint8),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=float),
np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1], dtype=np.float32),
np.array([[0], [1]]),
_NotAnArray(np.array([[0], [1]])),
[1, -1],
[3, 5],
],
"continuous": [
[1e-5],
[0, 0.5],
np.array([[0], [0.5]]),
np.array([[0], [0.5]], dtype=np.float32),
],
"continuous-multioutput": [
np.array([[0, 0.5], [0.5, 0]]),
np.array([[0, 0.5], [0.5, 0]], dtype=np.float32),
np.array([[0, 0.5]]),
],
"unknown": [
[[]],
[()],
np.array(0),
np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]]),
],
}
NON_ARRAY_LIKE_EXAMPLES = [
{1, 2, 3},
{0: "a", 1: "b"},
{0: [5], 1: [5]},
"abc",
frozenset([1, 2, 3]),
None,
]
MULTILABEL_SEQUENCES = [
[[1], [2], [0, 1]],
    [(), (2,), (0, 1)],
np.array([[], [1, 2]], dtype="object"),
_NotAnArray(np.array([[], [1, 2]], dtype="object")),
]
def test_unique_labels():
# Empty iterable
with pytest.raises(ValueError):
unique_labels()
# Multiclass problem
assert_array_equal(unique_labels(range(10)), np.arange(10))
assert_array_equal(unique_labels(np.arange(10)), np.arange(10))
assert_array_equal(unique_labels([4, 0, 2]), np.array([0, 2, 4]))
# Multilabel indicator
assert_array_equal(
unique_labels(np.array([[0, 0, 1], [1, 0, 1], [0, 0, 0]])), np.arange(3)
)
assert_array_equal(unique_labels(np.array([[0, 0, 1], [0, 0, 0]])), np.arange(3))
# Several arrays passed
assert_array_equal(unique_labels([4, 0, 2], range(5)), np.arange(5))
assert_array_equal(unique_labels((0, 1, 2), (0,), (2, 1)), np.arange(3))
# Border line case with binary indicator matrix
with pytest.raises(ValueError):
unique_labels([4, 0, 2], np.ones((5, 5)))
with pytest.raises(ValueError):
unique_labels(np.ones((5, 4)), np.ones((5, 5)))
assert_array_equal(unique_labels(np.ones((4, 5)), np.ones((5, 5))), np.arange(5))
def test_unique_labels_non_specific():
# Test unique_labels with a variety of collected examples
    # Smoke test for all supported formats
for format in ["binary", "multiclass", "multilabel-indicator"]:
for y in EXAMPLES[format]:
unique_labels(y)
    # We don't support those formats at the moment
for example in NON_ARRAY_LIKE_EXAMPLES:
with pytest.raises(ValueError):
unique_labels(example)
for y_type in [
"unknown",
"continuous",
"continuous-multioutput",
"multiclass-multioutput",
]:
for example in EXAMPLES[y_type]:
with pytest.raises(ValueError):
unique_labels(example)
def test_unique_labels_mixed_types():
# Mix with binary or multiclass and multilabel
mix_clf_format = product(
EXAMPLES["multilabel-indicator"], EXAMPLES["multiclass"] + EXAMPLES["binary"]
)
for y_multilabel, y_multiclass in mix_clf_format:
with pytest.raises(ValueError):
unique_labels(y_multiclass, y_multilabel)
with pytest.raises(ValueError):
unique_labels(y_multilabel, y_multiclass)
with pytest.raises(ValueError):
unique_labels([[1, 2]], [["a", "d"]])
with pytest.raises(ValueError):
unique_labels(["1", 2])
with pytest.raises(ValueError):
unique_labels([["1", 2], [1, 3]])
with pytest.raises(ValueError):
unique_labels([["1", "2"], [2, 3]])
def test_is_multilabel():
for group, group_examples in EXAMPLES.items():
dense_exp = group == "multilabel-indicator"
for example in group_examples:
# Only mark explicitly defined sparse examples as valid sparse
# multilabel-indicators
sparse_exp = dense_exp and issparse(example)
if issparse(example) or (
hasattr(example, "__array__")
and np.asarray(example).ndim == 2
and np.asarray(example).dtype.kind in "biuf"
and np.asarray(example).shape[1] > 0
):
examples_sparse = [
sparse_container(example)
for sparse_container in (
COO_CONTAINERS
+ CSC_CONTAINERS
+ CSR_CONTAINERS
+ DOK_CONTAINERS
+ LIL_CONTAINERS
)
]
for exmpl_sparse in examples_sparse:
assert sparse_exp == is_multilabel(
exmpl_sparse
), f"is_multilabel({exmpl_sparse!r}) should be {sparse_exp}"
# Densify sparse examples before testing
if issparse(example):
example = example.toarray()
assert dense_exp == is_multilabel(
example
), f"is_multilabel({example!r}) should be {dense_exp}"
@pytest.mark.parametrize(
"array_namespace, device, dtype_name",
yield_namespace_device_dtype_combinations(),
)
def test_is_multilabel_array_api_compliance(array_namespace, device, dtype_name):
xp = _array_api_for_tests(array_namespace, device)
for group, group_examples in ARRAY_API_EXAMPLES.items():
dense_exp = group == "multilabel-indicator"
for example in group_examples:
if np.asarray(example).dtype.kind == "f":
example = np.asarray(example, dtype=dtype_name)
else:
example = np.asarray(example)
example = xp.asarray(example, device=device)
with config_context(array_api_dispatch=True):
assert dense_exp == is_multilabel(
example
), f"is_multilabel({example!r}) should be {dense_exp}"
def test_check_classification_targets():
for y_type in EXAMPLES.keys():
if y_type in ["unknown", "continuous", "continuous-multioutput"]:
for example in EXAMPLES[y_type]:
msg = "Unknown label type: "
with pytest.raises(ValueError, match=msg):
check_classification_targets(example)
else:
for example in EXAMPLES[y_type]:
check_classification_targets(example)
def test_type_of_target():
for group, group_examples in EXAMPLES.items():
for example in group_examples:
assert (
type_of_target(example) == group
), "type_of_target(%r) should be %r, got %r" % (
example,
group,
type_of_target(example),
)
for example in NON_ARRAY_LIKE_EXAMPLES:
msg_regex = r"Expected array-like \(array or non-string sequence\).*"
with pytest.raises(ValueError, match=msg_regex):
type_of_target(example)
for example in MULTILABEL_SEQUENCES:
msg = (
"You appear to be using a legacy multi-label data "
"representation. Sequence of sequences are no longer supported;"
" use a binary array or sparse matrix instead."
)
with pytest.raises(ValueError, match=msg):
type_of_target(example)
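def test_type_of_target_usage_sketch():
    # Editor's illustrative sketch (assumes this file's module-level imports):
    # the most common target kinds and the strings type_of_target maps them to.
    assert type_of_target([0, 1, 1, 0]) == "binary"
    assert type_of_target([0, 1, 2]) == "multiclass"
    assert type_of_target([0.5, 1.5]) == "continuous"
    assert type_of_target(np.array([[1, 0], [0, 1]])) == "multilabel-indicator"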
def test_type_of_target_pandas_sparse():
pd = pytest.importorskip("pandas")
y = pd.arrays.SparseArray([1, np.nan, np.nan, 1, np.nan])
msg = "y cannot be class 'SparseSeries' or 'SparseArray'"
with pytest.raises(ValueError, match=msg):
type_of_target(y)
def test_type_of_target_pandas_nullable():
"""Check that type_of_target works with pandas nullable dtypes."""
pd = pytest.importorskip("pandas")
for dtype in ["Int32", "Float32"]:
y_true = pd.Series([1, 0, 2, 3, 4], dtype=dtype)
assert type_of_target(y_true) == "multiclass"
y_true = pd.Series([1, 0, 1, 0], dtype=dtype)
assert type_of_target(y_true) == "binary"
y_true = pd.DataFrame([[1.4, 3.1], [3.1, 1.4]], dtype="Float32")
assert type_of_target(y_true) == "continuous-multioutput"
y_true = pd.DataFrame([[0, 1], [1, 1]], dtype="Int32")
assert type_of_target(y_true) == "multilabel-indicator"
y_true = pd.DataFrame([[1, 2], [3, 1]], dtype="Int32")
assert type_of_target(y_true) == "multiclass-multioutput"
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
def test_unique_labels_pandas_nullable(dtype):
"""Checks that unique_labels work with pandas nullable dtypes.
Non-regression test for gh-25634.
"""
pd = pytest.importorskip("pandas")
y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype)
y_predicted = pd.Series([0, 0, 1, 1, 0, 1, 1, 1, 1], dtype="int64")
labels = unique_labels(y_true, y_predicted)
assert_array_equal(labels, [0, 1])
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
def test_class_distribution(csc_container):
y = np.array(
[
[1, 0, 0, 1],
[2, 2, 0, 1],
[1, 3, 0, 1],
[4, 2, 0, 1],
[2, 0, 0, 1],
[1, 3, 0, 1],
]
)
# Define the sparse matrix with a mix of implicit and explicit zeros
data = np.array([1, 2, 1, 4, 2, 1, 0, 2, 3, 2, 3, 1, 1, 1, 1, 1, 1])
indices = np.array([0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 5, 0, 1, 2, 3, 4, 5])
indptr = np.array([0, 6, 11, 11, 17])
y_sp = csc_container((data, indices, indptr), shape=(6, 4))
classes, n_classes, class_prior = class_distribution(y)
classes_sp, n_classes_sp, class_prior_sp = class_distribution(y_sp)
classes_expected = [[1, 2, 4], [0, 2, 3], [0], [1]]
n_classes_expected = [3, 3, 1, 1]
class_prior_expected = [[3 / 6, 2 / 6, 1 / 6], [1 / 3, 1 / 3, 1 / 3], [1.0], [1.0]]
for k in range(y.shape[1]):
assert_array_almost_equal(classes[k], classes_expected[k])
assert_array_almost_equal(n_classes[k], n_classes_expected[k])
assert_array_almost_equal(class_prior[k], class_prior_expected[k])
assert_array_almost_equal(classes_sp[k], classes_expected[k])
assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k])
assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k])
# Test again with explicit sample weights
(classes, n_classes, class_prior) = class_distribution(
y, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0]
)
(classes_sp, n_classes_sp, class_prior_sp) = class_distribution(
        y_sp, [1.0, 2.0, 1.0, 2.0, 1.0, 2.0]
)
class_prior_expected = [[4 / 9, 3 / 9, 2 / 9], [2 / 9, 4 / 9, 3 / 9], [1.0], [1.0]]
for k in range(y.shape[1]):
assert_array_almost_equal(classes[k], classes_expected[k])
assert_array_almost_equal(n_classes[k], n_classes_expected[k])
assert_array_almost_equal(class_prior[k], class_prior_expected[k])
assert_array_almost_equal(classes_sp[k], classes_expected[k])
assert_array_almost_equal(n_classes_sp[k], n_classes_expected[k])
assert_array_almost_equal(class_prior_sp[k], class_prior_expected[k])
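def test_class_distribution_worked_example_sketch():
    # Editor's worked example for the weighted case above: in column 0,
    # y[:, 0] == [1, 2, 1, 4, 2, 1] with sample weights [1, 2, 1, 2, 1, 2],
    # class 1 accumulates weight 1 + 1 + 2 = 4, class 2 gets 2 + 1 = 3 and
    # class 4 gets 2, hence the expected prior [4/9, 3/9, 2/9].
    y_col = np.array([1, 2, 1, 4, 2, 1])
    weights = np.array([1.0, 2.0, 1.0, 2.0, 1.0, 2.0])
    prior = np.array([weights[y_col == c].sum() for c in (1, 2, 4)])
    prior /= weights.sum()
    assert_array_almost_equal(prior, [4 / 9, 3 / 9, 2 / 9])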
def test_safe_split_with_precomputed_kernel():
clf = SVC()
clfp = SVC(kernel="precomputed")
iris = datasets.load_iris()
X, y = iris.data, iris.target
K = np.dot(X, X.T)
cv = ShuffleSplit(test_size=0.25, random_state=0)
train, test = list(cv.split(X))[0]
X_train, y_train = _safe_split(clf, X, y, train)
K_train, y_train2 = _safe_split(clfp, K, y, train)
assert_array_almost_equal(K_train, np.dot(X_train, X_train.T))
assert_array_almost_equal(y_train, y_train2)
X_test, y_test = _safe_split(clf, X, y, test, train)
K_test, y_test2 = _safe_split(clfp, K, y, test, train)
assert_array_almost_equal(K_test, np.dot(X_test, X_train.T))
assert_array_almost_equal(y_test, y_test2)
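def test_safe_split_precomputed_kernel_sketch():
    # Editor's illustrative sketch: with kernel="precomputed", _safe_split
    # slices the Gram matrix on both axes (rows = evaluation indices,
    # columns = training indices), so the result should equal
    # K[np.ix_(indices, train_indices)].
    X = np.arange(12, dtype=float).reshape(6, 2)
    K = X @ X.T
    train, test = np.array([0, 1, 2, 3]), np.array([4, 5])
    K_test, _ = _safe_split(SVC(kernel="precomputed"), K, None, test, train)
    assert_array_almost_equal(K_test, K[np.ix_(test, train)])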
def test_ovr_decision_function():
# test properties for ovr decision function
predictions = np.array([[0, 1, 1], [0, 1, 0], [0, 1, 1], [0, 1, 1]])
confidences = np.array(
[[-1e16, 0, -1e16], [1.0, 2.0, -3.0], [-5.0, 2.0, 5.0], [-0.5, 0.2, 0.5]]
)
n_classes = 3
dec_values = _ovr_decision_function(predictions, confidences, n_classes)
    # check that the decision values are within 0.5 of the votes
votes = np.array([[1, 0, 2], [1, 1, 1], [1, 0, 2], [1, 0, 2]])
assert_allclose(votes, dec_values, atol=0.5)
    # check that the predictions are what we expect:
    # the highest vote, or the highest confidence in case of a tie.
    # For the second sample there is a tie (it should be won by class 1).
expected_prediction = np.array([2, 1, 2, 2])
assert_array_equal(np.argmax(dec_values, axis=1), expected_prediction)
    # the third and fourth samples have the same votes, but the third sample
    # has higher confidence; this should be reflected in the decision values
assert dec_values[2, 2] > dec_values[3, 2]
# assert subset invariance.
dec_values_one = [
_ovr_decision_function(
np.array([predictions[i]]), np.array([confidences[i]]), n_classes
)[0]
for i in range(4)
]
assert_allclose(dec_values, dec_values_one, atol=1e-6)
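def test_ovr_decision_function_votes_sketch():
    # Editor's worked sketch of the vote bookkeeping behind the test above:
    # with 3 classes there are 3 pairwise contests in the order (0 vs 1),
    # (0 vs 2), (1 vs 2); each column of `predictions` records the winner
    # (0 -> first class of the pair, 1 -> second), and summing wins per class
    # recovers the `votes` array the decision values are compared against.
    predictions = np.array([[0, 1, 1], [0, 1, 0], [0, 1, 1], [0, 1, 1]])
    n_classes = 3
    votes = np.zeros((predictions.shape[0], n_classes))
    k = 0
    for i in range(n_classes):
        for j in range(i + 1, n_classes):
            votes[predictions[:, k] == 0, i] += 1
            votes[predictions[:, k] == 1, j] += 1
            k += 1
    assert_array_equal(votes, [[1, 0, 2], [1, 1, 1], [1, 0, 2], [1, 0, 2]])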
# TODO(1.7): Change to ValueError when byte labels is deprecated.
@pytest.mark.parametrize("input_type", ["list", "array"])
def test_labels_in_bytes_format(input_type):
# check that we raise an error with bytes encoded labels
# non-regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/16980
target = _convert_container([b"a", b"b"], input_type)
err_msg = (
"Support for labels represented as bytes is deprecated in v1.5 and will"
" error in v1.7. Convert the labels to a string or integer format."
)
with pytest.warns(FutureWarning, match=err_msg):
type_of_target(target)

View File

@@ -0,0 +1,74 @@
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#
# License: BSD 3 clause
import numpy as np
from numpy.testing import assert_array_almost_equal, assert_array_equal
from sklearn.utils.murmurhash import murmurhash3_32
def test_mmhash3_int():
assert murmurhash3_32(3) == 847579505
assert murmurhash3_32(3, seed=0) == 847579505
assert murmurhash3_32(3, seed=42) == -1823081949
assert murmurhash3_32(3, positive=False) == 847579505
assert murmurhash3_32(3, seed=0, positive=False) == 847579505
assert murmurhash3_32(3, seed=42, positive=False) == -1823081949
assert murmurhash3_32(3, positive=True) == 847579505
assert murmurhash3_32(3, seed=0, positive=True) == 847579505
assert murmurhash3_32(3, seed=42, positive=True) == 2471885347
def test_mmhash3_int_array():
rng = np.random.RandomState(42)
keys = rng.randint(-5342534, 345345, size=3 * 2 * 1).astype(np.int32)
keys = keys.reshape((3, 2, 1))
for seed in [0, 42]:
expected = np.array([murmurhash3_32(int(k), seed) for k in keys.flat])
expected = expected.reshape(keys.shape)
assert_array_equal(murmurhash3_32(keys, seed), expected)
for seed in [0, 42]:
expected = np.array([murmurhash3_32(k, seed, positive=True) for k in keys.flat])
expected = expected.reshape(keys.shape)
assert_array_equal(murmurhash3_32(keys, seed, positive=True), expected)
def test_mmhash3_bytes():
assert murmurhash3_32(b"foo", 0) == -156908512
assert murmurhash3_32(b"foo", 42) == -1322301282
assert murmurhash3_32(b"foo", 0, positive=True) == 4138058784
assert murmurhash3_32(b"foo", 42, positive=True) == 2972666014
def test_mmhash3_unicode():
assert murmurhash3_32("foo", 0) == -156908512
assert murmurhash3_32("foo", 42) == -1322301282
assert murmurhash3_32("foo", 0, positive=True) == 4138058784
assert murmurhash3_32("foo", 42, positive=True) == 2972666014
def test_no_collision_on_byte_range():
    previous_hashes = set()
    for i in range(100):
        h = murmurhash3_32(" " * i, 0)
        assert h not in previous_hashes, "Found collision on growing whitespace strings"
        previous_hashes.add(h)
def test_uniform_distribution():
n_bins, n_samples = 10, 100000
bins = np.zeros(n_bins, dtype=np.float64)
for i in range(n_samples):
bins[murmurhash3_32(i, positive=True) % n_bins] += 1
means = bins / n_samples
expected = np.full(n_bins, 1.0 / n_bins)
assert_array_almost_equal(means / expected, np.ones(n_bins), 2)
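def test_murmurhash_bucketing_sketch():
    # Editor's illustrative sketch: the positive=True variant maps any key
    # into [0, 2**32), which makes hashing-trick style bucket assignment a
    # simple modulo (the bucket count 8 here is arbitrary).
    n_buckets = 8
    key = "feature=value"
    bucket = murmurhash3_32(key, seed=0, positive=True) % n_buckets
    assert 0 <= bucket < n_buckets
    # same key, same seed -> same bucket (the hash is deterministic)
    assert bucket == murmurhash3_32(key, seed=0, positive=True) % n_buckets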

View File

@@ -0,0 +1,158 @@
import numpy as np
import pytest
from scipy.optimize import fmin_ncg
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils.optimize import _newton_cg
def test_newton_cg():
# Test that newton_cg gives same result as scipy's fmin_ncg
rng = np.random.RandomState(0)
A = rng.normal(size=(10, 10))
x0 = np.ones(10)
def func(x):
Ax = A.dot(x)
return 0.5 * (Ax).dot(Ax)
def grad(x):
return A.T.dot(A.dot(x))
    def hess(x, p):
        # The Hessian of func is the constant matrix A.T @ A, so the
        # Hessian-vector product expected by fmin_ncg is A.T @ (A @ p).
        return A.T.dot(A.dot(p))
def grad_hess(x):
return grad(x), lambda x: A.T.dot(A.dot(x))
assert_array_almost_equal(
_newton_cg(grad_hess, func, grad, x0, tol=1e-10)[0],
fmin_ncg(f=func, x0=x0, fprime=grad, fhess_p=hess),
)
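def test_newton_cg_gradient_sanity_sketch():
    # Editor's sanity sketch for the toy objective above: f(x) = 0.5*||Ax||^2
    # has gradient A.T @ A @ x and constant Hessian A.T @ A, so a central
    # finite difference along a random direction p should match grad(x) . p.
    rng = np.random.RandomState(0)
    A = rng.normal(size=(10, 10))
    x = rng.normal(size=10)
    p = rng.normal(size=10)
    eps = 1e-6

    def func(v):
        Av = A.dot(v)
        return 0.5 * Av.dot(Av)

    fd = (func(x + eps * p) - func(x - eps * p)) / (2 * eps)
    assert_array_almost_equal(fd, A.T.dot(A.dot(x)).dot(p), decimal=4)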
@pytest.mark.parametrize("verbose", [0, 1, 2])
def test_newton_cg_verbosity(capsys, verbose):
"""Test the std output of verbose newton_cg solver."""
A = np.eye(2)
b = np.array([1, 2], dtype=float)
_newton_cg(
grad_hess=lambda x: (A @ x - b, lambda z: A @ z),
func=lambda x: 0.5 * x @ A @ x - b @ x,
grad=lambda x: A @ x - b,
x0=np.zeros(A.shape[0]),
verbose=verbose,
) # returns array([1., 2])
captured = capsys.readouterr()
if verbose == 0:
assert captured.out == ""
else:
msg = [
"Newton-CG iter = 1",
"Check Convergence",
"max |gradient|",
"Solver did converge at loss = ",
]
for m in msg:
assert m in captured.out
if verbose >= 2:
msg = [
"Inner CG solver iteration 1 stopped with",
"sum(|residuals|) <= tol",
"Line Search",
"try line search wolfe1",
"wolfe1 line search was successful",
]
for m in msg:
assert m in captured.out
if verbose >= 2:
        # Set up a badly scaled singular Hessian with a completely wrong
        # starting position. This should trigger the second line-search check.
A = np.array([[1.0, 2], [2, 4]]) * 1e30 # collinear columns
b = np.array([1.0, 2.0])
# Note that scipy.optimize._linesearch LineSearchWarning inherits from
# RuntimeWarning, but we do not want to import from non public APIs.
with pytest.warns(RuntimeWarning):
_newton_cg(
grad_hess=lambda x: (A @ x - b, lambda z: A @ z),
func=lambda x: 0.5 * x @ A @ x - b @ x,
grad=lambda x: A @ x - b,
x0=np.array([-2.0, 1]), # null space of hessian
verbose=verbose,
)
captured = capsys.readouterr()
msg = [
"wolfe1 line search was not successful",
"check loss |improvement| <= eps * |loss_old|:",
"check sum(|gradient|) < sum(|gradient_old|):",
"last resort: try line search wolfe2",
]
for m in msg:
assert m in captured.out
        # Set up a badly conditioned Hessian that leads to tiny curvature.
        # A.T @ A has singular values array([1.00000400e+01, 1.00008192e-11])
A = np.array([[1.0, 2], [1, 2 + 1e-15]])
b = np.array([-2.0, 1])
with pytest.warns(ConvergenceWarning):
_newton_cg(
grad_hess=lambda x: (A @ x - b, lambda z: A @ z),
func=lambda x: 0.5 * x @ A @ x - b @ x,
grad=lambda x: A @ x - b,
x0=b,
verbose=verbose,
maxiter=2,
)
captured = capsys.readouterr()
msg = [
"tiny_|p| = eps * ||p||^2",
]
for m in msg:
assert m in captured.out
# Test for a case with negative Hessian.
# We do not trigger "Inner CG solver iteration {i} stopped with negative
# curvature", but that is very hard to trigger.
A = np.eye(2)
b = np.array([-2.0, 1])
with pytest.warns(RuntimeWarning):
_newton_cg(
# Note the wrong sign in the hessian product.
grad_hess=lambda x: (A @ x - b, lambda z: -A @ z),
func=lambda x: 0.5 * x @ A @ x - b @ x,
grad=lambda x: A @ x - b,
x0=np.array([1.0, 1.0]),
verbose=verbose,
maxiter=3,
)
captured = capsys.readouterr()
msg = [
"Inner CG solver iteration 0 fell back to steepest descent",
]
for m in msg:
assert m in captured.out
A = np.diag([1e-3, 1, 1e3])
b = np.array([-2.0, 1, 2.0])
with pytest.warns(ConvergenceWarning):
_newton_cg(
grad_hess=lambda x: (A @ x - b, lambda z: A @ z),
func=lambda x: 0.5 * x @ A @ x - b @ x,
grad=lambda x: A @ x - b,
x0=np.ones_like(b),
verbose=verbose,
maxiter=2,
maxinner=1,
)
captured = capsys.readouterr()
msg = [
"Inner CG solver stopped reaching maxiter=1",
]
for m in msg:
assert m in captured.out

View File

@@ -0,0 +1,100 @@
import time
import joblib
import numpy as np
import pytest
from numpy.testing import assert_array_equal
from sklearn import config_context, get_config
from sklearn.compose import make_column_transformer
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils.parallel import Parallel, delayed
def get_working_memory():
return get_config()["working_memory"]
@pytest.mark.parametrize("n_jobs", [1, 2])
@pytest.mark.parametrize("backend", ["loky", "threading", "multiprocessing"])
def test_configuration_passes_through_to_joblib(n_jobs, backend):
    # Tests that the global configuration is passed to joblib jobs
with config_context(working_memory=123):
results = Parallel(n_jobs=n_jobs, backend=backend)(
delayed(get_working_memory)() for _ in range(2)
)
assert_array_equal(results, [123] * 2)
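def test_config_propagation_sketch():
    # Editor's illustrative sketch: sklearn's Parallel/delayed pair carries
    # the configuration active when the tasks are created, so every worker
    # sees the value set in the surrounding config_context.
    with config_context(working_memory=456):
        results = Parallel(n_jobs=2, backend="threading")(
            delayed(get_working_memory)() for _ in range(4)
        )
    assert results == [456] * 4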
def test_parallel_delayed_warnings():
"""Informative warnings should be raised when mixing sklearn and joblib API"""
    # We should issue a warning when one wants to use sklearn.utils.parallel.Parallel
    # with joblib.delayed. The config will not be propagated to the workers.
warn_msg = "`sklearn.utils.parallel.Parallel` needs to be used in conjunction"
with pytest.warns(UserWarning, match=warn_msg) as records:
Parallel()(joblib.delayed(time.sleep)(0) for _ in range(10))
assert len(records) == 10
    # We should issue a warning if one wants to use sklearn.utils.parallel.delayed
    # with joblib.Parallel
warn_msg = (
"`sklearn.utils.parallel.delayed` should be used with "
"`sklearn.utils.parallel.Parallel` to make it possible to propagate"
)
with pytest.warns(UserWarning, match=warn_msg) as records:
joblib.Parallel()(delayed(time.sleep)(0) for _ in range(10))
assert len(records) == 10
@pytest.mark.parametrize("n_jobs", [1, 2])
def test_dispatch_config_parallel(n_jobs):
"""Check that we properly dispatch the configuration in parallel processing.
Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/25239
"""
pd = pytest.importorskip("pandas")
iris = load_iris(as_frame=True)
class TransformerRequiredDataFrame(StandardScaler):
def fit(self, X, y=None):
assert isinstance(X, pd.DataFrame), "X should be a DataFrame"
return super().fit(X, y)
def transform(self, X, y=None):
assert isinstance(X, pd.DataFrame), "X should be a DataFrame"
return super().transform(X, y)
dropper = make_column_transformer(
("drop", [0]),
remainder="passthrough",
n_jobs=n_jobs,
)
param_grid = {"randomforestclassifier__max_depth": [1, 2, 3]}
search_cv = GridSearchCV(
make_pipeline(
dropper,
TransformerRequiredDataFrame(),
RandomForestClassifier(n_estimators=5, n_jobs=n_jobs),
),
param_grid,
cv=5,
n_jobs=n_jobs,
error_score="raise", # this search should not fail
)
    # make sure that `fit` fails when we don't request DataFrame output
with pytest.raises(AssertionError, match="X should be a DataFrame"):
search_cv.fit(iris.data, iris.target)
with config_context(transform_output="pandas"):
        # we expect each intermediate step to output a DataFrame
search_cv.fit(iris.data, iris.target)
assert not np.isnan(search_cv.cv_results_["mean_test_score"]).any()

View File

@@ -0,0 +1,785 @@
from numbers import Integral, Real
import numpy as np
import pytest
from scipy.sparse import csr_matrix
from sklearn._config import config_context, get_config
from sklearn.base import BaseEstimator, _fit_context
from sklearn.model_selection import LeaveOneOut
from sklearn.utils import deprecated
from sklearn.utils._param_validation import (
HasMethods,
Hidden,
Interval,
InvalidParameterError,
MissingValues,
Options,
RealNotInt,
StrOptions,
_ArrayLikes,
_Booleans,
_Callables,
_CVObjects,
_InstancesOf,
_IterablesNotString,
_NanConstraint,
_NoneConstraint,
_PandasNAConstraint,
_RandomStates,
_SparseMatrices,
_VerboseHelper,
generate_invalid_param_val,
generate_valid_param,
make_constraint,
validate_params,
)
from sklearn.utils.fixes import CSR_CONTAINERS
# Some helpers for the tests
@validate_params(
{"a": [Real], "b": [Real], "c": [Real], "d": [Real]},
prefer_skip_nested_validation=True,
)
def _func(a, b=0, *args, c, d=0, **kwargs):
"""A function to test the validation of functions."""
class _Class:
"""A class to test the _InstancesOf constraint and the validation of methods."""
@validate_params({"a": [Real]}, prefer_skip_nested_validation=True)
def _method(self, a):
"""A validated method"""
@deprecated()
@validate_params({"a": [Real]}, prefer_skip_nested_validation=True)
def _deprecated_method(self, a):
"""A deprecated validated method"""
class _Estimator(BaseEstimator):
"""An estimator to test the validation of estimator parameters."""
_parameter_constraints: dict = {"a": [Real]}
def __init__(self, a):
self.a = a
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X=None, y=None):
pass
@pytest.mark.parametrize("interval_type", [Integral, Real])
def test_interval_range(interval_type):
"""Check the range of values depending on closed."""
interval = Interval(interval_type, -2, 2, closed="left")
assert -2 in interval
assert 2 not in interval
interval = Interval(interval_type, -2, 2, closed="right")
assert -2 not in interval
assert 2 in interval
interval = Interval(interval_type, -2, 2, closed="both")
assert -2 in interval
assert 2 in interval
interval = Interval(interval_type, -2, 2, closed="neither")
assert -2 not in interval
assert 2 not in interval
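def test_interval_usage_sketch():
    # Editor's illustrative sketch: `closed` names the endpoints that belong
    # to the interval, so closed="left" is the half-open interval [left, right).
    interval = Interval(Real, 0, 1, closed="left")
    assert 0 in interval
    assert 0.5 in interval
    assert 1 not in interval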
@pytest.mark.parametrize("interval_type", [Integral, Real])
def test_interval_large_integers(interval_type):
"""Check that Interval constraint work with large integers.
non-regression test for #26648.
"""
interval = Interval(interval_type, 0, 2, closed="neither")
assert 2**65 not in interval
assert 2**128 not in interval
assert float(2**65) not in interval
assert float(2**128) not in interval
interval = Interval(interval_type, 0, 2**128, closed="neither")
assert 2**65 in interval
assert 2**128 not in interval
assert float(2**65) in interval
assert float(2**128) not in interval
assert 2**1024 not in interval
def test_interval_inf_in_bounds():
"""Check that inf is included iff a bound is closed and set to None.
Only valid for real intervals.
"""
interval = Interval(Real, 0, None, closed="right")
assert np.inf in interval
interval = Interval(Real, None, 0, closed="left")
assert -np.inf in interval
interval = Interval(Real, None, None, closed="neither")
assert np.inf not in interval
assert -np.inf not in interval
@pytest.mark.parametrize(
"interval",
[Interval(Real, 0, 1, closed="left"), Interval(Real, None, None, closed="both")],
)
def test_nan_not_in_interval(interval):
"""Check that np.nan is not in any interval."""
assert np.nan not in interval
@pytest.mark.parametrize(
"params, error, match",
[
(
{"type": Integral, "left": 1.0, "right": 2, "closed": "both"},
TypeError,
r"Expecting left to be an int for an interval over the integers",
),
(
{"type": Integral, "left": 1, "right": 2.0, "closed": "neither"},
TypeError,
"Expecting right to be an int for an interval over the integers",
),
(
{"type": Integral, "left": None, "right": 0, "closed": "left"},
ValueError,
r"left can't be None when closed == left",
),
(
{"type": Integral, "left": 0, "right": None, "closed": "right"},
ValueError,
r"right can't be None when closed == right",
),
(
{"type": Integral, "left": 1, "right": -1, "closed": "both"},
ValueError,
r"right can't be less than left",
),
],
)
def test_interval_errors(params, error, match):
"""Check that informative errors are raised for invalid combination of parameters"""
with pytest.raises(error, match=match):
Interval(**params)
def test_stroptions():
"""Sanity check for the StrOptions constraint"""
options = StrOptions({"a", "b", "c"}, deprecated={"c"})
assert options.is_satisfied_by("a")
assert options.is_satisfied_by("c")
assert not options.is_satisfied_by("d")
assert "'c' (deprecated)" in str(options)
def test_options():
"""Sanity check for the Options constraint"""
options = Options(Real, {-0.5, 0.5, np.inf}, deprecated={-0.5})
assert options.is_satisfied_by(-0.5)
assert options.is_satisfied_by(np.inf)
assert not options.is_satisfied_by(1.23)
assert "-0.5 (deprecated)" in str(options)
@pytest.mark.parametrize(
"type, expected_type_name",
[
(int, "int"),
(Integral, "int"),
(Real, "float"),
(np.ndarray, "numpy.ndarray"),
],
)
def test_instances_of_type_human_readable(type, expected_type_name):
"""Check the string representation of the _InstancesOf constraint."""
constraint = _InstancesOf(type)
assert str(constraint) == f"an instance of '{expected_type_name}'"
def test_hasmethods():
"""Check the HasMethods constraint."""
constraint = HasMethods(["a", "b"])
class _Good:
def a(self):
pass # pragma: no cover
def b(self):
pass # pragma: no cover
class _Bad:
def a(self):
pass # pragma: no cover
assert constraint.is_satisfied_by(_Good())
assert not constraint.is_satisfied_by(_Bad())
assert str(constraint) == "an object implementing 'a' and 'b'"
@pytest.mark.parametrize(
"constraint",
[
Interval(Real, None, 0, closed="left"),
Interval(Real, 0, None, closed="left"),
Interval(Real, None, None, closed="neither"),
StrOptions({"a", "b", "c"}),
MissingValues(),
MissingValues(numeric_only=True),
_VerboseHelper(),
HasMethods("fit"),
_IterablesNotString(),
_CVObjects(),
],
)
def test_generate_invalid_param_val(constraint):
"""Check that the value generated does not satisfy the constraint"""
bad_value = generate_invalid_param_val(constraint)
assert not constraint.is_satisfied_by(bad_value)
@pytest.mark.parametrize(
"integer_interval, real_interval",
[
(
Interval(Integral, None, 3, closed="right"),
Interval(RealNotInt, -5, 5, closed="both"),
),
(
Interval(Integral, None, 3, closed="right"),
Interval(RealNotInt, -5, 5, closed="neither"),
),
(
Interval(Integral, None, 3, closed="right"),
Interval(RealNotInt, 4, 5, closed="both"),
),
(
Interval(Integral, None, 3, closed="right"),
Interval(RealNotInt, 5, None, closed="left"),
),
(
Interval(Integral, None, 3, closed="right"),
Interval(RealNotInt, 4, None, closed="neither"),
),
(
Interval(Integral, 3, None, closed="left"),
Interval(RealNotInt, -5, 5, closed="both"),
),
(
Interval(Integral, 3, None, closed="left"),
Interval(RealNotInt, -5, 5, closed="neither"),
),
(
Interval(Integral, 3, None, closed="left"),
Interval(RealNotInt, 1, 2, closed="both"),
),
(
Interval(Integral, 3, None, closed="left"),
Interval(RealNotInt, None, -5, closed="left"),
),
(
Interval(Integral, 3, None, closed="left"),
Interval(RealNotInt, None, -4, closed="neither"),
),
(
Interval(Integral, -5, 5, closed="both"),
Interval(RealNotInt, None, 1, closed="right"),
),
(
Interval(Integral, -5, 5, closed="both"),
Interval(RealNotInt, 1, None, closed="left"),
),
(
Interval(Integral, -5, 5, closed="both"),
Interval(RealNotInt, -10, -4, closed="neither"),
),
(
Interval(Integral, -5, 5, closed="both"),
Interval(RealNotInt, -10, -4, closed="right"),
),
(
Interval(Integral, -5, 5, closed="neither"),
Interval(RealNotInt, 6, 10, closed="neither"),
),
(
Interval(Integral, -5, 5, closed="neither"),
Interval(RealNotInt, 6, 10, closed="left"),
),
(
Interval(Integral, 2, None, closed="left"),
Interval(RealNotInt, 0, 1, closed="both"),
),
(
Interval(Integral, 1, None, closed="left"),
Interval(RealNotInt, 0, 1, closed="both"),
),
],
)
def test_generate_invalid_param_val_2_intervals(integer_interval, real_interval):
"""Check that the value generated for an interval constraint does not satisfy any of
the interval constraints.
"""
bad_value = generate_invalid_param_val(constraint=real_interval)
assert not real_interval.is_satisfied_by(bad_value)
assert not integer_interval.is_satisfied_by(bad_value)
bad_value = generate_invalid_param_val(constraint=integer_interval)
assert not real_interval.is_satisfied_by(bad_value)
assert not integer_interval.is_satisfied_by(bad_value)
@pytest.mark.parametrize(
"constraint",
[
_ArrayLikes(),
_InstancesOf(list),
_Callables(),
_NoneConstraint(),
_RandomStates(),
_SparseMatrices(),
_Booleans(),
Interval(Integral, None, None, closed="neither"),
],
)
def test_generate_invalid_param_val_all_valid(constraint):
"""Check that the function raises NotImplementedError when there's no invalid value
for the constraint.
"""
with pytest.raises(NotImplementedError):
generate_invalid_param_val(constraint)
@pytest.mark.parametrize(
"constraint",
[
_ArrayLikes(),
_Callables(),
_InstancesOf(list),
_NoneConstraint(),
_RandomStates(),
_SparseMatrices(),
_Booleans(),
_VerboseHelper(),
MissingValues(),
MissingValues(numeric_only=True),
StrOptions({"a", "b", "c"}),
Options(Integral, {1, 2, 3}),
Interval(Integral, None, None, closed="neither"),
Interval(Integral, 0, 10, closed="neither"),
Interval(Integral, 0, None, closed="neither"),
Interval(Integral, None, 0, closed="neither"),
Interval(Real, 0, 1, closed="neither"),
Interval(Real, 0, None, closed="both"),
Interval(Real, None, 0, closed="right"),
HasMethods("fit"),
_IterablesNotString(),
_CVObjects(),
],
)
def test_generate_valid_param(constraint):
"""Check that the value generated does satisfy the constraint."""
value = generate_valid_param(constraint)
assert constraint.is_satisfied_by(value)
@pytest.mark.parametrize(
"constraint_declaration, value",
[
(Interval(Real, 0, 1, closed="both"), 0.42),
(Interval(Integral, 0, None, closed="neither"), 42),
(StrOptions({"a", "b", "c"}), "b"),
(Options(type, {np.float32, np.float64}), np.float64),
(callable, lambda x: x + 1),
(None, None),
("array-like", [[1, 2], [3, 4]]),
("array-like", np.array([[1, 2], [3, 4]])),
("sparse matrix", csr_matrix([[1, 2], [3, 4]])),
*[
("sparse matrix", container([[1, 2], [3, 4]]))
for container in CSR_CONTAINERS
],
("random_state", 0),
("random_state", np.random.RandomState(0)),
("random_state", None),
(_Class, _Class()),
(int, 1),
(Real, 0.5),
("boolean", False),
("verbose", 1),
("nan", np.nan),
(MissingValues(), -1),
(MissingValues(), -1.0),
(MissingValues(), 2**1028),
(MissingValues(), None),
(MissingValues(), float("nan")),
(MissingValues(), np.nan),
(MissingValues(), "missing"),
(HasMethods("fit"), _Estimator(a=0)),
("cv_object", 5),
],
)
def test_is_satisfied_by(constraint_declaration, value):
"""Sanity check for the is_satisfied_by method"""
constraint = make_constraint(constraint_declaration)
assert constraint.is_satisfied_by(value)
@pytest.mark.parametrize(
"constraint_declaration, expected_constraint_class",
[
(Interval(Real, 0, 1, closed="both"), Interval),
(StrOptions({"option1", "option2"}), StrOptions),
(Options(Real, {0.42, 1.23}), Options),
("array-like", _ArrayLikes),
("sparse matrix", _SparseMatrices),
("random_state", _RandomStates),
(None, _NoneConstraint),
(callable, _Callables),
(int, _InstancesOf),
("boolean", _Booleans),
("verbose", _VerboseHelper),
(MissingValues(numeric_only=True), MissingValues),
(HasMethods("fit"), HasMethods),
("cv_object", _CVObjects),
("nan", _NanConstraint),
],
)
def test_make_constraint(constraint_declaration, expected_constraint_class):
"""Check that make_constraint dispatches to the appropriate constraint class"""
constraint = make_constraint(constraint_declaration)
assert constraint.__class__ is expected_constraint_class
def test_make_constraint_unknown():
"""Check that an informative error is raised when an unknown constraint is passed"""
with pytest.raises(ValueError, match="Unknown constraint"):
make_constraint("not a valid constraint")
def test_validate_params():
"""Check that validate_params works no matter how the arguments are passed"""
with pytest.raises(
InvalidParameterError, match="The 'a' parameter of _func must be"
):
_func("wrong", c=1)
with pytest.raises(
InvalidParameterError, match="The 'b' parameter of _func must be"
):
_func(*[1, "wrong"], c=1)
with pytest.raises(
InvalidParameterError, match="The 'c' parameter of _func must be"
):
_func(1, **{"c": "wrong"})
with pytest.raises(
InvalidParameterError, match="The 'd' parameter of _func must be"
):
_func(1, c=1, d="wrong")
# check in the presence of extra positional and keyword args
with pytest.raises(
InvalidParameterError, match="The 'b' parameter of _func must be"
):
_func(0, *["wrong", 2, 3], c=4, **{"e": 5})
with pytest.raises(
InvalidParameterError, match="The 'c' parameter of _func must be"
):
_func(0, *[1, 2, 3], c="four", **{"e": 5})
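def test_validate_params_usage_sketch():
    # Editor's illustrative sketch: a constraint list means "any of these";
    # here `a` may be an integer or one of two strings, and anything else
    # raises InvalidParameterError naming the offending parameter.
    @validate_params(
        {"a": [Integral, StrOptions({"auto", "sqrt"})]},
        prefer_skip_nested_validation=True,
    )
    def f(a):
        pass

    f(3)
    f("auto")
    with pytest.raises(InvalidParameterError, match="The 'a' parameter"):
        f(a=0.5)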
def test_validate_params_missing_params():
"""Check that no error is raised when there are parameters without
constraints
"""
@validate_params({"a": [int]}, prefer_skip_nested_validation=True)
def func(a, b):
pass
func(1, 2)
def test_decorate_validated_function():
"""Check that validate_params functions can be decorated"""
decorated_function = deprecated()(_func)
with pytest.warns(FutureWarning, match="Function _func is deprecated"):
decorated_function(1, 2, c=3)
# outer decorator does not interfere with validation
with pytest.warns(FutureWarning, match="Function _func is deprecated"):
with pytest.raises(
InvalidParameterError, match=r"The 'c' parameter of _func must be"
):
decorated_function(1, 2, c="wrong")
def test_validate_params_method():
"""Check that validate_params works with methods"""
with pytest.raises(
InvalidParameterError, match="The 'a' parameter of _Class._method must be"
):
_Class()._method("wrong")
# validated method can be decorated
with pytest.warns(FutureWarning, match="Function _deprecated_method is deprecated"):
with pytest.raises(
InvalidParameterError,
match="The 'a' parameter of _Class._deprecated_method must be",
):
_Class()._deprecated_method("wrong")
def test_validate_params_estimator():
"""Check that validate_params works with Estimator instances"""
# no validation in init
est = _Estimator("wrong")
with pytest.raises(
InvalidParameterError, match="The 'a' parameter of _Estimator must be"
):
est.fit()
def test_stroptions_deprecated_subset():
"""Check that the deprecated parameter must be a subset of options."""
with pytest.raises(ValueError, match="deprecated options must be a subset"):
StrOptions({"a", "b", "c"}, deprecated={"a", "d"})
def test_hidden_constraint():
"""Check that internal constraints are not exposed in the error message."""
@validate_params(
{"param": [Hidden(list), dict]}, prefer_skip_nested_validation=True
)
def f(param):
pass
# list and dict are valid params
f({"a": 1, "b": 2, "c": 3})
f([1, 2, 3])
with pytest.raises(
InvalidParameterError, match="The 'param' parameter"
) as exc_info:
f(param="bad")
# the list option is not exposed in the error message
err_msg = str(exc_info.value)
assert "an instance of 'dict'" in err_msg
assert "an instance of 'list'" not in err_msg
def test_hidden_stroptions():
"""Check that we can have 2 StrOptions constraints, one being hidden."""
@validate_params(
{"param": [StrOptions({"auto"}), Hidden(StrOptions({"warn"}))]},
prefer_skip_nested_validation=True,
)
def f(param):
pass
# "auto" and "warn" are valid params
f("auto")
f("warn")
with pytest.raises(
InvalidParameterError, match="The 'param' parameter"
) as exc_info:
f(param="bad")
# the "warn" option is not exposed in the error message
err_msg = str(exc_info.value)
assert "auto" in err_msg
assert "warn" not in err_msg
def test_validate_params_set_param_constraints_attribute():
"""Check that the validate_params decorator properly sets the parameter constraints
as attribute of the decorated function/method.
"""
assert hasattr(_func, "_skl_parameter_constraints")
assert hasattr(_Class()._method, "_skl_parameter_constraints")
def test_boolean_constraint_deprecated_int():
"""Check that validate_params raise a deprecation message but still passes
validation when using an int for a parameter accepting a boolean.
"""
@validate_params({"param": ["boolean"]}, prefer_skip_nested_validation=True)
def f(param):
pass
# True/False and np.bool_(True/False) are valid params
f(True)
f(np.bool_(False))
def test_no_validation():
"""Check that validation can be skipped for a parameter."""
@validate_params(
{"param1": [int, None], "param2": "no_validation"},
prefer_skip_nested_validation=True,
)
def f(param1=None, param2=None):
pass
# param1 is validated
with pytest.raises(InvalidParameterError, match="The 'param1' parameter"):
f(param1="wrong")
# param2 is not validated: any type is valid.
class SomeType:
pass
f(param2=SomeType)
f(param2=SomeType())
def test_pandas_na_constraint_with_pd_na():
"""Add a specific test for checking support for `pandas.NA`."""
pd = pytest.importorskip("pandas")
na_constraint = _PandasNAConstraint()
assert na_constraint.is_satisfied_by(pd.NA)
assert not na_constraint.is_satisfied_by(np.array([1, 2, 3]))
def test_iterable_not_string():
"""Check that a string does not satisfy the _IterableNotString constraint."""
constraint = _IterablesNotString()
assert constraint.is_satisfied_by([1, 2, 3])
assert constraint.is_satisfied_by(range(10))
assert not constraint.is_satisfied_by("some string")
def test_cv_objects():
"""Check that the _CVObjects constraint accepts all current ways
to pass cv objects."""
constraint = _CVObjects()
assert constraint.is_satisfied_by(5)
assert constraint.is_satisfied_by(LeaveOneOut())
assert constraint.is_satisfied_by([([1, 2], [3, 4]), ([3, 4], [1, 2])])
assert constraint.is_satisfied_by(None)
assert not constraint.is_satisfied_by("not a CV object")
def test_third_party_estimator():
"""Check that the validation from a scikit-learn estimator inherited by a third
party estimator does not impose a match between the dict of constraints and the
parameters of the estimator.
"""
class ThirdPartyEstimator(_Estimator):
def __init__(self, b):
self.b = b
super().__init__(a=0)
def fit(self, X=None, y=None):
super().fit(X, y)
# does not raise, even though "b" is not in the constraints dict and "a" is not
# a parameter of the estimator.
ThirdPartyEstimator(b=0).fit()
def test_interval_real_not_int():
"""Check for the type RealNotInt in the Interval constraint."""
constraint = Interval(RealNotInt, 0, 1, closed="both")
assert constraint.is_satisfied_by(1.0)
assert not constraint.is_satisfied_by(1)
def test_real_not_int():
"""Check for the RealNotInt type."""
assert isinstance(1.0, RealNotInt)
assert not isinstance(1, RealNotInt)
assert isinstance(np.float64(1), RealNotInt)
assert not isinstance(np.int64(1), RealNotInt)
def test_skip_param_validation():
"""Check that param validation can be skipped using config_context."""
@validate_params({"a": [int]}, prefer_skip_nested_validation=True)
def f(a):
pass
with pytest.raises(InvalidParameterError, match="The 'a' parameter"):
f(a="1")
# does not raise
with config_context(skip_parameter_validation=True):
f(a="1")
@pytest.mark.parametrize("prefer_skip_nested_validation", [True, False])
def test_skip_nested_validation(prefer_skip_nested_validation):
"""Check that nested validation can be skipped."""
@validate_params({"a": [int]}, prefer_skip_nested_validation=True)
def f(a):
pass
@validate_params(
{"b": [int]},
prefer_skip_nested_validation=prefer_skip_nested_validation,
)
def g(b):
# calls f with a bad parameter type
return f(a="invalid_param_value")
# Validation for g is never skipped.
with pytest.raises(InvalidParameterError, match="The 'b' parameter"):
g(b="invalid_param_value")
if prefer_skip_nested_validation:
g(b=1) # does not raise because inner f is not validated
else:
with pytest.raises(InvalidParameterError, match="The 'a' parameter"):
g(b=1)
@pytest.mark.parametrize(
"skip_parameter_validation, prefer_skip_nested_validation, expected_skipped",
[
(True, True, True),
(True, False, True),
(False, True, True),
(False, False, False),
],
)
def test_skip_nested_validation_and_config_context(
skip_parameter_validation, prefer_skip_nested_validation, expected_skipped
):
"""Check interaction between global skip and local skip."""
@validate_params(
{"a": [int]}, prefer_skip_nested_validation=prefer_skip_nested_validation
)
def g(a):
return get_config()["skip_parameter_validation"]
with config_context(skip_parameter_validation=skip_parameter_validation):
actual_skipped = g(1)
assert actual_skipped == expected_skipped

View File

@@ -0,0 +1,63 @@
import numpy as np
import pytest
from sklearn.utils._plotting import _interval_max_min_ratio, _validate_score_name
def metric():
pass # pragma: no cover
def neg_metric():
pass # pragma: no cover
@pytest.mark.parametrize(
"score_name, scoring, negate_score, expected_score_name",
[
("accuracy", None, False, "accuracy"), # do not transform the name
(None, "accuracy", False, "Accuracy"), # capitalize the name
(None, "accuracy", True, "Negative accuracy"), # add "Negative"
(None, "neg_mean_absolute_error", False, "Negative mean absolute error"),
(None, "neg_mean_absolute_error", True, "Mean absolute error"), # remove "neg_"
("MAE", "neg_mean_absolute_error", True, "MAE"), # keep score_name
(None, None, False, "Score"), # default name
(None, None, True, "Negative score"), # default name but negated
("Some metric", metric, False, "Some metric"), # do not transform the name
("Some metric", metric, True, "Some metric"), # do not transform the name
(None, metric, False, "Metric"), # default name
(None, metric, True, "Negative metric"), # default name but negated
("Some metric", neg_metric, False, "Some metric"), # do not transform the name
("Some metric", neg_metric, True, "Some metric"), # do not transform the name
(None, neg_metric, False, "Negative metric"), # default name
(None, neg_metric, True, "Metric"), # default name but negated
],
)
def test_validate_score_name(score_name, scoring, negate_score, expected_score_name):
"""Check that we return the right score name."""
assert (
_validate_score_name(score_name, scoring, negate_score) == expected_score_name
)
# In the following test, we check the value of the max to min ratio
# for parameter value intervals to check that using a decision threshold
# of 5 is a good heuristic to decide between linear and log scales on
# common ranges of parameter values.
@pytest.mark.parametrize(
"data, lower_bound, upper_bound",
[
# Such a range could be clearly displayed with either log scale or linear
# scale.
(np.geomspace(0.1, 1, 5), 5, 6),
# Checking that the ratio is still positive on a negative log scale.
(-np.geomspace(0.1, 1, 10), 7, 8),
# Evenly spaced parameter values lead to a ratio of 1.
(np.linspace(0, 1, 5), 0.9, 1.1),
# This is not exactly spaced on a log scale but we will benefit from treating
# it as such for visualization.
([1, 2, 5, 10, 20, 50], 20, 40),
],
)
def test_interval_max_min_ratio(data, lower_bound, upper_bound):
assert lower_bound < _interval_max_min_ratio(data) < upper_bound
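def test_interval_max_min_ratio_worked_example_sketch():
    # Editor's worked example (the private helper's exact implementation may
    # differ; this mirrors the documented intent): compare the largest to the
    # smallest gap between consecutive sorted values. For geomspace(0.1, 1, 5)
    # the gap ratio is 10**(3/4) ~= 5.6, just above the threshold of 5.
    gaps = np.diff(np.sort(np.geomspace(0.1, 1, 5)))
    assert 5 < gaps.max() / gaps.min() < 6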

View File

@@ -0,0 +1,680 @@
import re
from pprint import PrettyPrinter
import numpy as np
from sklearn.utils._pprint import _EstimatorPrettyPrinter
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import config_context
# Ignore flake8 (lots of line too long issues)
# ruff: noqa
# Constructors excerpted to test pprinting
class LogisticRegression(BaseEstimator):
def __init__(
self,
penalty="l2",
dual=False,
tol=1e-4,
C=1.0,
fit_intercept=True,
intercept_scaling=1,
class_weight=None,
random_state=None,
solver="warn",
max_iter=100,
multi_class="warn",
verbose=0,
warm_start=False,
n_jobs=None,
l1_ratio=None,
):
self.penalty = penalty
self.dual = dual
self.tol = tol
self.C = C
self.fit_intercept = fit_intercept
self.intercept_scaling = intercept_scaling
self.class_weight = class_weight
self.random_state = random_state
self.solver = solver
self.max_iter = max_iter
self.multi_class = multi_class
self.verbose = verbose
self.warm_start = warm_start
self.n_jobs = n_jobs
self.l1_ratio = l1_ratio
def fit(self, X, y):
return self
class StandardScaler(TransformerMixin, BaseEstimator):
def __init__(self, copy=True, with_mean=True, with_std=True):
self.with_mean = with_mean
self.with_std = with_std
self.copy = copy
def transform(self, X, copy=None):
return self
class RFE(BaseEstimator):
def __init__(self, estimator, n_features_to_select=None, step=1, verbose=0):
self.estimator = estimator
self.n_features_to_select = n_features_to_select
self.step = step
self.verbose = verbose
class GridSearchCV(BaseEstimator):
def __init__(
self,
estimator,
param_grid,
scoring=None,
n_jobs=None,
iid="warn",
refit=True,
cv="warn",
verbose=0,
pre_dispatch="2*n_jobs",
error_score="raise-deprecating",
return_train_score=False,
):
self.estimator = estimator
self.param_grid = param_grid
self.scoring = scoring
self.n_jobs = n_jobs
self.iid = iid
self.refit = refit
self.cv = cv
self.verbose = verbose
self.pre_dispatch = pre_dispatch
self.error_score = error_score
self.return_train_score = return_train_score
class CountVectorizer(BaseEstimator):
def __init__(
self,
input="content",
encoding="utf-8",
decode_error="strict",
strip_accents=None,
lowercase=True,
preprocessor=None,
tokenizer=None,
stop_words=None,
token_pattern=r"(?u)\b\w\w+\b",
ngram_range=(1, 1),
analyzer="word",
max_df=1.0,
min_df=1,
max_features=None,
vocabulary=None,
binary=False,
dtype=np.int64,
):
self.input = input
self.encoding = encoding
self.decode_error = decode_error
self.strip_accents = strip_accents
self.preprocessor = preprocessor
self.tokenizer = tokenizer
self.analyzer = analyzer
self.lowercase = lowercase
self.token_pattern = token_pattern
self.stop_words = stop_words
self.max_df = max_df
self.min_df = min_df
self.max_features = max_features
self.ngram_range = ngram_range
self.vocabulary = vocabulary
self.binary = binary
self.dtype = dtype
class Pipeline(BaseEstimator):
def __init__(self, steps, memory=None):
self.steps = steps
self.memory = memory
class SVC(BaseEstimator):
def __init__(
self,
C=1.0,
kernel="rbf",
degree=3,
gamma="auto_deprecated",
coef0=0.0,
shrinking=True,
probability=False,
tol=1e-3,
cache_size=200,
class_weight=None,
verbose=False,
max_iter=-1,
decision_function_shape="ovr",
random_state=None,
):
self.kernel = kernel
self.degree = degree
self.gamma = gamma
self.coef0 = coef0
self.tol = tol
self.C = C
self.shrinking = shrinking
self.probability = probability
self.cache_size = cache_size
self.class_weight = class_weight
self.verbose = verbose
self.max_iter = max_iter
self.decision_function_shape = decision_function_shape
self.random_state = random_state
class PCA(BaseEstimator):
def __init__(
self,
n_components=None,
copy=True,
whiten=False,
svd_solver="auto",
tol=0.0,
iterated_power="auto",
random_state=None,
):
self.n_components = n_components
self.copy = copy
self.whiten = whiten
self.svd_solver = svd_solver
self.tol = tol
self.iterated_power = iterated_power
self.random_state = random_state
class NMF(BaseEstimator):
def __init__(
self,
n_components=None,
init=None,
solver="cd",
beta_loss="frobenius",
tol=1e-4,
max_iter=200,
random_state=None,
alpha=0.0,
l1_ratio=0.0,
verbose=0,
shuffle=False,
):
self.n_components = n_components
self.init = init
self.solver = solver
self.beta_loss = beta_loss
self.tol = tol
self.max_iter = max_iter
self.random_state = random_state
self.alpha = alpha
self.l1_ratio = l1_ratio
self.verbose = verbose
self.shuffle = shuffle
class SimpleImputer(BaseEstimator):
def __init__(
self,
missing_values=np.nan,
strategy="mean",
fill_value=None,
verbose=0,
copy=True,
):
self.missing_values = missing_values
self.strategy = strategy
self.fill_value = fill_value
self.verbose = verbose
self.copy = copy
def test_basic(print_changed_only_false):
# Basic pprint test
lr = LogisticRegression()
expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='warn', tol=0.0001, verbose=0,
warm_start=False)"""
expected = expected[1:] # remove first \n
assert lr.__repr__() == expected
def test_changed_only():
# Make sure the changed_only param is correctly used when True (default)
lr = LogisticRegression(C=99)
expected = """LogisticRegression(C=99)"""
assert lr.__repr__() == expected
# Check with a repr that doesn't fit on a single line
lr = LogisticRegression(
C=99, class_weight=0.4, fit_intercept=False, tol=1234, verbose=True
)
expected = """
LogisticRegression(C=99, class_weight=0.4, fit_intercept=False, tol=1234,
verbose=True)"""
expected = expected[1:] # remove first \n
assert lr.__repr__() == expected
imputer = SimpleImputer(missing_values=0)
expected = """SimpleImputer(missing_values=0)"""
assert imputer.__repr__() == expected
# Defaults to np.nan, trying with float('NaN')
imputer = SimpleImputer(missing_values=float("NaN"))
expected = """SimpleImputer()"""
assert imputer.__repr__() == expected
# make sure array parameters don't throw error (see #13583)
repr(LogisticRegressionCV(Cs=np.array([0.1, 1])))
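def test_print_changed_only_sketch():
    # Editor's illustrative sketch: print_changed_only is a global display
    # option (True by default since scikit-learn 0.23), so the same estimator
    # reprs differently depending on the active config_context.
    lr = LogisticRegression(C=99)
    with config_context(print_changed_only=True):
        assert repr(lr) == "LogisticRegression(C=99)"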
def test_pipeline(print_changed_only_false):
# Render a pipeline object
pipeline = make_pipeline(StandardScaler(), LogisticRegression(C=999))
expected = """
Pipeline(memory=None,
steps=[('standardscaler',
StandardScaler(copy=True, with_mean=True, with_std=True)),
('logisticregression',
LogisticRegression(C=999, class_weight=None, dual=False,
fit_intercept=True, intercept_scaling=1,
l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None,
penalty='l2', random_state=None,
solver='warn', tol=0.0001, verbose=0,
warm_start=False))],
verbose=False)"""
expected = expected[1:] # remove first \n
assert pipeline.__repr__() == expected
def test_deeply_nested(print_changed_only_false):
# Render a deeply nested estimator
rfe = RFE(RFE(RFE(RFE(RFE(RFE(RFE(LogisticRegression())))))))
expected = """
RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=LogisticRegression(C=1.0,
class_weight=None,
dual=False,
fit_intercept=True,
intercept_scaling=1,
l1_ratio=None,
max_iter=100,
multi_class='warn',
n_jobs=None,
penalty='l2',
random_state=None,
solver='warn',
tol=0.0001,
verbose=0,
warm_start=False),
n_features_to_select=None,
step=1,
verbose=0),
n_features_to_select=None,
step=1,
verbose=0),
n_features_to_select=None,
step=1, verbose=0),
n_features_to_select=None, step=1,
verbose=0),
n_features_to_select=None, step=1, verbose=0),
n_features_to_select=None, step=1, verbose=0),
n_features_to_select=None, step=1, verbose=0)"""
expected = expected[1:] # remove first \n
assert rfe.__repr__() == expected
def test_gridsearch(print_changed_only_false):
# render a gridsearch
param_grid = [
{"kernel": ["rbf"], "gamma": [1e-3, 1e-4], "C": [1, 10, 100, 1000]},
{"kernel": ["linear"], "C": [1, 10, 100, 1000]},
]
gs = GridSearchCV(SVC(), param_grid, cv=5)
expected = """
GridSearchCV(cv=5, error_score='raise-deprecating',
estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3,
gamma='auto_deprecated', kernel='rbf', max_iter=-1,
probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False),
iid='warn', n_jobs=None,
param_grid=[{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
'kernel': ['rbf']},
{'C': [1, 10, 100, 1000], 'kernel': ['linear']}],
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=0)"""
expected = expected[1:] # remove first \n
assert gs.__repr__() == expected
def test_gridsearch_pipeline(print_changed_only_false):
# render a pipeline inside a gridsearch
pp = _EstimatorPrettyPrinter(compact=True, indent=1, indent_at_name=True)
pipeline = Pipeline([("reduce_dim", PCA()), ("classify", SVC())])
N_FEATURES_OPTIONS = [2, 4, 8]
C_OPTIONS = [1, 10, 100, 1000]
param_grid = [
{
"reduce_dim": [PCA(iterated_power=7), NMF()],
"reduce_dim__n_components": N_FEATURES_OPTIONS,
"classify__C": C_OPTIONS,
},
{
"reduce_dim": [SelectKBest(chi2)],
"reduce_dim__k": N_FEATURES_OPTIONS,
"classify__C": C_OPTIONS,
},
]
    gs_pipeline = GridSearchCV(pipeline, cv=3, n_jobs=1, param_grid=param_grid)
expected = """
GridSearchCV(cv=3, error_score='raise-deprecating',
estimator=Pipeline(memory=None,
steps=[('reduce_dim',
PCA(copy=True, iterated_power='auto',
n_components=None,
random_state=None,
svd_solver='auto', tol=0.0,
whiten=False)),
('classify',
SVC(C=1.0, cache_size=200,
class_weight=None, coef0=0.0,
decision_function_shape='ovr',
degree=3, gamma='auto_deprecated',
kernel='rbf', max_iter=-1,
probability=False,
random_state=None, shrinking=True,
tol=0.001, verbose=False))]),
iid='warn', n_jobs=1,
param_grid=[{'classify__C': [1, 10, 100, 1000],
'reduce_dim': [PCA(copy=True, iterated_power=7,
n_components=None,
random_state=None,
svd_solver='auto', tol=0.0,
whiten=False),
NMF(alpha=0.0, beta_loss='frobenius',
init=None, l1_ratio=0.0,
max_iter=200, n_components=None,
random_state=None, shuffle=False,
solver='cd', tol=0.0001,
verbose=0)],
'reduce_dim__n_components': [2, 4, 8]},
{'classify__C': [1, 10, 100, 1000],
'reduce_dim': [SelectKBest(k=10,
score_func=<function chi2 at some_address>)],
'reduce_dim__k': [2, 4, 8]}],
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=0)"""
expected = expected[1:] # remove first \n
    repr_ = pp.pformat(gs_pipeline)
# Remove address of '<function chi2 at 0x.....>' for reproducibility
repr_ = re.sub("function chi2 at 0x.*>", "function chi2 at some_address>", repr_)
assert repr_ == expected
def test_n_max_elements_to_show(print_changed_only_false):
n_max_elements_to_show = 30
pp = _EstimatorPrettyPrinter(
compact=True,
indent=1,
indent_at_name=True,
n_max_elements_to_show=n_max_elements_to_show,
)
# No ellipsis
vocabulary = {i: i for i in range(n_max_elements_to_show)}
vectorizer = CountVectorizer(vocabulary=vocabulary)
expected = r"""
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), preprocessor=None, stop_words=None,
strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
tokenizer=None,
vocabulary={0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7,
8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14,
15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20,
21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26,
27: 27, 28: 28, 29: 29})"""
expected = expected[1:] # remove first \n
assert pp.pformat(vectorizer) == expected
# Now with ellipsis
vocabulary = {i: i for i in range(n_max_elements_to_show + 1)}
vectorizer = CountVectorizer(vocabulary=vocabulary)
expected = r"""
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), preprocessor=None, stop_words=None,
strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
tokenizer=None,
vocabulary={0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7,
8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14,
15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20,
21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26,
27: 27, 28: 28, 29: 29, ...})"""
expected = expected[1:] # remove first \n
assert pp.pformat(vectorizer) == expected
# Also test with lists
param_grid = {"C": list(range(n_max_elements_to_show))}
gs = GridSearchCV(SVC(), param_grid)
expected = """
GridSearchCV(cv='warn', error_score='raise-deprecating',
estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3,
gamma='auto_deprecated', kernel='rbf', max_iter=-1,
probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False),
iid='warn', n_jobs=None,
param_grid={'C': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=0)"""
expected = expected[1:] # remove first \n
assert pp.pformat(gs) == expected
# Now with ellipsis
param_grid = {"C": list(range(n_max_elements_to_show + 1))}
gs = GridSearchCV(SVC(), param_grid)
expected = """
GridSearchCV(cv='warn', error_score='raise-deprecating',
estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3,
gamma='auto_deprecated', kernel='rbf', max_iter=-1,
probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False),
iid='warn', n_jobs=None,
param_grid={'C': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29, ...]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=0)"""
expected = expected[1:] # remove first \n
assert pp.pformat(gs) == expected
def test_bruteforce_ellipsis(print_changed_only_false):
# Check that the bruteforce ellipsis (used when the number of non-blank
# characters exceeds N_CHAR_MAX) renders correctly.
lr = LogisticRegression()
# test when the left and right side of the ellipsis aren't on the same
# line.
expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
in...
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='warn', tol=0.0001, verbose=0,
warm_start=False)"""
expected = expected[1:] # remove first \n
assert expected == lr.__repr__(N_CHAR_MAX=150)
# test with very small N_CHAR_MAX
# Note that N_CHAR_MAX is not strictly enforced, but it's normal: to avoid
# weird reprs we still keep the whole line of the right part (after the
# ellipsis).
expected = """
Lo...
warm_start=False)"""
expected = expected[1:] # remove first \n
assert expected == lr.__repr__(N_CHAR_MAX=4)
# test with N_CHAR_MAX == number of non-blank characters: In this case we
# don't want ellipsis
full_repr = lr.__repr__(N_CHAR_MAX=float("inf"))
n_nonblank = len("".join(full_repr.split()))
assert lr.__repr__(N_CHAR_MAX=n_nonblank) == full_repr
assert "..." not in full_repr
    # test with N_CHAR_MAX == number of non-blank characters - 10: the left and
    # right side of the ellipsis are on different lines. In this case we
    # want to expand the whole line of the right side
expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_i...
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='warn', tol=0.0001, verbose=0,
warm_start=False)"""
expected = expected[1:] # remove first \n
assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 10)
    # test with N_CHAR_MAX == number of non-blank characters - 4: the left and
    # right side of the ellipsis are on the same line. In this case we don't
    # want to expand the whole line of the right side, just add the ellipsis
    # between the 2 sides.
expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter...,
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='warn', tol=0.0001, verbose=0,
warm_start=False)"""
expected = expected[1:] # remove first \n
assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 4)
    # test with N_CHAR_MAX == number of non-blank characters - 2: the left and
    # right side of the ellipsis are on the same line, but adding the ellipsis
    # would actually make the repr longer. So we don't add the ellipsis.
expected = """
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='warn', tol=0.0001, verbose=0,
warm_start=False)"""
expected = expected[1:] # remove first \n
assert expected == lr.__repr__(N_CHAR_MAX=n_nonblank - 2)
def test_builtin_prettyprinter():
    # Non-regression test that ensures we can still use the builtin
    # PrettyPrinter class for estimators (as done e.g. by joblib).
    # Used to be a bug.
PrettyPrinter().pprint(LogisticRegression())
def test_kwargs_in_init():
# Make sure the changed_only=True mode is OK when an argument is passed as
# kwargs.
# Non-regression test for
# https://github.com/scikit-learn/scikit-learn/issues/17206
class WithKWargs(BaseEstimator):
        # Estimator with a kwargs argument. Such estimators need to hack
        # around set_params and get_params. Here we mimic what LightGBM does.
def __init__(self, a="willchange", b="unchanged", **kwargs):
self.a = a
self.b = b
self._other_params = {}
self.set_params(**kwargs)
def get_params(self, deep=True):
params = super().get_params(deep=deep)
params.update(self._other_params)
return params
def set_params(self, **params):
for key, value in params.items():
setattr(self, key, value)
self._other_params[key] = value
return self
est = WithKWargs(a="something", c="abcd", d=None)
expected = "WithKWargs(a='something', c='abcd', d=None)"
assert expected == est.__repr__()
with config_context(print_changed_only=False):
expected = "WithKWargs(a='something', b='unchanged', c='abcd', d=None)"
assert expected == est.__repr__()
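# A minimal sketch (not part of the original test file) of the behaviour
# exercised above, assuming only the public sklearn.config_context API: with
# the default print_changed_only=True, only parameters that differ from their
# defaults appear in the repr.
from sklearn import config_context
from sklearn.linear_model import LogisticRegression

lr_demo = LogisticRegression(C=0.5)  # lr_demo is an illustrative name
print(repr(lr_demo))  # LogisticRegression(C=0.5)
with config_context(print_changed_only=False):
    print(repr(lr_demo))  # full parameter listing, defaults included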
def test_complexity_print_changed_only():
# Make sure `__repr__` is called the same amount of times
# whether `print_changed_only` is True or False
# Non-regression test for
# https://github.com/scikit-learn/scikit-learn/issues/18490
class DummyEstimator(TransformerMixin, BaseEstimator):
nb_times_repr_called = 0
def __init__(self, estimator=None):
self.estimator = estimator
def __repr__(self):
DummyEstimator.nb_times_repr_called += 1
return super().__repr__()
def transform(self, X, copy=None): # pragma: no cover
return X
estimator = DummyEstimator(
make_pipeline(DummyEstimator(DummyEstimator()), DummyEstimator(), "passthrough")
)
with config_context(print_changed_only=False):
repr(estimator)
nb_repr_print_changed_only_false = DummyEstimator.nb_times_repr_called
DummyEstimator.nb_times_repr_called = 0
with config_context(print_changed_only=True):
repr(estimator)
nb_repr_print_changed_only_true = DummyEstimator.nb_times_repr_called
assert nb_repr_print_changed_only_false == nb_repr_print_changed_only_true

View File

@@ -0,0 +1,192 @@
import numpy as np
import pytest
import scipy.sparse as sp
from numpy.testing import assert_array_almost_equal
from scipy.special import comb
from sklearn.utils._random import _our_rand_r_py
from sklearn.utils.random import _random_choice_csc, sample_without_replacement
###############################################################################
# test custom sampling without replacement algorithm
###############################################################################
def test_invalid_sample_without_replacement_algorithm():
with pytest.raises(ValueError):
sample_without_replacement(5, 4, "unknown")
def test_sample_without_replacement_algorithms():
methods = ("auto", "tracking_selection", "reservoir_sampling", "pool")
for m in methods:
def sample_without_replacement_method(
n_population, n_samples, random_state=None
):
return sample_without_replacement(
n_population, n_samples, method=m, random_state=random_state
)
check_edge_case_of_sample_int(sample_without_replacement_method)
check_sample_int(sample_without_replacement_method)
check_sample_int_distribution(sample_without_replacement_method)
def check_edge_case_of_sample_int(sample_without_replacement):
# n_population < n_sample
with pytest.raises(ValueError):
sample_without_replacement(0, 1)
with pytest.raises(ValueError):
sample_without_replacement(1, 2)
# n_population == n_samples
assert sample_without_replacement(0, 0).shape == (0,)
assert sample_without_replacement(1, 1).shape == (1,)
# n_population >= n_samples
assert sample_without_replacement(5, 0).shape == (0,)
assert sample_without_replacement(5, 1).shape == (1,)
# n_population < 0 or n_samples < 0
with pytest.raises(ValueError):
sample_without_replacement(-1, 5)
with pytest.raises(ValueError):
sample_without_replacement(5, -1)
def check_sample_int(sample_without_replacement):
# This test is heavily inspired from test_random.py of python-core.
#
# For the entire allowable range of 0 <= k <= N, validate that
# the sample is of the correct length and contains only unique items
n_population = 100
for n_samples in range(n_population + 1):
s = sample_without_replacement(n_population, n_samples)
assert len(s) == n_samples
unique = np.unique(s)
assert np.size(unique) == n_samples
assert np.all(unique < n_population)
# test edge case n_population == n_samples == 0
assert np.size(sample_without_replacement(0, 0)) == 0
def check_sample_int_distribution(sample_without_replacement):
# This test is heavily inspired from test_random.py of python-core.
#
# For the entire allowable range of 0 <= k <= N, validate that
# sample generates all possible permutations
n_population = 10
# a large number of trials prevents false negatives without slowing normal
# case
n_trials = 10000
for n_samples in range(n_population):
        # Counting the number of combinations is not as good as counting the
        # number of permutations. However, it works with sampling algorithms
        # that do not provide a random permutation of the subset of integers.
n_expected = comb(n_population, n_samples, exact=True)
output = {}
for i in range(n_trials):
output[frozenset(sample_without_replacement(n_population, n_samples))] = (
None
)
if len(output) == n_expected:
break
else:
raise AssertionError(
"number of combinations != number of expected (%s != %s)"
% (len(output), n_expected)
)
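# A minimal sketch (not part of the original test file) of the public API
# exercised by these checks: sample_without_replacement draws n_samples
# distinct integers from range(n_population); `method` selects the algorithm.
from sklearn.utils.random import sample_without_replacement

idx_demo = sample_without_replacement(
    n_population=100, n_samples=10, method="auto", random_state=0
)
assert len(set(idx_demo)) == 10 and idx_demo.max() < 100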
def test_random_choice_csc(n_samples=10000, random_state=24):
# Explicit class probabilities
classes = [np.array([0, 1]), np.array([0, 1, 2])]
class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
got = _random_choice_csc(n_samples, classes, class_probabilities, random_state)
assert sp.issparse(got)
for k in range(len(classes)):
p = np.bincount(got.getcol(k).toarray().ravel()) / float(n_samples)
assert_array_almost_equal(class_probabilities[k], p, decimal=1)
# Implicit class probabilities
classes = [[0, 1], [1, 2]] # test for array-like support
class_probabilities = [np.array([0.5, 0.5]), np.array([0, 1 / 2, 1 / 2])]
got = _random_choice_csc(
n_samples=n_samples, classes=classes, random_state=random_state
)
assert sp.issparse(got)
for k in range(len(classes)):
p = np.bincount(got.getcol(k).toarray().ravel()) / float(n_samples)
assert_array_almost_equal(class_probabilities[k], p, decimal=1)
# Edge case probabilities 1.0 and 0.0
classes = [np.array([0, 1]), np.array([0, 1, 2])]
class_probabilities = [np.array([0.0, 1.0]), np.array([0.0, 1.0, 0.0])]
got = _random_choice_csc(n_samples, classes, class_probabilities, random_state)
assert sp.issparse(got)
for k in range(len(classes)):
p = (
np.bincount(
got.getcol(k).toarray().ravel(), minlength=len(class_probabilities[k])
)
/ n_samples
)
assert_array_almost_equal(class_probabilities[k], p, decimal=1)
# One class target data
classes = [[1], [0]] # test for array-like support
class_probabilities = [np.array([0.0, 1.0]), np.array([1.0])]
got = _random_choice_csc(
n_samples=n_samples, classes=classes, random_state=random_state
)
assert sp.issparse(got)
for k in range(len(classes)):
p = np.bincount(got.getcol(k).toarray().ravel()) / n_samples
assert_array_almost_equal(class_probabilities[k], p, decimal=1)
def test_random_choice_csc_errors():
# the length of an array in classes and class_probabilities is mismatched
classes = [np.array([0, 1]), np.array([0, 1, 2, 3])]
class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
with pytest.raises(ValueError):
_random_choice_csc(4, classes, class_probabilities, 1)
# the class dtype is not supported
classes = [np.array(["a", "1"]), np.array(["z", "1", "2"])]
class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
with pytest.raises(ValueError):
_random_choice_csc(4, classes, class_probabilities, 1)
# the class dtype is not supported
classes = [np.array([4.2, 0.1]), np.array([0.1, 0.2, 9.4])]
class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
with pytest.raises(ValueError):
_random_choice_csc(4, classes, class_probabilities, 1)
# Given probabilities don't sum to 1
classes = [np.array([0, 1]), np.array([0, 1, 2])]
class_probabilities = [np.array([0.5, 0.6]), np.array([0.6, 0.1, 0.3])]
with pytest.raises(ValueError):
_random_choice_csc(4, classes, class_probabilities, 1)
def test_our_rand_r():
assert 131541053 == _our_rand_r_py(1273642419)
assert 270369 == _our_rand_r_py(0)

View File

@@ -0,0 +1,371 @@
import numpy as np
import pytest
from sklearn.datasets import (
load_iris,
make_classification,
make_multilabel_classification,
make_regression,
)
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import (
LinearRegression,
LogisticRegression,
)
from sklearn.multioutput import ClassifierChain
from sklearn.preprocessing import scale
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils._mocking import _MockEstimatorOnOffPrediction
from sklearn.utils._response import _get_response_values, _get_response_values_binary
from sklearn.utils._testing import assert_allclose, assert_array_equal
X, y = load_iris(return_X_y=True)
# scale the data to avoid ConvergenceWarning with LogisticRegression
X = scale(X, copy=False)
X_binary, y_binary = X[:100], y[:100]
@pytest.mark.parametrize(
"response_method", ["decision_function", "predict_proba", "predict_log_proba"]
)
def test_get_response_values_regressor_error(response_method):
"""Check the error message with regressor an not supported response
method."""
my_estimator = _MockEstimatorOnOffPrediction(response_methods=[response_method])
X = "mocking_data", "mocking_target"
err_msg = f"{my_estimator.__class__.__name__} should either be a classifier"
with pytest.raises(ValueError, match=err_msg):
_get_response_values(my_estimator, X, response_method=response_method)
@pytest.mark.parametrize("return_response_method_used", [True, False])
def test_get_response_values_regressor(return_response_method_used):
"""Check the behaviour of `_get_response_values` with regressor."""
X, y = make_regression(n_samples=10, random_state=0)
regressor = LinearRegression().fit(X, y)
results = _get_response_values(
regressor,
X,
response_method="predict",
return_response_method_used=return_response_method_used,
)
assert_array_equal(results[0], regressor.predict(X))
assert results[1] is None
if return_response_method_used:
assert results[2] == "predict"
@pytest.mark.parametrize(
"response_method",
["predict", "decision_function", ["decision_function", "predict"]],
)
@pytest.mark.parametrize("return_response_method_used", [True, False])
def test_get_response_values_outlier_detection(
response_method, return_response_method_used
):
"""Check the behaviour of `_get_response_values` with outlier detector."""
X, y = make_classification(n_samples=50, random_state=0)
outlier_detector = IsolationForest(random_state=0).fit(X, y)
results = _get_response_values(
outlier_detector,
X,
response_method=response_method,
return_response_method_used=return_response_method_used,
)
chosen_response_method = (
response_method[0] if isinstance(response_method, list) else response_method
)
prediction_method = getattr(outlier_detector, chosen_response_method)
assert_array_equal(results[0], prediction_method(X))
assert results[1] is None
if return_response_method_used:
assert results[2] == chosen_response_method
@pytest.mark.parametrize(
"response_method",
["predict_proba", "decision_function", "predict", "predict_log_proba"],
)
def test_get_response_values_classifier_unknown_pos_label(response_method):
"""Check that `_get_response_values` raises the proper error message with
classifier."""
X, y = make_classification(n_samples=10, n_classes=2, random_state=0)
classifier = LogisticRegression().fit(X, y)
# provide a `pos_label` which is not in `y`
err_msg = r"pos_label=whatever is not a valid label: It should be one of \[0 1\]"
with pytest.raises(ValueError, match=err_msg):
_get_response_values(
classifier,
X,
response_method=response_method,
pos_label="whatever",
)
@pytest.mark.parametrize("response_method", ["predict_proba", "predict_log_proba"])
def test_get_response_values_classifier_inconsistent_y_pred_for_binary_proba(
response_method,
):
"""Check that `_get_response_values` will raise an error when `y_pred` has a
single class with `predict_proba`."""
X, y_two_class = make_classification(n_samples=10, n_classes=2, random_state=0)
y_single_class = np.zeros_like(y_two_class)
classifier = DecisionTreeClassifier().fit(X, y_single_class)
err_msg = (
r"Got predict_proba of shape \(10, 1\), but need classifier with "
r"two classes"
)
with pytest.raises(ValueError, match=err_msg):
_get_response_values(classifier, X, response_method=response_method)
@pytest.mark.parametrize("return_response_method_used", [True, False])
def test_get_response_values_binary_classifier_decision_function(
return_response_method_used,
):
"""Check the behaviour of `_get_response_values` with `decision_function`
and binary classifier."""
X, y = make_classification(
n_samples=10,
n_classes=2,
weights=[0.3, 0.7],
random_state=0,
)
classifier = LogisticRegression().fit(X, y)
response_method = "decision_function"
# default `pos_label`
results = _get_response_values(
classifier,
X,
response_method=response_method,
pos_label=None,
return_response_method_used=return_response_method_used,
)
assert_allclose(results[0], classifier.decision_function(X))
assert results[1] == 1
if return_response_method_used:
assert results[2] == "decision_function"
# when forcing `pos_label=classifier.classes_[0]`
results = _get_response_values(
classifier,
X,
response_method=response_method,
pos_label=classifier.classes_[0],
return_response_method_used=return_response_method_used,
)
assert_allclose(results[0], classifier.decision_function(X) * -1)
assert results[1] == 0
if return_response_method_used:
assert results[2] == "decision_function"
@pytest.mark.parametrize("return_response_method_used", [True, False])
@pytest.mark.parametrize("response_method", ["predict_proba", "predict_log_proba"])
def test_get_response_values_binary_classifier_predict_proba(
return_response_method_used, response_method
):
"""Check that `_get_response_values` with `predict_proba` and binary
classifier."""
X, y = make_classification(
n_samples=10,
n_classes=2,
weights=[0.3, 0.7],
random_state=0,
)
classifier = LogisticRegression().fit(X, y)
# default `pos_label`
results = _get_response_values(
classifier,
X,
response_method=response_method,
pos_label=None,
return_response_method_used=return_response_method_used,
)
assert_allclose(results[0], getattr(classifier, response_method)(X)[:, 1])
assert results[1] == 1
if return_response_method_used:
assert len(results) == 3
assert results[2] == response_method
else:
assert len(results) == 2
# when forcing `pos_label=classifier.classes_[0]`
y_pred, pos_label, *_ = _get_response_values(
classifier,
X,
response_method=response_method,
pos_label=classifier.classes_[0],
return_response_method_used=return_response_method_used,
)
assert_allclose(y_pred, getattr(classifier, response_method)(X)[:, 0])
assert pos_label == 0
@pytest.mark.parametrize(
"estimator, X, y, err_msg, params",
[
(
DecisionTreeRegressor(),
X_binary,
y_binary,
"Expected 'estimator' to be a binary classifier",
{"response_method": "auto"},
),
(
DecisionTreeClassifier(),
X_binary,
y_binary,
r"pos_label=unknown is not a valid label: It should be one of \[0 1\]",
{"response_method": "auto", "pos_label": "unknown"},
),
(
DecisionTreeClassifier(),
X,
y,
"be a binary classifier. Got 3 classes instead.",
{"response_method": "predict_proba"},
),
],
)
def test_get_response_error(estimator, X, y, err_msg, params):
"""Check that we raise the proper error messages in _get_response_values_binary."""
estimator.fit(X, y)
with pytest.raises(ValueError, match=err_msg):
_get_response_values_binary(estimator, X, **params)
@pytest.mark.parametrize("return_response_method_used", [True, False])
def test_get_response_predict_proba(return_response_method_used):
"""Check the behaviour of `_get_response_values_binary` using `predict_proba`."""
classifier = DecisionTreeClassifier().fit(X_binary, y_binary)
results = _get_response_values_binary(
classifier,
X_binary,
response_method="predict_proba",
return_response_method_used=return_response_method_used,
)
assert_allclose(results[0], classifier.predict_proba(X_binary)[:, 1])
assert results[1] == 1
if return_response_method_used:
assert results[2] == "predict_proba"
results = _get_response_values_binary(
classifier,
X_binary,
response_method="predict_proba",
pos_label=0,
return_response_method_used=return_response_method_used,
)
assert_allclose(results[0], classifier.predict_proba(X_binary)[:, 0])
assert results[1] == 0
if return_response_method_used:
assert results[2] == "predict_proba"
@pytest.mark.parametrize("return_response_method_used", [True, False])
def test_get_response_decision_function(return_response_method_used):
"""Check the behaviour of `_get_response_values_binary` using decision_function."""
classifier = LogisticRegression().fit(X_binary, y_binary)
results = _get_response_values_binary(
classifier,
X_binary,
response_method="decision_function",
return_response_method_used=return_response_method_used,
)
assert_allclose(results[0], classifier.decision_function(X_binary))
assert results[1] == 1
if return_response_method_used:
assert results[2] == "decision_function"
results = _get_response_values_binary(
classifier,
X_binary,
response_method="decision_function",
pos_label=0,
return_response_method_used=return_response_method_used,
)
assert_allclose(results[0], classifier.decision_function(X_binary) * -1)
assert results[1] == 0
if return_response_method_used:
assert results[2] == "decision_function"
@pytest.mark.parametrize(
"estimator, response_method",
[
(DecisionTreeClassifier(max_depth=2, random_state=0), "predict_proba"),
(DecisionTreeClassifier(max_depth=2, random_state=0), "predict_log_proba"),
(LogisticRegression(), "decision_function"),
],
)
def test_get_response_values_multiclass(estimator, response_method):
"""Check that we can call `_get_response_values` with a multiclass estimator.
It should return the predictions untouched.
"""
estimator.fit(X, y)
predictions, pos_label = _get_response_values(
estimator, X, response_method=response_method
)
assert pos_label is None
assert predictions.shape == (X.shape[0], len(estimator.classes_))
if response_method == "predict_proba":
assert np.logical_and(predictions >= 0, predictions <= 1).all()
elif response_method == "predict_log_proba":
assert (predictions <= 0.0).all()
def test_get_response_values_with_response_list():
"""Check the behaviour of passing a list of responses to `_get_response_values`."""
classifier = LogisticRegression().fit(X_binary, y_binary)
# it should use `predict_proba`
y_pred, pos_label, response_method = _get_response_values(
classifier,
X_binary,
response_method=["predict_proba", "decision_function"],
return_response_method_used=True,
)
assert_allclose(y_pred, classifier.predict_proba(X_binary)[:, 1])
assert pos_label == 1
assert response_method == "predict_proba"
# it should use `decision_function`
y_pred, pos_label, response_method = _get_response_values(
classifier,
X_binary,
response_method=["decision_function", "predict_proba"],
return_response_method_used=True,
)
assert_allclose(y_pred, classifier.decision_function(X_binary))
assert pos_label == 1
assert response_method == "decision_function"
@pytest.mark.parametrize(
"response_method", ["predict_proba", "decision_function", "predict"]
)
def test_get_response_values_multilabel_indicator(response_method):
X, Y = make_multilabel_classification(random_state=0)
estimator = ClassifierChain(LogisticRegression()).fit(X, Y)
y_pred, pos_label = _get_response_values(
estimator, X, response_method=response_method
)
assert pos_label is None
assert y_pred.shape == Y.shape
if response_method == "predict_proba":
assert np.logical_and(y_pred >= 0, y_pred <= 1).all()
elif response_method == "decision_function":
# values returned by `decision_function` are not bounded in [0, 1]
assert (y_pred < 0).sum() > 0
assert (y_pred > 1).sum() > 0
else: # response_method == "predict"
assert np.logical_or(y_pred == 0, y_pred == 1).all()

View File

@@ -0,0 +1,185 @@
# Author: Tom Dupre la Tour
# Joan Massich <mailsik@gmail.com>
#
# License: BSD 3 clause
from itertools import product
import numpy as np
import pytest
from numpy.testing import assert_array_equal
from sklearn.datasets import load_iris
from sklearn.utils._seq_dataset import (
ArrayDataset32,
ArrayDataset64,
CSRDataset32,
CSRDataset64,
)
from sklearn.utils._testing import assert_allclose
from sklearn.utils.fixes import CSR_CONTAINERS
iris = load_iris()
X64 = iris.data.astype(np.float64)
y64 = iris.target.astype(np.float64)
sample_weight64 = np.arange(y64.size, dtype=np.float64)
X32 = iris.data.astype(np.float32)
y32 = iris.target.astype(np.float32)
sample_weight32 = np.arange(y32.size, dtype=np.float32)
floating = [np.float32, np.float64]
def assert_csr_equal_values(current, expected):
current.eliminate_zeros()
expected.eliminate_zeros()
expected = expected.astype(current.dtype)
assert current.shape[0] == expected.shape[0]
assert current.shape[1] == expected.shape[1]
assert_array_equal(current.data, expected.data)
assert_array_equal(current.indices, expected.indices)
assert_array_equal(current.indptr, expected.indptr)
def _make_dense_dataset(float_dtype):
if float_dtype == np.float32:
return ArrayDataset32(X32, y32, sample_weight32, seed=42)
return ArrayDataset64(X64, y64, sample_weight64, seed=42)
def _make_sparse_dataset(csr_container, float_dtype):
if float_dtype == np.float32:
X, y, sample_weight, csr_dataset = X32, y32, sample_weight32, CSRDataset32
else:
X, y, sample_weight, csr_dataset = X64, y64, sample_weight64, CSRDataset64
X = csr_container(X)
return csr_dataset(X.data, X.indptr, X.indices, y, sample_weight, seed=42)
def _make_dense_datasets():
return [_make_dense_dataset(float_dtype) for float_dtype in floating]
def _make_sparse_datasets():
return [
_make_sparse_dataset(csr_container, float_dtype)
for csr_container, float_dtype in product(CSR_CONTAINERS, floating)
]
def _make_fused_types_datasets():
all_datasets = _make_dense_datasets() + _make_sparse_datasets()
# group dataset by array types to get a tuple (float32, float64)
return (all_datasets[idx : idx + 2] for idx in range(0, len(all_datasets), 2))
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
@pytest.mark.parametrize("dataset", _make_dense_datasets() + _make_sparse_datasets())
def test_seq_dataset_basic_iteration(dataset, csr_container):
NUMBER_OF_RUNS = 5
X_csr64 = csr_container(X64)
for _ in range(NUMBER_OF_RUNS):
# next sample
xi_, yi, swi, idx = dataset._next_py()
xi = csr_container(xi_, shape=(1, X64.shape[1]))
assert_csr_equal_values(xi, X_csr64[[idx]])
assert yi == y64[idx]
assert swi == sample_weight64[idx]
# random sample
xi_, yi, swi, idx = dataset._random_py()
xi = csr_container(xi_, shape=(1, X64.shape[1]))
assert_csr_equal_values(xi, X_csr64[[idx]])
assert yi == y64[idx]
assert swi == sample_weight64[idx]
@pytest.mark.parametrize(
"dense_dataset,sparse_dataset",
[
(
_make_dense_dataset(float_dtype),
_make_sparse_dataset(csr_container, float_dtype),
)
for float_dtype, csr_container in product(floating, CSR_CONTAINERS)
],
)
def test_seq_dataset_shuffle(dense_dataset, sparse_dataset):
# not shuffled
for i in range(5):
_, _, _, idx1 = dense_dataset._next_py()
_, _, _, idx2 = sparse_dataset._next_py()
assert idx1 == i
assert idx2 == i
for i in [132, 50, 9, 18, 58]:
_, _, _, idx1 = dense_dataset._random_py()
_, _, _, idx2 = sparse_dataset._random_py()
assert idx1 == i
assert idx2 == i
seed = 77
dense_dataset._shuffle_py(seed)
sparse_dataset._shuffle_py(seed)
idx_next = [63, 91, 148, 87, 29]
idx_shuffle = [137, 125, 56, 121, 127]
for i, j in zip(idx_next, idx_shuffle):
_, _, _, idx1 = dense_dataset._next_py()
_, _, _, idx2 = sparse_dataset._next_py()
assert idx1 == i
assert idx2 == i
_, _, _, idx1 = dense_dataset._random_py()
_, _, _, idx2 = sparse_dataset._random_py()
assert idx1 == j
assert idx2 == j
@pytest.mark.parametrize("dataset_32,dataset_64", _make_fused_types_datasets())
def test_fused_types_consistency(dataset_32, dataset_64):
NUMBER_OF_RUNS = 5
for _ in range(NUMBER_OF_RUNS):
# next sample
(xi_data32, _, _), yi32, _, _ = dataset_32._next_py()
(xi_data64, _, _), yi64, _, _ = dataset_64._next_py()
assert xi_data32.dtype == np.float32
assert xi_data64.dtype == np.float64
assert_allclose(xi_data64, xi_data32, rtol=1e-5)
assert_allclose(yi64, yi32, rtol=1e-5)
def test_buffer_dtype_mismatch_error():
with pytest.raises(ValueError, match="Buffer dtype mismatch"):
        ArrayDataset64(X32, y32, sample_weight32, seed=42)
with pytest.raises(ValueError, match="Buffer dtype mismatch"):
        ArrayDataset32(X64, y64, sample_weight64, seed=42)
for csr_container in CSR_CONTAINERS:
X_csr32 = csr_container(X32)
X_csr64 = csr_container(X64)
with pytest.raises(ValueError, match="Buffer dtype mismatch"):
CSRDataset64(
X_csr32.data,
X_csr32.indptr,
X_csr32.indices,
y32,
sample_weight32,
seed=42,
            )
with pytest.raises(ValueError, match="Buffer dtype mismatch"):
CSRDataset32(
X_csr64.data,
X_csr64.indptr,
X_csr64.indices,
y64,
sample_weight64,
seed=42,
            )

View File

@@ -0,0 +1,464 @@
import importlib
from collections import namedtuple
import numpy as np
import pytest
from numpy.testing import assert_array_equal
from sklearn._config import config_context, get_config
from sklearn.preprocessing import StandardScaler
from sklearn.utils._set_output import (
ADAPTERS_MANAGER,
ContainerAdapterProtocol,
_get_adapter_from_container,
_get_output_config,
_safe_set_output,
_SetOutputMixin,
_wrap_data_with_container,
check_library_installed,
)
from sklearn.utils.fixes import CSR_CONTAINERS
def test_pandas_adapter():
"""Check pandas adapter has expected behavior."""
pd = pytest.importorskip("pandas")
X_np = np.asarray([[1, 0, 3], [0, 0, 1]])
columns = np.asarray(["f0", "f1", "f2"], dtype=object)
index = np.asarray([0, 1])
X_df_orig = pd.DataFrame([[1, 2], [1, 3]], index=index)
adapter = ADAPTERS_MANAGER.adapters["pandas"]
X_container = adapter.create_container(X_np, X_df_orig, columns=lambda: columns)
assert isinstance(X_container, pd.DataFrame)
assert_array_equal(X_container.columns, columns)
assert_array_equal(X_container.index, index)
# Input dataframe's index does not change
new_columns = np.asarray(["f0", "f1"], dtype=object)
X_df = pd.DataFrame([[1, 2], [1, 3]], index=[10, 12])
new_df = adapter.create_container(X_df, X_df_orig, columns=new_columns)
assert_array_equal(new_df.columns, new_columns)
assert_array_equal(new_df.index, X_df.index)
assert adapter.is_supported_container(X_df)
assert not adapter.is_supported_container(X_np)
    # adapter.rename_columns renames the columns
new_columns = np.array(["a", "c"], dtype=object)
new_df = adapter.rename_columns(X_df, new_columns)
assert_array_equal(new_df.columns, new_columns)
# adapter.hstack stacks the dataframes horizontally.
X_df_1 = pd.DataFrame([[1, 2, 5], [3, 4, 6]], columns=["a", "b", "e"])
X_df_2 = pd.DataFrame([[4], [5]], columns=["c"])
X_stacked = adapter.hstack([X_df_1, X_df_2])
expected_df = pd.DataFrame(
[[1, 2, 5, 4], [3, 4, 6, 5]], columns=["a", "b", "e", "c"]
)
pd.testing.assert_frame_equal(X_stacked, expected_df)
    # check that we properly update the columns even with duplicate column
    # names; this use case can happen when using ColumnTransformer
    # non-regression test for gh-28260
X_df = pd.DataFrame([[1, 2], [1, 3]], columns=["a", "a"])
new_columns = np.array(["x__a", "y__a"], dtype=object)
new_df = adapter.rename_columns(X_df, new_columns)
assert_array_equal(new_df.columns, new_columns)
    # check the behavior of the inplace parameter in `create_container`:
    # with inplace=False we should trigger a copy
X_df = pd.DataFrame([[1, 2], [1, 3]], index=index)
X_output = adapter.create_container(X_df, X_df, columns=["a", "b"], inplace=False)
assert X_output is not X_df
assert list(X_df.columns) == [0, 1]
assert list(X_output.columns) == ["a", "b"]
    # with inplace=True the operation modifies the input dataframe in place
X_df = pd.DataFrame([[1, 2], [1, 3]], index=index)
X_output = adapter.create_container(X_df, X_df, columns=["a", "b"], inplace=True)
assert X_output is X_df
assert list(X_df.columns) == ["a", "b"]
assert list(X_output.columns) == ["a", "b"]
def test_polars_adapter():
"""Check Polars adapter has expected behavior."""
pl = pytest.importorskip("polars")
X_np = np.array([[1, 0, 3], [0, 0, 1]])
columns = ["f1", "f2", "f3"]
X_df_orig = pl.DataFrame(X_np, schema=columns, orient="row")
adapter = ADAPTERS_MANAGER.adapters["polars"]
X_container = adapter.create_container(X_np, X_df_orig, columns=lambda: columns)
assert isinstance(X_container, pl.DataFrame)
assert_array_equal(X_container.columns, columns)
# Update columns with create_container
new_columns = np.asarray(["a", "b", "c"], dtype=object)
new_df = adapter.create_container(X_df_orig, X_df_orig, columns=new_columns)
assert_array_equal(new_df.columns, new_columns)
assert adapter.is_supported_container(X_df_orig)
assert not adapter.is_supported_container(X_np)
    # adapter.rename_columns renames the columns
new_columns = np.array(["a", "c", "g"], dtype=object)
new_df = adapter.rename_columns(X_df_orig, new_columns)
assert_array_equal(new_df.columns, new_columns)
# adapter.hstack stacks the dataframes horizontally.
X_df_1 = pl.DataFrame([[1, 2, 5], [3, 4, 6]], schema=["a", "b", "e"], orient="row")
X_df_2 = pl.DataFrame([[4], [5]], schema=["c"], orient="row")
X_stacked = adapter.hstack([X_df_1, X_df_2])
expected_df = pl.DataFrame(
[[1, 2, 5, 4], [3, 4, 6, 5]], schema=["a", "b", "e", "c"], orient="row"
)
from polars.testing import assert_frame_equal
assert_frame_equal(X_stacked, expected_df)
    # check the behavior of the inplace parameter in `create_container`:
    # with inplace=False we should trigger a copy
X_df = pl.DataFrame([[1, 2], [1, 3]], schema=["a", "b"], orient="row")
X_output = adapter.create_container(X_df, X_df, columns=["c", "d"], inplace=False)
assert X_output is not X_df
assert list(X_df.columns) == ["a", "b"]
assert list(X_output.columns) == ["c", "d"]
    # with inplace=True the operation modifies the input dataframe in place
X_df = pl.DataFrame([[1, 2], [1, 3]], schema=["a", "b"], orient="row")
X_output = adapter.create_container(X_df, X_df, columns=["c", "d"], inplace=True)
assert X_output is X_df
assert list(X_df.columns) == ["c", "d"]
assert list(X_output.columns) == ["c", "d"]
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test__container_error_validation(csr_container):
"""Check errors in _wrap_data_with_container."""
X = np.asarray([[1, 0, 3], [0, 0, 1]])
X_csr = csr_container(X)
match = "The transformer outputs a scipy sparse matrix."
with config_context(transform_output="pandas"):
with pytest.raises(ValueError, match=match):
_wrap_data_with_container("transform", X_csr, X, StandardScaler())
class EstimatorWithoutSetOutputAndWithoutTransform:
pass
class EstimatorNoSetOutputWithTransform:
def transform(self, X, y=None):
return X # pragma: no cover
class EstimatorWithSetOutput(_SetOutputMixin):
def fit(self, X, y=None):
self.n_features_in_ = X.shape[1]
return self
def transform(self, X, y=None):
return X
def get_feature_names_out(self, input_features=None):
return np.asarray([f"X{i}" for i in range(self.n_features_in_)], dtype=object)
def test__safe_set_output():
"""Check _safe_set_output works as expected."""
    # An estimator without transform does not raise when calling set_output.
est = EstimatorWithoutSetOutputAndWithoutTransform()
_safe_set_output(est, transform="pandas")
# Estimator with transform but without set_output will raise
est = EstimatorNoSetOutputWithTransform()
with pytest.raises(ValueError, match="Unable to configure output"):
_safe_set_output(est, transform="pandas")
est = EstimatorWithSetOutput().fit(np.asarray([[1, 2, 3]]))
_safe_set_output(est, transform="pandas")
config = _get_output_config("transform", est)
assert config["dense"] == "pandas"
_safe_set_output(est, transform="default")
config = _get_output_config("transform", est)
assert config["dense"] == "default"
    # transform=None is a no-op, so the config remains "default"
_safe_set_output(est, transform=None)
config = _get_output_config("transform", est)
assert config["dense"] == "default"
class EstimatorNoSetOutputWithTransformNoFeatureNamesOut(_SetOutputMixin):
def transform(self, X, y=None):
return X # pragma: no cover
def test_set_output_mixin():
"""Estimator without get_feature_names_out does not define `set_output`."""
est = EstimatorNoSetOutputWithTransformNoFeatureNamesOut()
assert not hasattr(est, "set_output")
def test__safe_set_output_error():
"""Check transform with invalid config."""
X = np.asarray([[1, 0, 3], [0, 0, 1]])
est = EstimatorWithSetOutput()
_safe_set_output(est, transform="bad")
msg = "output config must be in"
with pytest.raises(ValueError, match=msg):
est.transform(X)
@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
def test_set_output_method(dataframe_lib):
"""Check that the output is a dataframe."""
lib = pytest.importorskip(dataframe_lib)
X = np.asarray([[1, 0, 3], [0, 0, 1]])
est = EstimatorWithSetOutput().fit(X)
# transform=None is a no-op
est2 = est.set_output(transform=None)
assert est2 is est
X_trans_np = est2.transform(X)
assert isinstance(X_trans_np, np.ndarray)
est.set_output(transform=dataframe_lib)
X_trans_pd = est.transform(X)
assert isinstance(X_trans_pd, lib.DataFrame)
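# A minimal sketch (not part of the original test file) of the public
# set_output API exercised above, using a real transformer; requires pandas
# to be installed at transform time.
import numpy as np
from sklearn.preprocessing import StandardScaler

scaler_demo = StandardScaler().set_output(transform="pandas")
X_out_demo = scaler_demo.fit_transform(np.array([[1.0, 2.0], [3.0, 4.0]]))
print(type(X_out_demo))  # <class 'pandas.core.frame.DataFrame'>
print(list(X_out_demo.columns))  # ['x0', 'x1']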
def test_set_output_method_error():
"""Check transform fails with invalid transform."""
X = np.asarray([[1, 0, 3], [0, 0, 1]])
est = EstimatorWithSetOutput().fit(X)
est.set_output(transform="bad")
msg = "output config must be in"
with pytest.raises(ValueError, match=msg):
est.transform(X)
@pytest.mark.parametrize("transform_output", ["pandas", "polars"])
def test__get_output_config(transform_output):
"""Check _get_output_config works as expected."""
# Without a configuration set, the global config is used
global_config = get_config()["transform_output"]
config = _get_output_config("transform")
assert config["dense"] == global_config
with config_context(transform_output=transform_output):
# with estimator=None, the global config is used
config = _get_output_config("transform")
assert config["dense"] == transform_output
est = EstimatorNoSetOutputWithTransform()
config = _get_output_config("transform", est)
assert config["dense"] == transform_output
est = EstimatorWithSetOutput()
        # If the estimator has no config, use the global config
config = _get_output_config("transform", est)
assert config["dense"] == transform_output
# If estimator has a config, use local config
est.set_output(transform="default")
config = _get_output_config("transform", est)
assert config["dense"] == "default"
est.set_output(transform=transform_output)
config = _get_output_config("transform", est)
assert config["dense"] == transform_output
class EstimatorWithSetOutputNoAutoWrap(_SetOutputMixin, auto_wrap_output_keys=None):
def transform(self, X, y=None):
return X
def test_get_output_auto_wrap_false():
"""Check that auto_wrap_output_keys=None does not wrap."""
est = EstimatorWithSetOutputNoAutoWrap()
assert not hasattr(est, "set_output")
X = np.asarray([[1, 0, 3], [0, 0, 1]])
assert X is est.transform(X)
def test_auto_wrap_output_keys_errors_with_incorrect_input():
msg = "auto_wrap_output_keys must be None or a tuple of keys."
with pytest.raises(ValueError, match=msg):
class BadEstimator(_SetOutputMixin, auto_wrap_output_keys="bad_parameter"):
pass
class AnotherMixin:
def __init_subclass__(cls, custom_parameter, **kwargs):
super().__init_subclass__(**kwargs)
cls.custom_parameter = custom_parameter
def test_set_output_mixin_custom_mixin():
"""Check that multiple init_subclasses passes parameters up."""
class BothMixinEstimator(_SetOutputMixin, AnotherMixin, custom_parameter=123):
def transform(self, X, y=None):
return X
def get_feature_names_out(self, input_features=None):
return input_features
est = BothMixinEstimator()
assert est.custom_parameter == 123
assert hasattr(est, "set_output")
def test_set_output_mro():
"""Check that multi-inheritance resolves to the correct class method.
Non-regression test gh-25293.
"""
class Base(_SetOutputMixin):
def transform(self, X):
return "Base" # noqa
class A(Base):
pass
class B(Base):
def transform(self, X):
return "B"
class C(A, B):
pass
assert C().transform(None) == "B"
class EstimatorWithSetOutputIndex(_SetOutputMixin):
def fit(self, X, y=None):
self.n_features_in_ = X.shape[1]
return self
def transform(self, X, y=None):
import pandas as pd
# transform by giving output a new index.
return pd.DataFrame(X.to_numpy(), index=[f"s{i}" for i in range(X.shape[0])])
def get_feature_names_out(self, input_features=None):
return np.asarray([f"X{i}" for i in range(self.n_features_in_)], dtype=object)
def test_set_output_pandas_keep_index():
"""Check that set_output does not override index.
Non-regression test for gh-25730.
"""
pd = pytest.importorskip("pandas")
X = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=[0, 1])
est = EstimatorWithSetOutputIndex().set_output(transform="pandas")
est.fit(X)
X_trans = est.transform(X)
assert_array_equal(X_trans.index, ["s0", "s1"])
class EstimatorReturnTuple(_SetOutputMixin):
def __init__(self, OutputTuple):
self.OutputTuple = OutputTuple
def transform(self, X, y=None):
return self.OutputTuple(X, 2 * X)
def test_set_output_named_tuple_out():
"""Check that namedtuples are kept by default."""
Output = namedtuple("Output", "X, Y")
X = np.asarray([[1, 2, 3]])
est = EstimatorReturnTuple(OutputTuple=Output)
X_trans = est.transform(X)
assert isinstance(X_trans, Output)
assert_array_equal(X_trans.X, X)
assert_array_equal(X_trans.Y, 2 * X)
class EstimatorWithListInput(_SetOutputMixin):
def fit(self, X, y=None):
assert isinstance(X, list)
self.n_features_in_ = len(X[0])
return self
def transform(self, X, y=None):
return X
def get_feature_names_out(self, input_features=None):
return np.asarray([f"X{i}" for i in range(self.n_features_in_)], dtype=object)
@pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
def test_set_output_list_input(dataframe_lib):
"""Check set_output for list input.
Non-regression test for #27037.
"""
lib = pytest.importorskip(dataframe_lib)
X = [[0, 1, 2, 3], [4, 5, 6, 7]]
est = EstimatorWithListInput()
est.set_output(transform=dataframe_lib)
X_out = est.fit(X).transform(X)
assert isinstance(X_out, lib.DataFrame)
assert_array_equal(X_out.columns, ["X0", "X1", "X2", "X3"])
@pytest.mark.parametrize("name", sorted(ADAPTERS_MANAGER.adapters))
def test_adapter_class_has_interface(name):
"""Check adapters have the correct interface."""
assert isinstance(ADAPTERS_MANAGER.adapters[name], ContainerAdapterProtocol)
def test_check_library_installed(monkeypatch):
"""Check import error changed."""
orig_import_module = importlib.import_module
def patched_import_module(name):
if name == "pandas":
raise ImportError()
        return orig_import_module(name, package=None)
monkeypatch.setattr(importlib, "import_module", patched_import_module)
msg = "Setting output container to 'pandas' requires"
with pytest.raises(ImportError, match=msg):
check_library_installed("pandas")
def test_get_adapter_from_container():
"""Check the behavior fo `_get_adapter_from_container`."""
pd = pytest.importorskip("pandas")
X = pd.DataFrame({"a": [1, 2, 3], "b": [10, 20, 100]})
adapter = _get_adapter_from_container(X)
assert adapter.container_lib == "pandas"
err_msg = "The container does not have a registered adapter in scikit-learn."
with pytest.raises(ValueError, match=err_msg):
_get_adapter_from_container(X.to_numpy())

View File

@@ -0,0 +1,65 @@
from collections import defaultdict
import numpy as np
from numpy.testing import assert_array_almost_equal
from sklearn.utils.graph import single_source_shortest_path_length
def floyd_warshall_slow(graph, directed=False):
N = graph.shape[0]
    # set zero entries (missing edges) to infinity
graph[np.where(graph == 0)] = np.inf
# set diagonal to zero
graph.flat[:: N + 1] = 0
if not directed:
graph = np.minimum(graph, graph.T)
for k in range(N):
for i in range(N):
for j in range(N):
graph[i, j] = min(graph[i, j], graph[i, k] + graph[k, j])
graph[np.where(np.isinf(graph))] = 0
return graph
def generate_graph(N=20):
# sparse grid of distances
rng = np.random.RandomState(0)
dist_matrix = rng.random_sample((N, N))
# make symmetric: distances are not direction-dependent
dist_matrix = dist_matrix + dist_matrix.T
# make graph sparse
i = (rng.randint(N, size=N * N // 2), rng.randint(N, size=N * N // 2))
dist_matrix[i] = 0
# set diagonal to zero
dist_matrix.flat[:: N + 1] = 0
return dist_matrix
def test_shortest_path():
dist_matrix = generate_graph(20)
# We compare path length and not costs (-> set distances to 0 or 1)
dist_matrix[dist_matrix != 0] = 1
for directed in (True, False):
if not directed:
dist_matrix = np.minimum(dist_matrix, dist_matrix.T)
graph_py = floyd_warshall_slow(dist_matrix.copy(), directed)
for i in range(dist_matrix.shape[0]):
# Non-reachable nodes have distance 0 in graph_py
dist_dict = defaultdict(int)
dist_dict.update(single_source_shortest_path_length(dist_matrix, i))
for j in range(graph_py[i].shape[0]):
assert_array_almost_equal(dist_dict[j], graph_py[i, j])
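# A minimal sketch (not part of the original test file) of the helper under
# test: on a 3-node path graph given as a dense adjacency matrix, hop counts
# from node 0 are 0, 1 and 2.
import numpy as np
from sklearn.utils.graph import single_source_shortest_path_length

adjacency_demo = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]])
lengths_demo = dict(single_source_shortest_path_length(adjacency_demo, 0))
assert lengths_demo == {0: 0, 1: 1, 2: 2}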

View File

@@ -0,0 +1,40 @@
from threadpoolctl import threadpool_info
from sklearn.utils._show_versions import _get_deps_info, _get_sys_info, show_versions
from sklearn.utils._testing import ignore_warnings
def test_get_sys_info():
sys_info = _get_sys_info()
assert "python" in sys_info
assert "executable" in sys_info
assert "machine" in sys_info
def test_get_deps_info():
with ignore_warnings():
deps_info = _get_deps_info()
assert "pip" in deps_info
assert "setuptools" in deps_info
assert "sklearn" in deps_info
assert "numpy" in deps_info
assert "scipy" in deps_info
assert "Cython" in deps_info
assert "pandas" in deps_info
assert "matplotlib" in deps_info
assert "joblib" in deps_info
def test_show_versions(capsys):
with ignore_warnings():
show_versions()
out, err = capsys.readouterr()
assert "python" in out
assert "numpy" in out
info = threadpool_info()
if info:
assert "threadpoolctl info:" in out

View File

@@ -0,0 +1,998 @@
import numpy as np
import pytest
import scipy.sparse as sp
from numpy.random import RandomState
from numpy.testing import assert_array_almost_equal, assert_array_equal
from scipy import linalg
from sklearn.datasets import make_classification
from sklearn.utils._testing import assert_allclose
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS, LIL_CONTAINERS
from sklearn.utils.sparsefuncs import (
_implicit_column_offset,
count_nonzero,
csc_median_axis_0,
incr_mean_variance_axis,
inplace_column_scale,
inplace_row_scale,
inplace_swap_column,
inplace_swap_row,
mean_variance_axis,
min_max_axis,
)
from sklearn.utils.sparsefuncs_fast import (
assign_rows_csr,
csr_row_norms,
inplace_csr_row_normalize_l1,
inplace_csr_row_normalize_l2,
)
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
@pytest.mark.parametrize("lil_container", LIL_CONTAINERS)
def test_mean_variance_axis0(csc_container, csr_container, lil_container):
X, _ = make_classification(5, 4, random_state=0)
# Sparsify the array a little bit
X[0, 0] = 0
X[2, 1] = 0
X[4, 3] = 0
X_lil = lil_container(X)
X_lil[1, 0] = 0
X[1, 0] = 0
with pytest.raises(TypeError):
mean_variance_axis(X_lil, axis=0)
X_csr = csr_container(X_lil)
X_csc = csc_container(X_lil)
expected_dtypes = [
(np.float32, np.float32),
(np.float64, np.float64),
(np.int32, np.float64),
(np.int64, np.float64),
]
for input_dtype, output_dtype in expected_dtypes:
X_test = X.astype(input_dtype)
for X_sparse in (X_csr, X_csc):
X_sparse = X_sparse.astype(input_dtype)
X_means, X_vars = mean_variance_axis(X_sparse, axis=0)
assert X_means.dtype == output_dtype
assert X_vars.dtype == output_dtype
assert_array_almost_equal(X_means, np.mean(X_test, axis=0))
assert_array_almost_equal(X_vars, np.var(X_test, axis=0))
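# A minimal sketch (not part of the original test file) of the helper
# exercised above: mean_variance_axis computes per-axis statistics without
# densifying, matching numpy on the dense equivalent.
import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import mean_variance_axis

X_demo = sp.csr_matrix(np.array([[0.0, 1.0], [2.0, 0.0], [0.0, 3.0]]))
means_demo, vars_demo = mean_variance_axis(X_demo, axis=0)
assert np.allclose(means_demo, X_demo.toarray().mean(axis=0))
assert np.allclose(vars_demo, X_demo.toarray().var(axis=0))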
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("sparse_constructor", CSC_CONTAINERS + CSR_CONTAINERS)
def test_mean_variance_axis0_precision(dtype, sparse_constructor):
# Check that there's no big loss of precision when the real variance is
# exactly 0. (#19766)
rng = np.random.RandomState(0)
X = np.full(fill_value=100.0, shape=(1000, 1), dtype=dtype)
# Add some missing records which should be ignored:
missing_indices = rng.choice(np.arange(X.shape[0]), 10, replace=False)
X[missing_indices, 0] = np.nan
X = sparse_constructor(X)
# Random positive weights:
sample_weight = rng.rand(X.shape[0]).astype(dtype)
_, var = mean_variance_axis(X, weights=sample_weight, axis=0)
assert var < np.finfo(dtype).eps
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
@pytest.mark.parametrize("lil_container", LIL_CONTAINERS)
def test_mean_variance_axis1(csc_container, csr_container, lil_container):
X, _ = make_classification(5, 4, random_state=0)
# Sparsify the array a little bit
X[0, 0] = 0
X[2, 1] = 0
X[4, 3] = 0
X_lil = lil_container(X)
X_lil[1, 0] = 0
X[1, 0] = 0
with pytest.raises(TypeError):
mean_variance_axis(X_lil, axis=1)
X_csr = csr_container(X_lil)
X_csc = csc_container(X_lil)
expected_dtypes = [
(np.float32, np.float32),
(np.float64, np.float64),
(np.int32, np.float64),
(np.int64, np.float64),
]
for input_dtype, output_dtype in expected_dtypes:
X_test = X.astype(input_dtype)
for X_sparse in (X_csr, X_csc):
X_sparse = X_sparse.astype(input_dtype)
            X_means, X_vars = mean_variance_axis(X_sparse, axis=1)
            assert X_means.dtype == output_dtype
            assert X_vars.dtype == output_dtype
            assert_array_almost_equal(X_means, np.mean(X_test, axis=1))
            assert_array_almost_equal(X_vars, np.var(X_test, axis=1))
@pytest.mark.parametrize(
["Xw", "X", "weights"],
[
([[0, 0, 1], [0, 2, 3]], [[0, 0, 1], [0, 2, 3]], [1, 1, 1]),
([[0, 0, 1], [0, 1, 1]], [[0, 0, 0, 1], [0, 1, 1, 1]], [1, 2, 1]),
([[0, 0, 1], [0, 1, 1]], [[0, 0, 1], [0, 1, 1]], None),
(
[[0, np.nan, 2], [0, np.nan, np.nan]],
[[0, np.nan, 2], [0, np.nan, np.nan]],
[1.0, 1.0, 1.0],
),
(
[[0, 0], [1, np.nan], [2, 0], [0, 3], [np.nan, np.nan], [np.nan, 2]],
[
[0, 0, 0],
[1, 1, np.nan],
[2, 2, 0],
[0, 0, 3],
[np.nan, np.nan, np.nan],
[np.nan, np.nan, 2],
],
[2.0, 1.0],
),
(
[[1, 0, 1], [0, 3, 1]],
[[1, 0, 0, 0, 1], [0, 3, 3, 3, 1]],
np.array([1, 3, 1]),
),
],
)
@pytest.mark.parametrize("sparse_constructor", CSC_CONTAINERS + CSR_CONTAINERS)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_incr_mean_variance_axis_weighted_axis1(
Xw, X, weights, sparse_constructor, dtype
):
axis = 1
Xw_sparse = sparse_constructor(Xw).astype(dtype)
X_sparse = sparse_constructor(X).astype(dtype)
last_mean = np.zeros(np.shape(Xw)[0], dtype=dtype)
last_var = np.zeros_like(last_mean, dtype=dtype)
last_n = np.zeros_like(last_mean, dtype=np.int64)
means0, vars0, n_incr0 = incr_mean_variance_axis(
X=X_sparse,
axis=axis,
last_mean=last_mean,
last_var=last_var,
last_n=last_n,
weights=None,
)
means_w0, vars_w0, n_incr_w0 = incr_mean_variance_axis(
X=Xw_sparse,
axis=axis,
last_mean=last_mean,
last_var=last_var,
last_n=last_n,
weights=weights,
)
assert means_w0.dtype == dtype
assert vars_w0.dtype == dtype
assert n_incr_w0.dtype == dtype
means_simple, vars_simple = mean_variance_axis(X=X_sparse, axis=axis)
assert_array_almost_equal(means0, means_w0)
assert_array_almost_equal(means0, means_simple)
assert_array_almost_equal(vars0, vars_w0)
assert_array_almost_equal(vars0, vars_simple)
assert_array_almost_equal(n_incr0, n_incr_w0)
# check second round for incremental
means1, vars1, n_incr1 = incr_mean_variance_axis(
X=X_sparse,
axis=axis,
last_mean=means0,
last_var=vars0,
last_n=n_incr0,
weights=None,
)
means_w1, vars_w1, n_incr_w1 = incr_mean_variance_axis(
X=Xw_sparse,
axis=axis,
last_mean=means_w0,
last_var=vars_w0,
last_n=n_incr_w0,
weights=weights,
)
assert_array_almost_equal(means1, means_w1)
assert_array_almost_equal(vars1, vars_w1)
assert_array_almost_equal(n_incr1, n_incr_w1)
assert means_w1.dtype == dtype
assert vars_w1.dtype == dtype
assert n_incr_w1.dtype == dtype
@pytest.mark.parametrize(
["Xw", "X", "weights"],
[
([[0, 0, 1], [0, 2, 3]], [[0, 0, 1], [0, 2, 3]], [1, 1]),
([[0, 0, 1], [0, 1, 1]], [[0, 0, 1], [0, 1, 1], [0, 1, 1]], [1, 2]),
([[0, 0, 1], [0, 1, 1]], [[0, 0, 1], [0, 1, 1]], None),
(
[[0, np.nan, 2], [0, np.nan, np.nan]],
[[0, np.nan, 2], [0, np.nan, np.nan]],
[1.0, 1.0],
),
(
[[0, 0, 1, np.nan, 2, 0], [0, 3, np.nan, np.nan, np.nan, 2]],
[
[0, 0, 1, np.nan, 2, 0],
[0, 0, 1, np.nan, 2, 0],
[0, 3, np.nan, np.nan, np.nan, 2],
],
[2.0, 1.0],
),
(
[[1, 0, 1], [0, 0, 1]],
[[1, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1]],
np.array([1, 3]),
),
],
)
@pytest.mark.parametrize("sparse_constructor", CSC_CONTAINERS + CSR_CONTAINERS)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_incr_mean_variance_axis_weighted_axis0(
Xw, X, weights, sparse_constructor, dtype
):
axis = 0
Xw_sparse = sparse_constructor(Xw).astype(dtype)
X_sparse = sparse_constructor(X).astype(dtype)
last_mean = np.zeros(np.size(Xw, 1), dtype=dtype)
last_var = np.zeros_like(last_mean)
last_n = np.zeros_like(last_mean, dtype=np.int64)
means0, vars0, n_incr0 = incr_mean_variance_axis(
X=X_sparse,
axis=axis,
last_mean=last_mean,
last_var=last_var,
last_n=last_n,
weights=None,
)
means_w0, vars_w0, n_incr_w0 = incr_mean_variance_axis(
X=Xw_sparse,
axis=axis,
last_mean=last_mean,
last_var=last_var,
last_n=last_n,
weights=weights,
)
assert means_w0.dtype == dtype
assert vars_w0.dtype == dtype
assert n_incr_w0.dtype == dtype
means_simple, vars_simple = mean_variance_axis(X=X_sparse, axis=axis)
assert_array_almost_equal(means0, means_w0)
assert_array_almost_equal(means0, means_simple)
assert_array_almost_equal(vars0, vars_w0)
assert_array_almost_equal(vars0, vars_simple)
assert_array_almost_equal(n_incr0, n_incr_w0)
# check second round for incremental
means1, vars1, n_incr1 = incr_mean_variance_axis(
X=X_sparse,
axis=axis,
last_mean=means0,
last_var=vars0,
last_n=n_incr0,
weights=None,
)
means_w1, vars_w1, n_incr_w1 = incr_mean_variance_axis(
X=Xw_sparse,
axis=axis,
last_mean=means_w0,
last_var=vars_w0,
last_n=n_incr_w0,
weights=weights,
)
assert_array_almost_equal(means1, means_w1)
assert_array_almost_equal(vars1, vars_w1)
assert_array_almost_equal(n_incr1, n_incr_w1)
assert means_w1.dtype == dtype
assert vars_w1.dtype == dtype
assert n_incr_w1.dtype == dtype
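# A minimal sketch (not part of the original test file) of the weighting
# semantics checked above: along axis=0, integer sample weights are
# equivalent to repeating the corresponding rows of the dense matrix.
import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import incr_mean_variance_axis

Xw_demo = np.array([[0.0, 1.0], [2.0, 3.0]])
mean_demo, var_demo, _ = incr_mean_variance_axis(
    sp.csr_matrix(Xw_demo),
    axis=0,
    last_mean=np.zeros(2),
    last_var=np.zeros(2),
    last_n=np.zeros(2, dtype=np.int64),
    weights=np.array([2.0, 1.0]),  # first row counted twice
)
X_repeated = np.repeat(Xw_demo, [2, 1], axis=0)
assert np.allclose(mean_demo, X_repeated.mean(axis=0))
assert np.allclose(var_demo, X_repeated.var(axis=0))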
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
@pytest.mark.parametrize("lil_container", LIL_CONTAINERS)
def test_incr_mean_variance_axis(csc_container, csr_container, lil_container):
for axis in [0, 1]:
rng = np.random.RandomState(0)
n_features = 50
n_samples = 10
if axis == 0:
data_chunks = [rng.randint(0, 2, size=n_features) for i in range(n_samples)]
else:
data_chunks = [rng.randint(0, 2, size=n_samples) for i in range(n_features)]
# default params for incr_mean_variance
last_mean = np.zeros(n_features) if axis == 0 else np.zeros(n_samples)
last_var = np.zeros_like(last_mean)
last_n = np.zeros_like(last_mean, dtype=np.int64)
# Test errors
X = np.array(data_chunks[0])
X = np.atleast_2d(X)
X = X.T if axis == 1 else X
X_lil = lil_container(X)
X_csr = csr_container(X_lil)
with pytest.raises(TypeError):
incr_mean_variance_axis(
X=axis, axis=last_mean, last_mean=last_var, last_var=last_n
)
with pytest.raises(TypeError):
incr_mean_variance_axis(
X_lil, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
)
# Test _incr_mean_and_var with a 1 row input
X_means, X_vars = mean_variance_axis(X_csr, axis)
X_means_incr, X_vars_incr, n_incr = incr_mean_variance_axis(
X_csr, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
)
assert_array_almost_equal(X_means, X_means_incr)
assert_array_almost_equal(X_vars, X_vars_incr)
        # X.shape[axis] gives the number of samples
assert_array_equal(X.shape[axis], n_incr)
X_csc = csc_container(X_lil)
X_means, X_vars = mean_variance_axis(X_csc, axis)
assert_array_almost_equal(X_means, X_means_incr)
assert_array_almost_equal(X_vars, X_vars_incr)
assert_array_equal(X.shape[axis], n_incr)
# Test _incremental_mean_and_var with whole data
X = np.vstack(data_chunks)
X = X.T if axis == 1 else X
X_lil = lil_container(X)
X_csr = csr_container(X_lil)
X_csc = csc_container(X_lil)
expected_dtypes = [
(np.float32, np.float32),
(np.float64, np.float64),
(np.int32, np.float64),
(np.int64, np.float64),
]
for input_dtype, output_dtype in expected_dtypes:
for X_sparse in (X_csr, X_csc):
X_sparse = X_sparse.astype(input_dtype)
last_mean = last_mean.astype(output_dtype)
last_var = last_var.astype(output_dtype)
X_means, X_vars = mean_variance_axis(X_sparse, axis)
X_means_incr, X_vars_incr, n_incr = incr_mean_variance_axis(
X_sparse,
axis=axis,
last_mean=last_mean,
last_var=last_var,
last_n=last_n,
)
assert X_means_incr.dtype == output_dtype
assert X_vars_incr.dtype == output_dtype
assert_array_almost_equal(X_means, X_means_incr)
assert_array_almost_equal(X_vars, X_vars_incr)
assert_array_equal(X.shape[axis], n_incr)
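# A minimal sketch (not part of the original test file) of the streaming
# pattern checked above: folding chunks into incr_mean_variance_axis one at
# a time reproduces the statistics of the full stacked data.
import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs import incr_mean_variance_axis

rng_demo = np.random.RandomState(0)
chunks = [sp.csr_matrix(rng_demo.rand(20, 3)) for _ in range(5)]
mean, var, n = np.zeros(3), np.zeros(3), np.zeros(3, dtype=np.int64)
for chunk in chunks:
    mean, var, n = incr_mean_variance_axis(
        chunk, axis=0, last_mean=mean, last_var=var, last_n=n
    )
X_full = sp.vstack(chunks).toarray()
assert np.allclose(mean, X_full.mean(axis=0))
assert np.allclose(var, X_full.var(axis=0))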
@pytest.mark.parametrize("sparse_constructor", CSC_CONTAINERS + CSR_CONTAINERS)
def test_incr_mean_variance_axis_dim_mismatch(sparse_constructor):
"""Check that we raise proper error when axis=1 and the dimension mismatch.
Non-regression test for:
https://github.com/scikit-learn/scikit-learn/pull/18655
"""
n_samples, n_features = 60, 4
rng = np.random.RandomState(42)
X = sparse_constructor(rng.rand(n_samples, n_features))
last_mean = np.zeros(n_features)
last_var = np.zeros_like(last_mean)
last_n = np.zeros(last_mean.shape, dtype=np.int64)
kwargs = dict(last_mean=last_mean, last_var=last_var, last_n=last_n)
mean0, var0, _ = incr_mean_variance_axis(X, axis=0, **kwargs)
assert_allclose(np.mean(X.toarray(), axis=0), mean0)
assert_allclose(np.var(X.toarray(), axis=0), var0)
# test ValueError if axis=1 and last_mean.size == n_features
with pytest.raises(ValueError):
incr_mean_variance_axis(X, axis=1, **kwargs)
# test inconsistent shapes of last_mean, last_var, last_n
kwargs = dict(last_mean=last_mean[:-1], last_var=last_var, last_n=last_n)
with pytest.raises(ValueError):
incr_mean_variance_axis(X, axis=0, **kwargs)
@pytest.mark.parametrize(
"X1, X2",
[
(
sp.random(5, 2, density=0.8, format="csr", random_state=0),
sp.random(13, 2, density=0.8, format="csr", random_state=0),
),
(
sp.random(5, 2, density=0.8, format="csr", random_state=0),
sp.hstack(
[
np.full((13, 1), fill_value=np.nan),
sp.random(13, 1, density=0.8, random_state=42),
],
format="csr",
),
),
],
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_incr_mean_variance_axis_equivalence_mean_variance(X1, X2, csr_container):
# non-regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/16448
# check that computing the incremental mean and variance is equivalent to
# computing the mean and variance on the stacked dataset.
X1 = csr_container(X1)
X2 = csr_container(X2)
axis = 0
last_mean, last_var = np.zeros(X1.shape[1]), np.zeros(X1.shape[1])
last_n = np.zeros(X1.shape[1], dtype=np.int64)
updated_mean, updated_var, updated_n = incr_mean_variance_axis(
X1, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
)
updated_mean, updated_var, updated_n = incr_mean_variance_axis(
X2, axis=axis, last_mean=updated_mean, last_var=updated_var, last_n=updated_n
)
X = sp.vstack([X1, X2])
assert_allclose(updated_mean, np.nanmean(X.toarray(), axis=axis))
assert_allclose(updated_var, np.nanvar(X.toarray(), axis=axis))
assert_allclose(updated_n, np.count_nonzero(~np.isnan(X.toarray()), axis=0))
def test_incr_mean_variance_no_new_n():
# check the behaviour when we update the variance with an empty matrix
axis = 0
X1 = sp.random(5, 1, density=0.8, random_state=0).tocsr()
X2 = sp.random(0, 1, density=0.8, random_state=0).tocsr()
last_mean, last_var = np.zeros(X1.shape[1]), np.zeros(X1.shape[1])
last_n = np.zeros(X1.shape[1], dtype=np.int64)
last_mean, last_var, last_n = incr_mean_variance_axis(
X1, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
)
    # update the statistics with an empty matrix, which should be ignored
updated_mean, updated_var, updated_n = incr_mean_variance_axis(
X2, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
)
assert_allclose(updated_mean, last_mean)
assert_allclose(updated_var, last_var)
assert_allclose(updated_n, last_n)
def test_incr_mean_variance_n_float():
# check the behaviour when last_n is just a number
axis = 0
X = sp.random(5, 2, density=0.8, random_state=0).tocsr()
last_mean, last_var = np.zeros(X.shape[1]), np.zeros(X.shape[1])
last_n = 0
_, _, new_n = incr_mean_variance_axis(
X, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
)
assert_allclose(new_n, np.full(X.shape[1], X.shape[0]))
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("sparse_constructor", CSC_CONTAINERS + CSR_CONTAINERS)
def test_incr_mean_variance_axis_ignore_nan(axis, sparse_constructor):
old_means = np.array([535.0, 535.0, 535.0, 535.0])
old_variances = np.array([4225.0, 4225.0, 4225.0, 4225.0])
old_sample_count = np.array([2, 2, 2, 2], dtype=np.int64)
X = sparse_constructor(
np.array([[170, 170, 170, 170], [430, 430, 430, 430], [300, 300, 300, 300]])
)
X_nan = sparse_constructor(
np.array(
[
[170, np.nan, 170, 170],
[np.nan, 170, 430, 430],
[430, 430, np.nan, 300],
[300, 300, 300, np.nan],
]
)
)
    # we avoid creating specific data for axis 0 and 1: transposing the data is
# enough.
if axis:
X = X.T
X_nan = X_nan.T
# take a copy of the old statistics since they are modified in place.
X_means, X_vars, X_sample_count = incr_mean_variance_axis(
X,
axis=axis,
last_mean=old_means.copy(),
last_var=old_variances.copy(),
last_n=old_sample_count.copy(),
)
X_nan_means, X_nan_vars, X_nan_sample_count = incr_mean_variance_axis(
X_nan,
axis=axis,
last_mean=old_means.copy(),
last_var=old_variances.copy(),
last_n=old_sample_count.copy(),
)
assert_allclose(X_nan_means, X_means)
assert_allclose(X_nan_vars, X_vars)
assert_allclose(X_nan_sample_count, X_sample_count)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_mean_variance_illegal_axis(csr_container):
X, _ = make_classification(5, 4, random_state=0)
# Sparsify the array a little bit
X[0, 0] = 0
X[2, 1] = 0
X[4, 3] = 0
X_csr = csr_container(X)
with pytest.raises(ValueError):
mean_variance_axis(X_csr, axis=-3)
with pytest.raises(ValueError):
mean_variance_axis(X_csr, axis=2)
with pytest.raises(ValueError):
mean_variance_axis(X_csr, axis=-1)
with pytest.raises(ValueError):
incr_mean_variance_axis(
X_csr, axis=-3, last_mean=None, last_var=None, last_n=None
)
with pytest.raises(ValueError):
incr_mean_variance_axis(
X_csr, axis=2, last_mean=None, last_var=None, last_n=None
)
with pytest.raises(ValueError):
incr_mean_variance_axis(
X_csr, axis=-1, last_mean=None, last_var=None, last_n=None
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_densify_rows(csr_container):
for dtype in (np.float32, np.float64):
X = csr_container(
[[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=dtype
)
X_rows = np.array([0, 2, 3], dtype=np.intp)
out = np.ones((6, X.shape[1]), dtype=dtype)
out_rows = np.array([1, 3, 4], dtype=np.intp)
expect = np.ones_like(out)
expect[out_rows] = X[X_rows, :].toarray()
assign_rows_csr(X, X_rows, out_rows, out)
assert_array_equal(out, expect)
def test_inplace_column_scale():
rng = np.random.RandomState(0)
X = sp.rand(100, 200, 0.05)
Xr = X.tocsr()
Xc = X.tocsc()
XA = X.toarray()
scale = rng.rand(200)
XA *= scale
inplace_column_scale(Xc, scale)
inplace_column_scale(Xr, scale)
assert_array_almost_equal(Xr.toarray(), Xc.toarray())
assert_array_almost_equal(XA, Xc.toarray())
assert_array_almost_equal(XA, Xr.toarray())
with pytest.raises(TypeError):
inplace_column_scale(X.tolil(), scale)
X = X.astype(np.float32)
scale = scale.astype(np.float32)
Xr = X.tocsr()
Xc = X.tocsc()
XA = X.toarray()
XA *= scale
inplace_column_scale(Xc, scale)
inplace_column_scale(Xr, scale)
assert_array_almost_equal(Xr.toarray(), Xc.toarray())
assert_array_almost_equal(XA, Xc.toarray())
assert_array_almost_equal(XA, Xr.toarray())
with pytest.raises(TypeError):
inplace_column_scale(X.tolil(), scale)
def test_inplace_row_scale():
rng = np.random.RandomState(0)
X = sp.rand(100, 200, 0.05)
Xr = X.tocsr()
Xc = X.tocsc()
XA = X.toarray()
scale = rng.rand(100)
XA *= scale.reshape(-1, 1)
inplace_row_scale(Xc, scale)
inplace_row_scale(Xr, scale)
assert_array_almost_equal(Xr.toarray(), Xc.toarray())
assert_array_almost_equal(XA, Xc.toarray())
assert_array_almost_equal(XA, Xr.toarray())
with pytest.raises(TypeError):
        inplace_row_scale(X.tolil(), scale)
X = X.astype(np.float32)
scale = scale.astype(np.float32)
Xr = X.tocsr()
Xc = X.tocsc()
XA = X.toarray()
XA *= scale.reshape(-1, 1)
inplace_row_scale(Xc, scale)
inplace_row_scale(Xr, scale)
assert_array_almost_equal(Xr.toarray(), Xc.toarray())
assert_array_almost_equal(XA, Xc.toarray())
assert_array_almost_equal(XA, Xr.toarray())
with pytest.raises(TypeError):
        inplace_row_scale(X.tolil(), scale)
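# For CSR matrices the stored values are laid out row by row, so an
# in-place row scaling reduces to one vectorized multiply. A hypothetical
# sketch of the operation the two tests above exercise (not the actual
# Cython helper):
def _inplace_row_scale_csr_sketch(X_csr, scale):
    # np.diff(indptr) gives the number of stored values in each row
    X_csr.data *= np.repeat(scale, np.diff(X_csr.indptr))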
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_inplace_swap_row(csc_container, csr_container):
X = np.array(
[[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64
)
X_csr = csr_container(X)
X_csc = csc_container(X)
swap = linalg.get_blas_funcs(("swap",), (X,))
swap = swap[0]
X[0], X[-1] = swap(X[0], X[-1])
inplace_swap_row(X_csr, 0, -1)
inplace_swap_row(X_csc, 0, -1)
assert_array_equal(X_csr.toarray(), X_csc.toarray())
assert_array_equal(X, X_csc.toarray())
assert_array_equal(X, X_csr.toarray())
X[2], X[3] = swap(X[2], X[3])
inplace_swap_row(X_csr, 2, 3)
inplace_swap_row(X_csc, 2, 3)
assert_array_equal(X_csr.toarray(), X_csc.toarray())
assert_array_equal(X, X_csc.toarray())
assert_array_equal(X, X_csr.toarray())
with pytest.raises(TypeError):
inplace_swap_row(X_csr.tolil())
X = np.array(
[[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float32
)
X_csr = csr_container(X)
X_csc = csc_container(X)
swap = linalg.get_blas_funcs(("swap",), (X,))
swap = swap[0]
X[0], X[-1] = swap(X[0], X[-1])
inplace_swap_row(X_csr, 0, -1)
inplace_swap_row(X_csc, 0, -1)
assert_array_equal(X_csr.toarray(), X_csc.toarray())
assert_array_equal(X, X_csc.toarray())
assert_array_equal(X, X_csr.toarray())
X[2], X[3] = swap(X[2], X[3])
inplace_swap_row(X_csr, 2, 3)
inplace_swap_row(X_csc, 2, 3)
assert_array_equal(X_csr.toarray(), X_csc.toarray())
assert_array_equal(X, X_csc.toarray())
assert_array_equal(X, X_csr.toarray())
with pytest.raises(TypeError):
inplace_swap_row(X_csr.tolil())
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_inplace_swap_column(csc_container, csr_container):
X = np.array(
[[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64
)
X_csr = csr_container(X)
X_csc = csc_container(X)
swap = linalg.get_blas_funcs(("swap",), (X,))
swap = swap[0]
X[:, 0], X[:, -1] = swap(X[:, 0], X[:, -1])
inplace_swap_column(X_csr, 0, -1)
inplace_swap_column(X_csc, 0, -1)
assert_array_equal(X_csr.toarray(), X_csc.toarray())
assert_array_equal(X, X_csc.toarray())
assert_array_equal(X, X_csr.toarray())
X[:, 0], X[:, 1] = swap(X[:, 0], X[:, 1])
inplace_swap_column(X_csr, 0, 1)
inplace_swap_column(X_csc, 0, 1)
assert_array_equal(X_csr.toarray(), X_csc.toarray())
assert_array_equal(X, X_csc.toarray())
assert_array_equal(X, X_csr.toarray())
with pytest.raises(TypeError):
inplace_swap_column(X_csr.tolil())
X = np.array(
[[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float32
)
X_csr = csr_container(X)
X_csc = csc_container(X)
swap = linalg.get_blas_funcs(("swap",), (X,))
swap = swap[0]
X[:, 0], X[:, -1] = swap(X[:, 0], X[:, -1])
inplace_swap_column(X_csr, 0, -1)
inplace_swap_column(X_csc, 0, -1)
assert_array_equal(X_csr.toarray(), X_csc.toarray())
assert_array_equal(X, X_csc.toarray())
assert_array_equal(X, X_csr.toarray())
X[:, 0], X[:, 1] = swap(X[:, 0], X[:, 1])
inplace_swap_column(X_csr, 0, 1)
inplace_swap_column(X_csc, 0, 1)
assert_array_equal(X_csr.toarray(), X_csc.toarray())
assert_array_equal(X, X_csc.toarray())
assert_array_equal(X, X_csr.toarray())
with pytest.raises(TypeError):
inplace_swap_column(X_csr.tolil())
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("axis", [0, 1, None])
@pytest.mark.parametrize("sparse_format", CSC_CONTAINERS + CSR_CONTAINERS)
@pytest.mark.parametrize(
"missing_values, min_func, max_func, ignore_nan",
[(0, np.min, np.max, False), (np.nan, np.nanmin, np.nanmax, True)],
)
@pytest.mark.parametrize("large_indices", [True, False])
def test_min_max(
dtype,
axis,
sparse_format,
missing_values,
min_func,
max_func,
ignore_nan,
large_indices,
):
X = np.array(
[
[0, 3, 0],
[2, -1, missing_values],
[0, 0, 0],
[9, missing_values, 7],
[4, 0, 5],
],
dtype=dtype,
)
X_sparse = sparse_format(X)
if large_indices:
X_sparse.indices = X_sparse.indices.astype("int64")
X_sparse.indptr = X_sparse.indptr.astype("int64")
mins_sparse, maxs_sparse = min_max_axis(X_sparse, axis=axis, ignore_nan=ignore_nan)
assert_array_equal(mins_sparse, min_func(X, axis=axis))
assert_array_equal(maxs_sparse, max_func(X, axis=axis))
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_min_max_axis_errors(csc_container, csr_container):
X = np.array(
[[0, 3, 0], [2, -1, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64
)
X_csr = csr_container(X)
X_csc = csc_container(X)
with pytest.raises(TypeError):
min_max_axis(X_csr.tolil(), axis=0)
with pytest.raises(ValueError):
min_max_axis(X_csr, axis=2)
with pytest.raises(ValueError):
min_max_axis(X_csc, axis=-3)
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_count_nonzero(csc_container, csr_container):
X = np.array(
[[0, 3, 0], [2, -1, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64
)
X_csr = csr_container(X)
X_csc = csc_container(X)
X_nonzero = X != 0
sample_weight = [0.5, 0.2, 0.3, 0.1, 0.1]
X_nonzero_weighted = X_nonzero * np.array(sample_weight)[:, None]
for axis in [0, 1, -1, -2, None]:
assert_array_almost_equal(
count_nonzero(X_csr, axis=axis), X_nonzero.sum(axis=axis)
)
assert_array_almost_equal(
count_nonzero(X_csr, axis=axis, sample_weight=sample_weight),
X_nonzero_weighted.sum(axis=axis),
)
with pytest.raises(TypeError):
count_nonzero(X_csc)
with pytest.raises(ValueError):
count_nonzero(X_csr, axis=2)
assert count_nonzero(X_csr, axis=0).dtype == count_nonzero(X_csr, axis=1).dtype
assert (
count_nonzero(X_csr, axis=0, sample_weight=sample_weight).dtype
== count_nonzero(X_csr, axis=1, sample_weight=sample_weight).dtype
)
# Check dtypes with large sparse matrices too
# XXX: test fails on 32bit (Windows/Linux)
try:
X_csr.indices = X_csr.indices.astype(np.int64)
X_csr.indptr = X_csr.indptr.astype(np.int64)
assert count_nonzero(X_csr, axis=0).dtype == count_nonzero(X_csr, axis=1).dtype
assert (
count_nonzero(X_csr, axis=0, sample_weight=sample_weight).dtype
== count_nonzero(X_csr, axis=1, sample_weight=sample_weight).dtype
)
except TypeError as e:
assert "according to the rule 'safe'" in e.args[0] and np.intp().nbytes < 8, e
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_csc_row_median(csc_container, csr_container):
    # Test that csc_median_axis_0 actually calculates the median.
# Test that it gives the same output when X is dense.
rng = np.random.RandomState(0)
X = rng.rand(100, 50)
dense_median = np.median(X, axis=0)
csc = csc_container(X)
sparse_median = csc_median_axis_0(csc)
assert_array_equal(sparse_median, dense_median)
# Test that it gives the same output when X is sparse
X = rng.rand(51, 100)
X[X < 0.7] = 0.0
ind = rng.randint(0, 50, 10)
X[ind] = -X[ind]
csc = csc_container(X)
dense_median = np.median(X, axis=0)
sparse_median = csc_median_axis_0(csc)
assert_array_equal(sparse_median, dense_median)
# Test for toy data.
X = [[0, -2], [-1, -1], [1, 0], [2, 1]]
csc = csc_container(X)
assert_array_equal(csc_median_axis_0(csc), np.array([0.5, -0.5]))
X = [[0, -2], [-1, -5], [1, -3]]
csc = csc_container(X)
assert_array_equal(csc_median_axis_0(csc), np.array([0.0, -3]))
# Test that it raises an Error for non-csc matrices.
with pytest.raises(TypeError):
csc_median_axis_0(csr_container(X))
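# Conceptually, the median of a CSC column must account for the implicit
# zeros that are never stored. A sketch under that assumption (the real
# csc_median_axis_0 avoids materializing the dense column):
def _sparse_column_median_sketch(stored_values, n_rows):
    full = np.zeros(n_rows, dtype=np.float64)
    full[: len(stored_values)] = stored_values
    return np.median(full)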
@pytest.mark.parametrize(
"inplace_csr_row_normalize",
(inplace_csr_row_normalize_l1, inplace_csr_row_normalize_l2),
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_inplace_normalize(csr_container, inplace_csr_row_normalize):
if csr_container is sp.csr_matrix:
ones = np.ones((10, 1))
else:
ones = np.ones(10)
rs = RandomState(10)
for dtype in (np.float64, np.float32):
X = rs.randn(10, 5).astype(dtype)
X_csr = csr_container(X)
for index_dtype in [np.int32, np.int64]:
# csr_matrix will use int32 indices by default,
# up-casting those to int64 when necessary
if index_dtype is np.int64:
X_csr.indptr = X_csr.indptr.astype(index_dtype)
X_csr.indices = X_csr.indices.astype(index_dtype)
assert X_csr.indices.dtype == index_dtype
assert X_csr.indptr.dtype == index_dtype
inplace_csr_row_normalize(X_csr)
assert X_csr.dtype == dtype
if inplace_csr_row_normalize is inplace_csr_row_normalize_l2:
X_csr.data **= 2
assert_array_almost_equal(np.abs(X_csr).sum(axis=1), ones)
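# A vectorized sketch of the in-place L2 row normalization exercised above,
# assuming a scipy.sparse CSR input; all-zero rows are left untouched.
# Illustrative, not the Cython routine itself.
def _inplace_row_normalize_l2_sketch(X_csr):
    norms = np.sqrt(np.asarray(X_csr.multiply(X_csr).sum(axis=1)).ravel())
    norms[norms == 0] = 1.0  # avoid dividing empty rows by zero
    X_csr.data /= np.repeat(norms, np.diff(X_csr.indptr))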
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_csr_row_norms(dtype):
# checks that csr_row_norms returns the same output as
    # scipy.sparse.linalg.norm, and that the dtype is the same as X.dtype.
X = sp.random(100, 10, format="csr", dtype=dtype, random_state=42)
scipy_norms = sp.linalg.norm(X, axis=1) ** 2
norms = csr_row_norms(X)
assert norms.dtype == dtype
rtol = 1e-6 if dtype == np.float32 else 1e-7
assert_allclose(norms, scipy_norms, rtol=rtol)
@pytest.fixture(scope="module", params=CSR_CONTAINERS + CSC_CONTAINERS)
def centered_matrices(request):
"""Returns equivalent tuple[sp.linalg.LinearOperator, np.ndarray]."""
sparse_container = request.param
random_state = np.random.default_rng(42)
X_sparse = sparse_container(
sp.random(500, 100, density=0.1, format="csr", random_state=random_state)
)
X_dense = X_sparse.toarray()
mu = np.asarray(X_sparse.mean(axis=0)).ravel()
X_sparse_centered = _implicit_column_offset(X_sparse, mu)
X_dense_centered = X_dense - mu
return X_sparse_centered, X_dense_centered
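# The fixture above builds an implicitly centered operator. A hedged sketch
# of the idea: represent X - 1 @ mu.T as a LinearOperator so the sparse X is
# never densified (the names here are illustrative, not the private helper):
def _implicit_column_offset_sketch(X, mu):
    from scipy.sparse.linalg import LinearOperator
    return LinearOperator(
        shape=X.shape,
        matvec=lambda v: X @ v - mu @ v,  # (X - 1 mu^T) v = X v - (mu . v) 1
        matmat=lambda V: X @ V - mu @ V,
        rmatvec=lambda y: X.T @ y - mu * y.sum(),  # X^T y - mu * sum(y)
        rmatmat=lambda Y: X.T @ Y - np.outer(mu, Y.sum(axis=0)),
        dtype=X.dtype,
    )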
def test_implicit_center_matmat(global_random_seed, centered_matrices):
X_sparse_centered, X_dense_centered = centered_matrices
rng = np.random.default_rng(global_random_seed)
Y = rng.standard_normal((X_dense_centered.shape[1], 50))
assert_allclose(X_dense_centered @ Y, X_sparse_centered.matmat(Y))
assert_allclose(X_dense_centered @ Y, X_sparse_centered @ Y)
def test_implicit_center_matvec(global_random_seed, centered_matrices):
X_sparse_centered, X_dense_centered = centered_matrices
rng = np.random.default_rng(global_random_seed)
y = rng.standard_normal(X_dense_centered.shape[1])
assert_allclose(X_dense_centered @ y, X_sparse_centered.matvec(y))
assert_allclose(X_dense_centered @ y, X_sparse_centered @ y)
def test_implicit_center_rmatmat(global_random_seed, centered_matrices):
X_sparse_centered, X_dense_centered = centered_matrices
rng = np.random.default_rng(global_random_seed)
Y = rng.standard_normal((X_dense_centered.shape[0], 50))
assert_allclose(X_dense_centered.T @ Y, X_sparse_centered.rmatmat(Y))
assert_allclose(X_dense_centered.T @ Y, X_sparse_centered.T @ Y)
def test_implicit_center_rmatvec(global_random_seed, centered_matrices):
X_sparse_centered, X_dense_centered = centered_matrices
rng = np.random.default_rng(global_random_seed)
y = rng.standard_normal(X_dense_centered.shape[0])
assert_allclose(X_dense_centered.T @ y, X_sparse_centered.rmatvec(y))
assert_allclose(X_dense_centered.T @ y, X_sparse_centered.T @ y)

View File

@@ -0,0 +1,98 @@
import numpy as np
from numpy.testing import assert_allclose
from pytest import approx
from sklearn.utils.stats import _weighted_percentile
def test_weighted_percentile():
y = np.empty(102, dtype=np.float64)
y[:50] = 0
y[-51:] = 2
y[-1] = 100000
y[50] = 1
sw = np.ones(102, dtype=np.float64)
sw[-1] = 0.0
score = _weighted_percentile(y, sw, 50)
assert approx(score) == 1
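# The tests in this file pin down a "lower" weighted percentile: sort the
# values, accumulate the sorted weights, and take the first value whose
# cumulative weight reaches the requested fraction of the total. A NumPy
# sketch of that definition (illustrative, not the private helper):
def _weighted_percentile_sketch(values, weights, percentile=50):
    order = np.argsort(values)
    cdf = np.cumsum(np.asarray(weights)[order])
    target = percentile / 100.0 * cdf[-1]
    if target == 0:
        target = np.nextafter(target, 1)  # skip any zero-weight prefix
    idx = min(np.searchsorted(cdf, target), len(values) - 1)
    return np.asarray(values)[order][idx]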
def test_weighted_percentile_equal():
y = np.empty(102, dtype=np.float64)
y.fill(0.0)
sw = np.ones(102, dtype=np.float64)
sw[-1] = 0.0
score = _weighted_percentile(y, sw, 50)
assert score == 0
def test_weighted_percentile_zero_weight():
y = np.empty(102, dtype=np.float64)
y.fill(1.0)
sw = np.ones(102, dtype=np.float64)
sw.fill(0.0)
score = _weighted_percentile(y, sw, 50)
assert approx(score) == 1.0
def test_weighted_percentile_zero_weight_zero_percentile():
y = np.array([0, 1, 2, 3, 4, 5])
sw = np.array([0, 0, 1, 1, 1, 0])
score = _weighted_percentile(y, sw, 0)
assert approx(score) == 2
score = _weighted_percentile(y, sw, 50)
assert approx(score) == 3
score = _weighted_percentile(y, sw, 100)
assert approx(score) == 4
def test_weighted_median_equal_weights():
    # Check that the weighted percentile at 50 equals the median when weights are equal
rng = np.random.RandomState(0)
# Odd size as _weighted_percentile takes lower weighted percentile
x = rng.randint(10, size=11)
weights = np.ones(x.shape)
median = np.median(x)
w_median = _weighted_percentile(x, weights)
assert median == approx(w_median)
def test_weighted_median_integer_weights():
    # Check that the weighted percentile at 50 matches the median of the
    # manually repeated (integer-weighted) data
rng = np.random.RandomState(0)
x = rng.randint(20, size=10)
weights = rng.choice(5, size=10)
x_manual = np.repeat(x, weights)
median = np.median(x_manual)
w_median = _weighted_percentile(x, weights)
assert median == approx(w_median)
def test_weighted_percentile_2d():
    # Check the case where the array is 2D and sample_weight is 1D
rng = np.random.RandomState(0)
x1 = rng.randint(10, size=10)
w1 = rng.choice(5, size=10)
x2 = rng.randint(20, size=10)
x_2d = np.vstack((x1, x2)).T
w_median = _weighted_percentile(x_2d, w1)
p_axis_0 = [_weighted_percentile(x_2d[:, i], w1) for i in range(x_2d.shape[1])]
assert_allclose(w_median, p_axis_0)
    # Check when array and sample_weight are both 2D
w2 = rng.choice(5, size=10)
w_2d = np.vstack((w1, w2)).T
w_median = _weighted_percentile(x_2d, w_2d)
p_axis_0 = [
_weighted_percentile(x_2d[:, i], w_2d[:, i]) for i in range(x_2d.shape[1])
]
assert_allclose(w_median, p_axis_0)

View File

@@ -0,0 +1,47 @@
import pytest
from sklearn.base import BaseEstimator
from sklearn.utils._tags import (
_DEFAULT_TAGS,
_safe_tags,
)
class NoTagsEstimator:
pass
class MoreTagsEstimator:
def _more_tags(self):
return {"allow_nan": True}
@pytest.mark.parametrize(
"estimator, err_msg",
[
(BaseEstimator(), "The key xxx is not defined in _get_tags"),
(NoTagsEstimator(), "The key xxx is not defined in _DEFAULT_TAGS"),
],
)
def test_safe_tags_error(estimator, err_msg):
# Check that safe_tags raises error in ambiguous case.
with pytest.raises(ValueError, match=err_msg):
_safe_tags(estimator, key="xxx")
@pytest.mark.parametrize(
"estimator, key, expected_results",
[
(NoTagsEstimator(), None, _DEFAULT_TAGS),
(NoTagsEstimator(), "allow_nan", _DEFAULT_TAGS["allow_nan"]),
(MoreTagsEstimator(), None, {**_DEFAULT_TAGS, **{"allow_nan": True}}),
(MoreTagsEstimator(), "allow_nan", True),
(BaseEstimator(), None, _DEFAULT_TAGS),
(BaseEstimator(), "allow_nan", _DEFAULT_TAGS["allow_nan"]),
(BaseEstimator(), "allow_nan", _DEFAULT_TAGS["allow_nan"]),
],
)
def test_safe_tags_no_get_tags(estimator, key, expected_results):
# check the behaviour of _safe_tags when an estimator does not implement
# _get_tags
assert _safe_tags(estimator, key=key) == expected_results
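# A hypothetical re-implementation of the fallback logic these tests pin
# down: prefer the estimator's _get_tags(), otherwise merge _more_tags()
# into the defaults, and raise for unknown keys (sketch only):
def _safe_tags_sketch(estimator, key=None):
    if hasattr(estimator, "_get_tags"):
        tags, origin = estimator._get_tags(), "_get_tags"
    else:
        tags, origin = dict(_DEFAULT_TAGS), "_DEFAULT_TAGS"
        if hasattr(estimator, "_more_tags"):
            tags.update(estimator._more_tags())
    if key is None:
        return tags
    if key not in tags:
        raise ValueError(f"The key {key} is not defined in {origin}.")
    return tags[key]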

View File

@@ -0,0 +1,923 @@
import atexit
import os
import unittest
import warnings
import numpy as np
import pytest
from scipy import sparse
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils._testing import (
TempMemmap,
_convert_container,
_delete_folder,
_get_warnings_filters_info_list,
assert_allclose,
assert_allclose_dense_sparse,
assert_no_warnings,
assert_raise_message,
assert_raises,
assert_raises_regex,
assert_run_python_script_without_output,
check_docstring_parameters,
create_memmap_backed_data,
ignore_warnings,
raises,
set_random_state,
turn_warnings_into_errors,
)
from sklearn.utils.deprecation import deprecated
from sklearn.utils.fixes import (
_IS_WASM,
CSC_CONTAINERS,
CSR_CONTAINERS,
parse_version,
sp_version,
)
from sklearn.utils.metaestimators import available_if
def test_set_random_state():
lda = LinearDiscriminantAnalysis()
tree = DecisionTreeClassifier()
# Linear Discriminant Analysis doesn't have random state: smoke test
set_random_state(lda, 3)
set_random_state(tree, 3)
assert tree.random_state == 3
@pytest.mark.parametrize("csr_container", CSC_CONTAINERS)
def test_assert_allclose_dense_sparse(csr_container):
x = np.arange(9).reshape(3, 3)
msg = "Not equal to tolerance "
y = csr_container(x)
for X in [x, y]:
# basic compare
with pytest.raises(AssertionError, match=msg):
assert_allclose_dense_sparse(X, X * 2)
assert_allclose_dense_sparse(X, X)
with pytest.raises(ValueError, match="Can only compare two sparse"):
assert_allclose_dense_sparse(x, y)
A = sparse.diags(np.ones(5), offsets=0).tocsr()
B = csr_container(np.ones((1, 5)))
with pytest.raises(AssertionError, match="Arrays are not equal"):
assert_allclose_dense_sparse(B, A)
def test_assert_raises_msg():
with assert_raises_regex(AssertionError, "Hello world"):
with assert_raises(ValueError, msg="Hello world"):
pass
def test_assert_raise_message():
def _raise_ValueError(message):
raise ValueError(message)
def _no_raise():
pass
assert_raise_message(ValueError, "test", _raise_ValueError, "test")
assert_raises(
AssertionError,
assert_raise_message,
ValueError,
"something else",
_raise_ValueError,
"test",
)
assert_raises(
ValueError,
assert_raise_message,
TypeError,
"something else",
_raise_ValueError,
"test",
)
assert_raises(AssertionError, assert_raise_message, ValueError, "test", _no_raise)
# multiple exceptions in a tuple
assert_raises(
AssertionError,
assert_raise_message,
(ValueError, AttributeError),
"test",
_no_raise,
)
def test_ignore_warning():
    # This checks that the ignore_warnings decorator and context manager work
    # as expected
def _warning_function():
warnings.warn("deprecation warning", DeprecationWarning)
def _multiple_warning_function():
warnings.warn("deprecation warning", DeprecationWarning)
warnings.warn("deprecation warning")
# Check the function directly
assert_no_warnings(ignore_warnings(_warning_function))
assert_no_warnings(ignore_warnings(_warning_function, category=DeprecationWarning))
with pytest.warns(DeprecationWarning):
ignore_warnings(_warning_function, category=UserWarning)()
with pytest.warns() as record:
ignore_warnings(_multiple_warning_function, category=FutureWarning)()
assert len(record) == 2
assert isinstance(record[0].message, DeprecationWarning)
assert isinstance(record[1].message, UserWarning)
with pytest.warns() as record:
ignore_warnings(_multiple_warning_function, category=UserWarning)()
assert len(record) == 1
assert isinstance(record[0].message, DeprecationWarning)
assert_no_warnings(
ignore_warnings(_warning_function, category=(DeprecationWarning, UserWarning))
)
# Check the decorator
@ignore_warnings
def decorator_no_warning():
_warning_function()
_multiple_warning_function()
@ignore_warnings(category=(DeprecationWarning, UserWarning))
def decorator_no_warning_multiple():
_multiple_warning_function()
@ignore_warnings(category=DeprecationWarning)
def decorator_no_deprecation_warning():
_warning_function()
@ignore_warnings(category=UserWarning)
def decorator_no_user_warning():
_warning_function()
@ignore_warnings(category=DeprecationWarning)
def decorator_no_deprecation_multiple_warning():
_multiple_warning_function()
@ignore_warnings(category=UserWarning)
def decorator_no_user_multiple_warning():
_multiple_warning_function()
assert_no_warnings(decorator_no_warning)
assert_no_warnings(decorator_no_warning_multiple)
assert_no_warnings(decorator_no_deprecation_warning)
with pytest.warns(DeprecationWarning):
decorator_no_user_warning()
with pytest.warns(UserWarning):
decorator_no_deprecation_multiple_warning()
with pytest.warns(DeprecationWarning):
decorator_no_user_multiple_warning()
# Check the context manager
def context_manager_no_warning():
with ignore_warnings():
_warning_function()
def context_manager_no_warning_multiple():
with ignore_warnings(category=(DeprecationWarning, UserWarning)):
_multiple_warning_function()
def context_manager_no_deprecation_warning():
with ignore_warnings(category=DeprecationWarning):
_warning_function()
def context_manager_no_user_warning():
with ignore_warnings(category=UserWarning):
_warning_function()
def context_manager_no_deprecation_multiple_warning():
with ignore_warnings(category=DeprecationWarning):
_multiple_warning_function()
def context_manager_no_user_multiple_warning():
with ignore_warnings(category=UserWarning):
_multiple_warning_function()
assert_no_warnings(context_manager_no_warning)
assert_no_warnings(context_manager_no_warning_multiple)
assert_no_warnings(context_manager_no_deprecation_warning)
with pytest.warns(DeprecationWarning):
context_manager_no_user_warning()
with pytest.warns(UserWarning):
context_manager_no_deprecation_multiple_warning()
with pytest.warns(DeprecationWarning):
context_manager_no_user_multiple_warning()
    # Check that passing a warning class as the first positional argument raises a ValueError
warning_class = UserWarning
match = "'obj' should be a callable.+you should use 'category=UserWarning'"
with pytest.raises(ValueError, match=match):
silence_warnings_func = ignore_warnings(warning_class)(_warning_function)
silence_warnings_func()
with pytest.raises(ValueError, match=match):
@ignore_warnings(warning_class)
def test():
pass
class TestWarns(unittest.TestCase):
def test_warn(self):
def f():
warnings.warn("yo")
return 3
with pytest.raises(AssertionError):
assert_no_warnings(f)
assert assert_no_warnings(lambda x: x, 1) == 1
# Tests for docstrings:
def f_ok(a, b):
"""Function f
Parameters
----------
a : int
Parameter a
b : float
Parameter b
Returns
-------
c : list
Parameter c
"""
c = a + b
return c
def f_bad_sections(a, b):
"""Function f
Parameters
----------
a : int
Parameter a
b : float
Parameter b
Results
-------
c : list
Parameter c
"""
c = a + b
return c
def f_bad_order(b, a):
"""Function f
Parameters
----------
a : int
Parameter a
b : float
Parameter b
Returns
-------
c : list
Parameter c
"""
c = a + b
return c
def f_too_many_param_docstring(a, b):
"""Function f
Parameters
----------
a : int
Parameter a
b : int
Parameter b
c : int
Parameter c
Returns
-------
d : list
Parameter c
"""
d = a + b
return d
def f_missing(a, b):
"""Function f
Parameters
----------
a : int
Parameter a
Returns
-------
c : list
Parameter c
"""
c = a + b
return c
def f_check_param_definition(a, b, c, d, e):
"""Function f
Parameters
----------
a: int
Parameter a
b:
Parameter b
c :
This is parsed correctly in numpydoc 1.2
d:int
Parameter d
e
No typespec is allowed without colon
"""
return a + b + c + d
class Klass:
def f_missing(self, X, y):
pass
def f_bad_sections(self, X, y):
"""Function f
Parameter
---------
a : int
Parameter a
b : float
Parameter b
Results
-------
c : list
Parameter c
"""
pass
class MockEst:
def __init__(self):
"""MockEstimator"""
def fit(self, X, y):
return X
def predict(self, X):
return X
def predict_proba(self, X):
return X
def score(self, X):
return 1.0
class MockMetaEstimator:
def __init__(self, delegate):
"""MetaEstimator to check if doctest on delegated methods work.
Parameters
---------
delegate : estimator
Delegated estimator.
"""
self.delegate = delegate
@available_if(lambda self: hasattr(self.delegate, "predict"))
def predict(self, X):
"""This is available only if delegate has predict.
Parameters
----------
y : ndarray
Parameter y
"""
return self.delegate.predict(X)
@available_if(lambda self: hasattr(self.delegate, "score"))
@deprecated("Testing a deprecated delegated method")
def score(self, X):
"""This is available only if delegate has score.
Parameters
---------
y : ndarray
Parameter y
"""
@available_if(lambda self: hasattr(self.delegate, "predict_proba"))
def predict_proba(self, X):
"""This is available only if delegate has predict_proba.
Parameters
---------
X : ndarray
Parameter X
"""
return X
@deprecated("Testing deprecated function with wrong params")
def fit(self, X, y):
"""Incorrect docstring but should not be tested"""
def test_check_docstring_parameters():
pytest.importorskip(
"numpydoc",
reason="numpydoc is required to test the docstrings",
minversion="1.2.0",
)
incorrect = check_docstring_parameters(f_ok)
assert incorrect == []
incorrect = check_docstring_parameters(f_ok, ignore=["b"])
assert incorrect == []
incorrect = check_docstring_parameters(f_missing, ignore=["b"])
assert incorrect == []
with pytest.raises(RuntimeError, match="Unknown section Results"):
check_docstring_parameters(f_bad_sections)
with pytest.raises(RuntimeError, match="Unknown section Parameter"):
check_docstring_parameters(Klass.f_bad_sections)
incorrect = check_docstring_parameters(f_check_param_definition)
mock_meta = MockMetaEstimator(delegate=MockEst())
mock_meta_name = mock_meta.__class__.__name__
assert incorrect == [
(
"sklearn.utils.tests.test_testing.f_check_param_definition There "
"was no space between the param name and colon ('a: int')"
),
(
"sklearn.utils.tests.test_testing.f_check_param_definition There "
"was no space between the param name and colon ('b:')"
),
(
"sklearn.utils.tests.test_testing.f_check_param_definition There "
"was no space between the param name and colon ('d:int')"
),
]
messages = [
[
"In function: sklearn.utils.tests.test_testing.f_bad_order",
(
"There's a parameter name mismatch in function docstring w.r.t."
" function signature, at index 0 diff: 'b' != 'a'"
),
"Full diff:",
"- ['b', 'a']",
"+ ['a', 'b']",
],
[
"In function: "
+ "sklearn.utils.tests.test_testing.f_too_many_param_docstring",
(
"Parameters in function docstring have more items w.r.t. function"
" signature, first extra item: c"
),
"Full diff:",
"- ['a', 'b']",
"+ ['a', 'b', 'c']",
"? +++++",
],
[
"In function: sklearn.utils.tests.test_testing.f_missing",
(
"Parameters in function docstring have less items w.r.t. function"
" signature, first missing item: b"
),
"Full diff:",
"- ['a', 'b']",
"+ ['a']",
],
[
"In function: sklearn.utils.tests.test_testing.Klass.f_missing",
(
"Parameters in function docstring have less items w.r.t. function"
" signature, first missing item: X"
),
"Full diff:",
"- ['X', 'y']",
"+ []",
],
[
"In function: "
+ f"sklearn.utils.tests.test_testing.{mock_meta_name}.predict",
(
"There's a parameter name mismatch in function docstring w.r.t."
" function signature, at index 0 diff: 'X' != 'y'"
),
"Full diff:",
"- ['X']",
"? ^",
"+ ['y']",
"? ^",
],
[
"In function: "
+ f"sklearn.utils.tests.test_testing.{mock_meta_name}."
+ "predict_proba",
"potentially wrong underline length... ",
"Parameters ",
"--------- in ",
],
[
"In function: "
+ f"sklearn.utils.tests.test_testing.{mock_meta_name}.score",
"potentially wrong underline length... ",
"Parameters ",
"--------- in ",
],
[
"In function: " + f"sklearn.utils.tests.test_testing.{mock_meta_name}.fit",
(
"Parameters in function docstring have less items w.r.t. function"
" signature, first missing item: X"
),
"Full diff:",
"- ['X', 'y']",
"+ []",
],
]
for msg, f in zip(
messages,
[
f_bad_order,
f_too_many_param_docstring,
f_missing,
Klass.f_missing,
mock_meta.predict,
mock_meta.predict_proba,
mock_meta.score,
mock_meta.fit,
],
):
incorrect = check_docstring_parameters(f)
assert msg == incorrect, '\n"%s"\n not in \n"%s"' % (msg, incorrect)
class RegistrationCounter:
def __init__(self):
self.nb_calls = 0
def __call__(self, to_register_func):
self.nb_calls += 1
assert to_register_func.func is _delete_folder
def check_memmap(input_array, mmap_data, mmap_mode="r"):
assert isinstance(mmap_data, np.memmap)
writeable = mmap_mode != "r"
assert mmap_data.flags.writeable is writeable
np.testing.assert_array_equal(input_array, mmap_data)
def test_tempmemmap(monkeypatch):
registration_counter = RegistrationCounter()
monkeypatch.setattr(atexit, "register", registration_counter)
input_array = np.ones(3)
with TempMemmap(input_array) as data:
check_memmap(input_array, data)
temp_folder = os.path.dirname(data.filename)
if os.name != "nt":
assert not os.path.exists(temp_folder)
assert registration_counter.nb_calls == 1
mmap_mode = "r+"
with TempMemmap(input_array, mmap_mode=mmap_mode) as data:
check_memmap(input_array, data, mmap_mode=mmap_mode)
temp_folder = os.path.dirname(data.filename)
if os.name != "nt":
assert not os.path.exists(temp_folder)
assert registration_counter.nb_calls == 2
@pytest.mark.xfail(_IS_WASM, reason="memmap not fully supported")
def test_create_memmap_backed_data(monkeypatch):
registration_counter = RegistrationCounter()
monkeypatch.setattr(atexit, "register", registration_counter)
input_array = np.ones(3)
data = create_memmap_backed_data(input_array)
check_memmap(input_array, data)
assert registration_counter.nb_calls == 1
data, folder = create_memmap_backed_data(input_array, return_folder=True)
check_memmap(input_array, data)
assert folder == os.path.dirname(data.filename)
assert registration_counter.nb_calls == 2
mmap_mode = "r+"
data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode)
check_memmap(input_array, data, mmap_mode)
assert registration_counter.nb_calls == 3
input_list = [input_array, input_array + 1, input_array + 2]
mmap_data_list = create_memmap_backed_data(input_list)
for input_array, data in zip(input_list, mmap_data_list):
check_memmap(input_array, data)
assert registration_counter.nb_calls == 4
output_data, other = create_memmap_backed_data([input_array, "not-an-array"])
check_memmap(input_array, output_data)
assert other == "not-an-array"
@pytest.mark.parametrize(
"constructor_name, container_type",
[
("list", list),
("tuple", tuple),
("array", np.ndarray),
("sparse", sparse.csr_matrix),
# using `zip` will only keep the available sparse containers
        # depending on the installed SciPy version
*zip(["sparse_csr", "sparse_csr_array"], CSR_CONTAINERS),
*zip(["sparse_csc", "sparse_csc_array"], CSC_CONTAINERS),
("dataframe", lambda: pytest.importorskip("pandas").DataFrame),
("series", lambda: pytest.importorskip("pandas").Series),
("index", lambda: pytest.importorskip("pandas").Index),
("slice", slice),
],
)
@pytest.mark.parametrize(
"dtype, superdtype",
[
(np.int32, np.integer),
(np.int64, np.integer),
(np.float32, np.floating),
(np.float64, np.floating),
],
)
def test_convert_container(
constructor_name,
container_type,
dtype,
superdtype,
):
"""Check that we convert the container to the right type of array with the
right data type."""
if constructor_name in ("dataframe", "polars", "series", "polars_series", "index"):
# delay the import of pandas/polars within the function to only skip this test
# instead of the whole file
container_type = container_type()
container = [0, 1]
container_converted = _convert_container(
container,
constructor_name,
dtype=dtype,
)
assert isinstance(container_converted, container_type)
if constructor_name in ("list", "tuple", "index"):
# list and tuple will use Python class dtype: int, float
# pandas index will always use high precision: np.int64 and np.float64
assert np.issubdtype(type(container_converted[0]), superdtype)
elif hasattr(container_converted, "dtype"):
assert container_converted.dtype == dtype
elif hasattr(container_converted, "dtypes"):
assert container_converted.dtypes[0] == dtype
def test_convert_container_categories_pandas():
pytest.importorskip("pandas")
df = _convert_container(
[["x"]], "dataframe", ["A"], categorical_feature_names=["A"]
)
assert df.dtypes.iloc[0] == "category"
def test_convert_container_categories_polars():
pl = pytest.importorskip("polars")
df = _convert_container([["x"]], "polars", ["A"], categorical_feature_names=["A"])
assert df.schema["A"] == pl.Categorical()
def test_convert_container_categories_pyarrow():
pa = pytest.importorskip("pyarrow")
df = _convert_container([["x"]], "pyarrow", ["A"], categorical_feature_names=["A"])
assert type(df.schema[0].type) is pa.DictionaryType
@pytest.mark.skipif(
sp_version >= parse_version("1.8"),
reason="sparse arrays are available as of scipy 1.8.0",
)
@pytest.mark.parametrize("constructor_name", ["sparse_csr_array", "sparse_csc_array"])
@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64])
def test_convert_container_raise_when_sparray_not_available(constructor_name, dtype):
"""Check that if we convert to sparse array but sparse array are not supported
(scipy<1.8.0), we should raise an explicit error."""
container = [0, 1]
with pytest.raises(
ValueError,
match=f"only available with scipy>=1.8.0, got {sp_version}",
):
_convert_container(container, constructor_name, dtype=dtype)
def test_raises():
# Tests for the raises context manager
# Proper type, no match
with raises(TypeError):
raise TypeError()
# Proper type, proper match
with raises(TypeError, match="how are you") as cm:
raise TypeError("hello how are you")
assert cm.raised_and_matched
# Proper type, proper match with multiple patterns
with raises(TypeError, match=["not this one", "how are you"]) as cm:
raise TypeError("hello how are you")
assert cm.raised_and_matched
# bad type, no match
with pytest.raises(ValueError, match="this will be raised"):
with raises(TypeError) as cm:
raise ValueError("this will be raised")
assert not cm.raised_and_matched
# Bad type, no match, with a err_msg
with pytest.raises(AssertionError, match="the failure message"):
with raises(TypeError, err_msg="the failure message") as cm:
raise ValueError()
assert not cm.raised_and_matched
# bad type, with match (is ignored anyway)
with pytest.raises(ValueError, match="this will be raised"):
with raises(TypeError, match="this is ignored") as cm:
raise ValueError("this will be raised")
assert not cm.raised_and_matched
# proper type but bad match
with pytest.raises(
AssertionError, match="should contain one of the following patterns"
):
with raises(TypeError, match="hello") as cm:
raise TypeError("Bad message")
assert not cm.raised_and_matched
# proper type but bad match, with err_msg
with pytest.raises(AssertionError, match="the failure message"):
with raises(TypeError, match="hello", err_msg="the failure message") as cm:
raise TypeError("Bad message")
assert not cm.raised_and_matched
# no raise with default may_pass=False
with pytest.raises(AssertionError, match="Did not raise"):
with raises(TypeError) as cm:
pass
assert not cm.raised_and_matched
# no raise with may_pass=True
with raises(TypeError, match="hello", may_pass=True) as cm:
pass # still OK
assert not cm.raised_and_matched
# Multiple exception types:
with raises((TypeError, ValueError)):
raise TypeError()
with raises((TypeError, ValueError)):
raise ValueError()
with pytest.raises(AssertionError):
with raises((TypeError, ValueError)):
pass
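# A minimal sketch of the contract test_raises checks, covering only the
# single-pattern case; the real helper also records raised_and_matched and
# supports pattern lists, err_msg and may_pass (names below are illustrative):
import contextlib
import re
@contextlib.contextmanager
def _raises_sketch(expected_exc, match=None):
    try:
        yield
    except expected_exc as exc:
        if match is not None and re.search(match, str(exc)) is None:
            raise AssertionError(
                "should contain one of the following patterns: %r" % match
            )
    else:
        raise AssertionError("Did not raise %r" % (expected_exc,))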
def test_float32_aware_assert_allclose():
# The relative tolerance for float32 inputs is 1e-4
assert_allclose(np.array([1.0 + 2e-5], dtype=np.float32), 1.0)
with pytest.raises(AssertionError):
assert_allclose(np.array([1.0 + 2e-4], dtype=np.float32), 1.0)
# The relative tolerance for other inputs is left to 1e-7 as in
# the original numpy version.
assert_allclose(np.array([1.0 + 2e-8], dtype=np.float64), 1.0)
with pytest.raises(AssertionError):
assert_allclose(np.array([1.0 + 2e-7], dtype=np.float64), 1.0)
# atol is left to 0.0 by default, even for float32
with pytest.raises(AssertionError):
assert_allclose(np.array([1e-5], dtype=np.float32), 0.0)
assert_allclose(np.array([1e-5], dtype=np.float32), 0.0, atol=2e-5)
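# Sketch of the dtype-dependent default the test above relies on: a looser
# rtol is picked whenever any input is float32 (illustrative only):
def _default_rtol_sketch(*arrays):
    dtypes = [np.asarray(a).dtype for a in arrays]
    return 1e-4 if any(dt == np.float32 for dt in dtypes) else 1e-7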
@pytest.mark.xfail(_IS_WASM, reason="cannot start subprocess")
def test_assert_run_python_script_without_output():
code = "x = 1"
assert_run_python_script_without_output(code)
code = "print('something to stdout')"
with pytest.raises(AssertionError, match="Expected no output"):
assert_run_python_script_without_output(code)
code = "print('something to stdout')"
with pytest.raises(
AssertionError,
match="output was not supposed to match.+got.+something to stdout",
):
assert_run_python_script_without_output(code, pattern="to.+stdout")
code = "\n".join(["import sys", "print('something to stderr', file=sys.stderr)"])
with pytest.raises(
AssertionError,
match="output was not supposed to match.+got.+something to stderr",
):
assert_run_python_script_without_output(code, pattern="to.+stderr")
@pytest.mark.parametrize(
"constructor_name",
[
"sparse_csr",
"sparse_csc",
pytest.param(
"sparse_csr_array",
marks=pytest.mark.skipif(
sp_version < parse_version("1.8"),
reason="sparse arrays are available as of scipy 1.8.0",
),
),
pytest.param(
"sparse_csc_array",
marks=pytest.mark.skipif(
sp_version < parse_version("1.8"),
reason="sparse arrays are available as of scipy 1.8.0",
),
),
],
)
def test_convert_container_sparse_to_sparse(constructor_name):
"""Non-regression test to check that we can still convert a sparse container
from a given format to another format.
"""
X_sparse = sparse.random(10, 10, density=0.1, format="csr")
_convert_container(X_sparse, constructor_name)
def check_warnings_as_errors(warning_info, warnings_as_errors):
if warning_info.action == "error" and warnings_as_errors:
with pytest.raises(warning_info.category, match=warning_info.message):
warnings.warn(
message=warning_info.message,
category=warning_info.category,
)
if warning_info.action == "ignore":
with warnings.catch_warnings(record=True) as record:
message = warning_info.message
# Special treatment when regex is used
if "Pyarrow" in message:
message = "\nPyarrow will become a required dependency"
warnings.warn(
message=message,
category=warning_info.category,
)
assert len(record) == 0 if warnings_as_errors else 1
if record:
assert str(record[0].message) == message
assert record[0].category == warning_info.category
@pytest.mark.parametrize("warning_info", _get_warnings_filters_info_list())
def test_sklearn_warnings_as_errors(warning_info):
warnings_as_errors = os.environ.get("SKLEARN_WARNINGS_AS_ERRORS", "0") != "0"
check_warnings_as_errors(warning_info, warnings_as_errors=warnings_as_errors)
@pytest.mark.parametrize("warning_info", _get_warnings_filters_info_list())
def test_turn_warnings_into_errors(warning_info):
with warnings.catch_warnings():
turn_warnings_into_errors()
check_warnings_as_errors(warning_info, warnings_as_errors=True)

View File

@@ -0,0 +1,25 @@
import numpy as np
import pytest
from sklearn.utils._typedefs import testing_make_array_from_typed_val
@pytest.mark.parametrize(
"type_t, value, expected_dtype",
[
("float64_t", 1.0, np.float64),
("float32_t", 1.0, np.float32),
("intp_t", 1, np.intp),
("int8_t", 1, np.int8),
("int32_t", 1, np.int32),
("int64_t", 1, np.int64),
("uint8_t", 1, np.uint8),
("uint32_t", 1, np.uint32),
("uint64_t", 1, np.uint64),
],
)
def test_types(type_t, value, expected_dtype):
"""Check that the types defined in _typedefs correspond to the expected
numpy dtypes.
"""
assert testing_make_array_from_typed_val[type_t](value).dtype == expected_dtype

View File

@@ -0,0 +1,65 @@
import string
import timeit
import pytest
from sklearn.utils._user_interface import _message_with_time, _print_elapsed_time
@pytest.mark.parametrize(
["source", "message", "is_long"],
[
("ABC", string.ascii_lowercase, False),
("ABCDEF", string.ascii_lowercase, False),
("ABC", string.ascii_lowercase * 3, True),
("ABC" * 10, string.ascii_lowercase, True),
("ABC", string.ascii_lowercase + "\u1048", False),
],
)
@pytest.mark.parametrize(
["time", "time_str"],
[
(0.2, " 0.2s"),
(20, " 20.0s"),
(2000, "33.3min"),
(20000, "333.3min"),
],
)
def test_message_with_time(source, message, is_long, time, time_str):
out = _message_with_time(source, message, time)
if is_long:
assert len(out) > 70
else:
assert len(out) == 70
assert out.startswith("[" + source + "] ")
out = out[len(source) + 3 :]
assert out.endswith(time_str)
out = out[: -len(time_str)]
assert out.endswith(", total=")
out = out[: -len(", total=")]
assert out.endswith(message)
out = out[: -len(message)]
assert out.endswith(" ")
out = out[:-1]
if is_long:
assert not out
else:
assert list(set(out)) == ["."]
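# The string layout the assertions above encode can be reconstructed as
# "[source] <dots> <message>, total=<time>", padded with '.' to 70
# characters. A hedged sketch of that formatting (not the real helper):
def _message_with_time_sketch(source, message, seconds):
    time_str = "%4.1fmin" % (seconds / 60) if seconds > 60 else " %5.1fs" % seconds
    start = "[%s] " % source
    end = " %s, total=%s" % (message, time_str)
    return start + "." * max(0, 70 - len(start) - len(end)) + end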
@pytest.mark.parametrize(
["message", "expected"],
[
("hello", _message_with_time("ABC", "hello", 0.1) + "\n"),
("", _message_with_time("ABC", "", 0.1) + "\n"),
(None, ""),
],
)
def test_print_elapsed_time(message, expected, capsys, monkeypatch):
monkeypatch.setattr(timeit, "default_timer", lambda: 0)
with _print_elapsed_time("ABC", message):
monkeypatch.setattr(timeit, "default_timer", lambda: 0.1)
assert capsys.readouterr().out == expected

View File

@@ -0,0 +1,27 @@
import joblib
import pytest
from sklearn.utils import parallel_backend, register_parallel_backend, tosequence
# TODO(1.7): remove
def test_is_pypy_deprecated():
with pytest.warns(FutureWarning, match="IS_PYPY is deprecated"):
from sklearn.utils import IS_PYPY # noqa
# TODO(1.7): remove
def test_tosequence_deprecated():
with pytest.warns(FutureWarning, match="tosequence was deprecated in 1.5"):
tosequence([1, 2, 3])
# TODO(1.7): remove
def test_parallel_backend_deprecated():
with pytest.warns(FutureWarning, match="parallel_backend is deprecated"):
parallel_backend("loky", None)
with pytest.warns(FutureWarning, match="register_parallel_backend is deprecated"):
register_parallel_backend("a_backend", None)
del joblib.parallel.BACKENDS["a_backend"]

File diff suppressed because it is too large

View File

@@ -0,0 +1,25 @@
import numpy as np
import pytest
from sklearn.utils._weight_vector import (
WeightVector32,
WeightVector64,
)
@pytest.mark.parametrize(
"dtype, WeightVector",
[
(np.float32, WeightVector32),
(np.float64, WeightVector64),
],
)
def test_type_invariance(dtype, WeightVector):
"""Check the `dtype` consistency of `WeightVector`."""
weights = np.random.rand(100).astype(dtype)
average_weights = np.random.rand(100).astype(dtype)
weight_vector = WeightVector(weights, average_weights)
assert np.asarray(weight_vector.w).dtype is np.dtype(dtype)
assert np.asarray(weight_vector.aw).dtype is np.dtype(dtype)