library packages
This commit is contained in:
@@ -0,0 +1,25 @@
|
||||
"""Transformers for missing value imputation."""
|
||||
|
||||
import typing
|
||||
|
||||
from ._base import MissingIndicator, SimpleImputer
|
||||
from ._knn import KNNImputer
|
||||
|
||||
if typing.TYPE_CHECKING:
|
||||
# Avoid errors in type checkers (e.g. mypy) for experimental estimators.
|
||||
# TODO: remove this check once the estimator is no longer experimental.
|
||||
from ._iterative import IterativeImputer # noqa
|
||||
|
||||
__all__ = ["MissingIndicator", "SimpleImputer", "KNNImputer"]
|
||||
|
||||
|
||||
# TODO: remove this check once the estimator is no longer experimental.
|
||||
def __getattr__(name):
    """Handle lookups of attributes this module does not define.

    Parameters
    ----------
    name : str
        Name of the attribute that was requested on the module.

    Raises
    ------
    ImportError
        If `name` is ``"IterativeImputer"``; the message explains how to
        enable the experimental estimator.
    AttributeError
        For every other name.
    """
    if name != "IterativeImputer":
        raise AttributeError(f"module {__name__} has no attribute {name}")

    # Only the experimental estimator reaches this point: point the user at
    # the explicit opt-in import instead of reporting a missing attribute.
    message = (
        f"{name} is experimental and the API might change without any "
        "deprecation cycle. To use it, you need to explicitly import "
        "enable_iterative_imputer:\n"
        "from sklearn.experimental import enable_iterative_imputer"
    )
    raise ImportError(message)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
1095
.venv/lib/python3.12/site-packages/sklearn/impute/_base.py
Normal file
1095
.venv/lib/python3.12/site-packages/sklearn/impute/_base.py
Normal file
File diff suppressed because it is too large
Load Diff
960
.venv/lib/python3.12/site-packages/sklearn/impute/_iterative.py
Normal file
960
.venv/lib/python3.12/site-packages/sklearn/impute/_iterative.py
Normal file
@@ -0,0 +1,960 @@
|
||||
import warnings
|
||||
from collections import namedtuple
|
||||
from numbers import Integral, Real
|
||||
from time import time
|
||||
|
||||
import numpy as np
|
||||
from scipy import stats
|
||||
|
||||
from ..base import _fit_context, clone
|
||||
from ..exceptions import ConvergenceWarning
|
||||
from ..preprocessing import normalize
|
||||
from ..utils import _safe_indexing, check_array, check_random_state
|
||||
from ..utils._indexing import _safe_assign
|
||||
from ..utils._mask import _get_mask
|
||||
from ..utils._missing import is_scalar_nan
|
||||
from ..utils._param_validation import HasMethods, Interval, StrOptions
|
||||
from ..utils.metadata_routing import (
|
||||
MetadataRouter,
|
||||
MethodMapping,
|
||||
_raise_for_params,
|
||||
process_routing,
|
||||
)
|
||||
from ..utils.validation import FLOAT_DTYPES, _check_feature_names_in, check_is_fitted
|
||||
from ._base import SimpleImputer, _BaseImputer, _check_inputs_dtype
|
||||
|
||||
# Record of one imputation step: the feature that was imputed, the features
# used as predictors for it, and the estimator used for that step.
_ImputerTriplet = namedtuple(
    "_ImputerTriplet",
    ("feat_idx", "neighbor_feat_idx", "estimator"),
)
|
||||
|
||||
|
||||
def _assign_where(X1, X2, cond):
    """Copy entries of `X2` into `X1` at the positions where `cond` is True.

    `X1` is modified in place and nothing is returned.

    Parameters
    ----------
    X1 : ndarray or dataframe of shape (n_samples, n_features)
        Destination data.

    X2 : ndarray of shape (n_samples, n_features)
        Source of the values to assign.

    cond : ndarray of shape (n_samples, n_features)
        Boolean mask selecting the entries to overwrite.
    """
    # Objects exposing a `mask` attribute are handled as pandas dataframes.
    dataframe_like = hasattr(X1, "mask")
    if not dataframe_like:
        # ndarrays: plain boolean-mask assignment
        X1[cond] = X2[cond]
        return

    X1.mask(cond=cond, other=X2, inplace=True)
|
||||
|
||||
|
||||
class IterativeImputer(_BaseImputer):
|
||||
"""Multivariate imputer that estimates each feature from all the others.
|
||||
|
||||
A strategy for imputing missing values by modeling each feature with
|
||||
missing values as a function of other features in a round-robin fashion.
|
||||
|
||||
Read more in the :ref:`User Guide <iterative_imputer>`.
|
||||
|
||||
.. versionadded:: 0.21
|
||||
|
||||
.. note::
|
||||
|
||||
This estimator is still **experimental** for now: the predictions
|
||||
and the API might change without any deprecation cycle. To use it,
|
||||
you need to explicitly import `enable_iterative_imputer`::
|
||||
|
||||
>>> # explicitly require this experimental feature
|
||||
>>> from sklearn.experimental import enable_iterative_imputer # noqa
|
||||
>>> # now you can import normally from sklearn.impute
|
||||
>>> from sklearn.impute import IterativeImputer
|
||||
|
||||
Parameters
|
||||
----------
|
||||
estimator : estimator object, default=BayesianRidge()
|
||||
The estimator to use at each step of the round-robin imputation.
|
||||
If `sample_posterior=True`, the estimator must support
|
||||
`return_std` in its `predict` method.
|
||||
|
||||
missing_values : int or np.nan, default=np.nan
|
||||
The placeholder for the missing values. All occurrences of
|
||||
`missing_values` will be imputed. For pandas' dataframes with
|
||||
nullable integer dtypes with missing values, `missing_values`
|
||||
should be set to `np.nan`, since `pd.NA` will be converted to `np.nan`.
|
||||
|
||||
sample_posterior : bool, default=False
|
||||
Whether to sample from the (Gaussian) predictive posterior of the
|
||||
fitted estimator for each imputation. Estimator must support
|
||||
`return_std` in its `predict` method if set to `True`. Set to
|
||||
`True` if using `IterativeImputer` for multiple imputations.
|
||||
|
||||
max_iter : int, default=10
|
||||
Maximum number of imputation rounds to perform before returning the
|
||||
imputations computed during the final round. A round is a single
|
||||
imputation of each feature with missing values. The stopping criterion
|
||||
is met once `max(abs(X_t - X_{t-1}))/max(abs(X[known_vals])) < tol`,
|
||||
where `X_t` is `X` at iteration `t`. Note that early stopping is only
|
||||
applied if `sample_posterior=False`.
|
||||
|
||||
tol : float, default=1e-3
|
||||
Tolerance of the stopping condition.
|
||||
|
||||
n_nearest_features : int, default=None
|
||||
Number of other features to use to estimate the missing values of
|
||||
each feature column. Nearness between features is measured using
|
||||
the absolute correlation coefficient between each feature pair (after
|
||||
initial imputation). To ensure coverage of features throughout the
|
||||
imputation process, the neighbor features are not necessarily nearest,
|
||||
but are drawn with probability proportional to correlation for each
|
||||
imputed target feature. Can provide significant speed-up when the
|
||||
number of features is huge. If `None`, all features will be used.
|
||||
|
||||
initial_strategy : {'mean', 'median', 'most_frequent', 'constant'}, \
|
||||
default='mean'
|
||||
Which strategy to use to initialize the missing values. Same as the
|
||||
`strategy` parameter in :class:`~sklearn.impute.SimpleImputer`.
|
||||
|
||||
fill_value : str or numerical value, default=None
|
||||
When `initial_strategy="constant"`, `fill_value` is used to replace all
|
||||
occurrences of missing_values. For string or object data types,
|
||||
`fill_value` must be a string.
|
||||
If `None`, `fill_value` will be 0 when imputing numerical
|
||||
data and "missing_value" for strings or object data types.
|
||||
|
||||
.. versionadded:: 1.3
|
||||
|
||||
imputation_order : {'ascending', 'descending', 'roman', 'arabic', \
|
||||
'random'}, default='ascending'
|
||||
The order in which the features will be imputed. Possible values:
|
||||
|
||||
- `'ascending'`: From features with fewest missing values to most.
|
||||
- `'descending'`: From features with most missing values to fewest.
|
||||
- `'roman'`: Left to right.
|
||||
- `'arabic'`: Right to left.
|
||||
- `'random'`: A random order for each round.
|
||||
|
||||
skip_complete : bool, default=False
|
||||
If `True` then features with missing values during :meth:`transform`
|
||||
which did not have any missing values during :meth:`fit` will be
|
||||
imputed with the initial imputation method only. Set to `True` if you
|
||||
have many features with no missing values at both :meth:`fit` and
|
||||
:meth:`transform` time to save compute.
|
||||
|
||||
min_value : float or array-like of shape (n_features,), default=-np.inf
|
||||
Minimum possible imputed value. Broadcast to shape `(n_features,)` if
|
||||
scalar. If array-like, expects shape `(n_features,)`, one min value for
|
||||
each feature. The default is `-np.inf`.
|
||||
|
||||
.. versionchanged:: 0.23
|
||||
Added support for array-like.
|
||||
|
||||
max_value : float or array-like of shape (n_features,), default=np.inf
|
||||
Maximum possible imputed value. Broadcast to shape `(n_features,)` if
|
||||
scalar. If array-like, expects shape `(n_features,)`, one max value for
|
||||
each feature. The default is `np.inf`.
|
||||
|
||||
.. versionchanged:: 0.23
|
||||
Added support for array-like.
|
||||
|
||||
verbose : int, default=0
|
||||
Verbosity flag, controls the debug messages that are issued
|
||||
as functions are evaluated. The higher, the more verbose. Can be 0, 1,
|
||||
or 2.
|
||||
|
||||
random_state : int, RandomState instance or None, default=None
|
||||
The seed of the pseudo random number generator to use. Randomizes
|
||||
selection of estimator features if `n_nearest_features` is not `None`,
|
||||
the `imputation_order` if `random`, and the sampling from posterior if
|
||||
`sample_posterior=True`. Use an integer for determinism.
|
||||
See :term:`the Glossary <random_state>`.
|
||||
|
||||
add_indicator : bool, default=False
|
||||
If `True`, a :class:`MissingIndicator` transform will stack onto output
|
||||
of the imputer's transform. This allows a predictive estimator
|
||||
to account for missingness despite imputation. If a feature has no
|
||||
missing values at fit/train time, the feature won't appear on
|
||||
the missing indicator even if there are missing values at
|
||||
transform/test time.
|
||||
|
||||
keep_empty_features : bool, default=False
|
||||
If True, features that consist exclusively of missing values when
|
||||
`fit` is called are returned in results when `transform` is called.
|
||||
The imputed value is always `0` except when
|
||||
`initial_strategy="constant"` in which case `fill_value` will be
|
||||
used instead.
|
||||
|
||||
.. versionadded:: 1.2
|
||||
|
||||
Attributes
|
||||
----------
|
||||
initial_imputer_ : object of type :class:`~sklearn.impute.SimpleImputer`
|
||||
Imputer used to initialize the missing values.
|
||||
|
||||
imputation_sequence_ : list of tuples
|
||||
Each tuple has `(feat_idx, neighbor_feat_idx, estimator)`, where
|
||||
`feat_idx` is the current feature to be imputed,
|
||||
`neighbor_feat_idx` is the array of other features used to impute the
|
||||
current feature, and `estimator` is the trained estimator used for
|
||||
the imputation. Length is `self.n_features_with_missing_ *
|
||||
self.n_iter_`.
|
||||
|
||||
n_iter_ : int
|
||||
Number of iteration rounds that occurred. Will be less than
|
||||
`self.max_iter` if early stopping criterion was reached.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
n_features_with_missing_ : int
|
||||
Number of features with missing values.
|
||||
|
||||
indicator_ : :class:`~sklearn.impute.MissingIndicator`
|
||||
Indicator used to add binary indicators for missing values.
|
||||
`None` if `add_indicator=False`.
|
||||
|
||||
random_state_ : RandomState instance
|
||||
RandomState instance that is generated either from a seed, the random
|
||||
number generator or by `np.random`.
|
||||
|
||||
See Also
|
||||
--------
|
||||
SimpleImputer : Univariate imputer for completing missing values
|
||||
with simple strategies.
|
||||
KNNImputer : Multivariate imputer that estimates missing features using
|
||||
nearest samples.
|
||||
|
||||
Notes
|
||||
-----
|
||||
To support imputation in inductive mode we store each feature's estimator
|
||||
during the :meth:`fit` phase, and predict without refitting (in order)
|
||||
during the :meth:`transform` phase.
|
||||
|
||||
Features which contain all missing values at :meth:`fit` are discarded upon
|
||||
:meth:`transform`.
|
||||
|
||||
Using defaults, the imputer scales in :math:`\\mathcal{O}(knp^3\\min(n,p))`
|
||||
where :math:`k` = `max_iter`, :math:`n` the number of samples and
|
||||
:math:`p` the number of features. It thus becomes prohibitively costly when
|
||||
the number of features increases. Setting
|
||||
`n_nearest_features << n_features`, `skip_complete=True` or increasing `tol`
|
||||
can help to reduce its computational cost.
|
||||
|
||||
Depending on the nature of missing values, simple imputers can be
|
||||
preferable in a prediction context.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] `Stef van Buuren, Karin Groothuis-Oudshoorn (2011). "mice:
|
||||
Multivariate Imputation by Chained Equations in R". Journal of
|
||||
Statistical Software 45: 1-67.
|
||||
<https://www.jstatsoft.org/article/view/v045i03>`_
|
||||
|
||||
.. [2] `S. F. Buck, (1960). "A Method of Estimation of Missing Values in
|
||||
Multivariate Data Suitable for use with an Electronic Computer".
|
||||
Journal of the Royal Statistical Society 22(2): 302-306.
|
||||
<https://www.jstor.org/stable/2984099>`_
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from sklearn.experimental import enable_iterative_imputer
|
||||
>>> from sklearn.impute import IterativeImputer
|
||||
>>> imp_mean = IterativeImputer(random_state=0)
|
||||
>>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])
|
||||
IterativeImputer(random_state=0)
|
||||
>>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
|
||||
>>> imp_mean.transform(X)
|
||||
array([[ 6.9584..., 2. , 3. ],
|
||||
[ 4. , 2.6000..., 6. ],
|
||||
[10. , 4.9999..., 9. ]])
|
||||
|
||||
For a more detailed example see
|
||||
:ref:`sphx_glr_auto_examples_impute_plot_missing_values.py` or
|
||||
:ref:`sphx_glr_auto_examples_impute_plot_iterative_imputer_variants_comparison.py`.
|
||||
"""
|
||||
|
||||
_parameter_constraints: dict = {
|
||||
**_BaseImputer._parameter_constraints,
|
||||
"estimator": [None, HasMethods(["fit", "predict"])],
|
||||
"sample_posterior": ["boolean"],
|
||||
"max_iter": [Interval(Integral, 0, None, closed="left")],
|
||||
"tol": [Interval(Real, 0, None, closed="left")],
|
||||
"n_nearest_features": [None, Interval(Integral, 1, None, closed="left")],
|
||||
"initial_strategy": [
|
||||
StrOptions({"mean", "median", "most_frequent", "constant"})
|
||||
],
|
||||
"fill_value": "no_validation", # any object is valid
|
||||
"imputation_order": [
|
||||
StrOptions({"ascending", "descending", "roman", "arabic", "random"})
|
||||
],
|
||||
"skip_complete": ["boolean"],
|
||||
"min_value": [None, Interval(Real, None, None, closed="both"), "array-like"],
|
||||
"max_value": [None, Interval(Real, None, None, closed="both"), "array-like"],
|
||||
"verbose": ["verbose"],
|
||||
"random_state": ["random_state"],
|
||||
}
|
||||
|
||||
def __init__(
    self,
    estimator=None,
    *,
    missing_values=np.nan,
    sample_posterior=False,
    max_iter=10,
    tol=1e-3,
    n_nearest_features=None,
    initial_strategy="mean",
    fill_value=None,
    imputation_order="ascending",
    skip_complete=False,
    min_value=-np.inf,
    max_value=np.inf,
    verbose=0,
    random_state=None,
    add_indicator=False,
    keep_empty_features=False,
):
    # These three options are forwarded to the parent initializer.
    super().__init__(
        missing_values=missing_values,
        add_indicator=add_indicator,
        keep_empty_features=keep_empty_features,
    )

    # Estimator used for each imputation step and how its output is used.
    self.estimator = estimator
    self.sample_posterior = sample_posterior

    # Iteration control.
    self.max_iter = max_iter
    self.tol = tol

    # Predictor selection, initial fill and feature ordering.
    self.n_nearest_features = n_nearest_features
    self.initial_strategy = initial_strategy
    self.fill_value = fill_value
    self.imputation_order = imputation_order
    self.skip_complete = skip_complete

    # Bounds applied to imputed values.
    self.min_value = min_value
    self.max_value = max_value

    # Reporting and randomness.
    self.verbose = verbose
    self.random_state = random_state
|
||||
|
||||
def _impute_one_feature(
    self,
    X_filled,
    mask_missing_values,
    feat_idx,
    neighbor_feat_idx,
    estimator=None,
    fit_mode=True,
    params=None,
):
    """Impute a single feature from the others provided.

    The missing entries of column `feat_idx` are predicted from the
    columns listed in `neighbor_feat_idx`, using the current content of
    `X_filled`. When `self.sample_posterior` is True the estimator's
    `predict` method is called with `return_std=True` and values are drawn
    from a truncated normal; otherwise the plain predictions are clipped
    to `[self._min_value[feat_idx], self._max_value[feat_idx]]`.

    Parameters
    ----------
    X_filled : ndarray
        Input data with the most recent imputations.

    mask_missing_values : ndarray
        Input data's missing indicator matrix.

    feat_idx : int
        Index of the feature currently being imputed.

    neighbor_feat_idx : ndarray
        Indices of the features to be used in imputing `feat_idx`.

    estimator : object, default=None
        The estimator to use at this step of the round-robin imputation.
        If `sample_posterior=True`, the estimator must support
        `return_std` in its `predict` method.
        If None, it will be cloned from self._estimator.

    fit_mode : boolean, default=True
        Whether to fit and predict with the estimator or just predict.

    params : dict, default=None
        Additional params passed to the estimator's `fit`. `None` is
        treated as an empty dict.

    Returns
    -------
    X_filled : ndarray
        Input data with `X_filled[missing_row_mask, feat_idx]` updated.

    estimator : estimator with sklearn API
        The fitted estimator used to impute
        `X_filled[missing_row_mask, feat_idx]`.

    Raises
    ------
    ValueError
        If `fit_mode` is False and no estimator is passed in.
    """
    if estimator is None and fit_mode is False:
        raise ValueError(
            "If fit_mode is False, then an already-fitted "
            "estimator should be passed in."
        )

    if estimator is None:
        estimator = clone(self._estimator)

    missing_row_mask = mask_missing_values[:, feat_idx]
    if fit_mode:
        # `params` defaults to None and `**None` raises TypeError, so fall
        # back to an empty mapping when the caller passed nothing.
        fit_params = {} if params is None else params

        # Train only on the rows where the target feature is observed.
        X_train = _safe_indexing(
            _safe_indexing(X_filled, neighbor_feat_idx, axis=1),
            ~missing_row_mask,
            axis=0,
        )
        y_train = _safe_indexing(
            _safe_indexing(X_filled, feat_idx, axis=1),
            ~missing_row_mask,
            axis=0,
        )
        estimator.fit(X_train, y_train, **fit_params)

    # if no missing values, don't predict
    if np.sum(missing_row_mask) == 0:
        return X_filled, estimator

    # get posterior samples if there is at least one missing value
    X_test = _safe_indexing(
        _safe_indexing(X_filled, neighbor_feat_idx, axis=1),
        missing_row_mask,
        axis=0,
    )
    if self.sample_posterior:
        mus, sigmas = estimator.predict(X_test, return_std=True)
        imputed_values = np.zeros(mus.shape, dtype=X_filled.dtype)
        # two types of problems: (1) non-positive sigmas
        # (2) mus outside legal range of min_value and max_value
        # (results in inf sample)
        positive_sigmas = sigmas > 0
        imputed_values[~positive_sigmas] = mus[~positive_sigmas]
        mus_too_low = mus < self._min_value[feat_idx]
        imputed_values[mus_too_low] = self._min_value[feat_idx]
        mus_too_high = mus > self._max_value[feat_idx]
        imputed_values[mus_too_high] = self._max_value[feat_idx]
        # the rest can be sampled without statistical issues
        inrange_mask = positive_sigmas & ~mus_too_low & ~mus_too_high
        mus = mus[inrange_mask]
        sigmas = sigmas[inrange_mask]
        # truncnorm expects its bounds in units of standard deviations
        # around `loc`.
        a = (self._min_value[feat_idx] - mus) / sigmas
        b = (self._max_value[feat_idx] - mus) / sigmas

        truncated_normal = stats.truncnorm(a=a, b=b, loc=mus, scale=sigmas)
        imputed_values[inrange_mask] = truncated_normal.rvs(
            random_state=self.random_state_
        )
    else:
        imputed_values = estimator.predict(X_test)
        imputed_values = np.clip(
            imputed_values, self._min_value[feat_idx], self._max_value[feat_idx]
        )

    # update the feature
    _safe_assign(
        X_filled,
        imputed_values,
        row_indexer=missing_row_mask,
        column_indexer=feat_idx,
    )
    return X_filled, estimator
|
||||
|
||||
def _get_neighbor_feat_idx(self, n_features, feat_idx, abs_corr_mat):
    """Select the features used as predictors for `feat_idx`.

    If `self.n_nearest_features` is set and strictly smaller than
    `n_features`, that many feature indices are drawn without replacement
    with probabilities taken from column `feat_idx` of `abs_corr_mat`.
    Otherwise every feature except `feat_idx` is returned.

    Parameters
    ----------
    n_features : int
        Number of features in `X`.

    feat_idx : int
        Index of the feature currently being imputed.

    abs_corr_mat : ndarray, shape (n_features, n_features)
        Absolute correlation matrix of `X`. The diagonal has been zeroed
        out and each feature has been normalized to sum to 1. Can be None
        (it is only read when a subsample is drawn).

    Returns
    -------
    neighbor_feat_idx : array-like
        The features to use to impute `feat_idx`.
    """
    n_nearest = self.n_nearest_features
    use_all_others = n_nearest is None or not (n_nearest < n_features)
    if use_all_others:
        # Every index but `feat_idx`, in increasing order.
        before = np.arange(feat_idx)
        after = np.arange(feat_idx + 1, n_features)
        return np.concatenate((before, after))

    sampling_probs = abs_corr_mat[:, feat_idx]
    return self.random_state_.choice(
        np.arange(n_features), n_nearest, replace=False, p=sampling_probs
    )
|
||||
|
||||
def _get_ordered_idx(self, mask_missing_values):
    """Decide in what order the features will be updated.

    The order follows `self.imputation_order` (`'roman'`, `'arabic'`,
    `'ascending'`, `'descending'` or `'random'`). When
    `self.skip_complete` is True, features without any missing value are
    left out.

    Parameters
    ----------
    mask_missing_values : array-like, shape (n_samples, n_features)
        Input data's missing indicator matrix, where `n_samples` is the
        number of samples and `n_features` is the number of features.

    Returns
    -------
    ordered_idx : ndarray
        The order in which to impute the features.
    """
    missing_fraction = mask_missing_values.mean(axis=0)
    if self.skip_complete:
        # Keep only the features with a non-zero fraction of missing values.
        candidate_idx = np.flatnonzero(missing_fraction)
    else:
        candidate_idx = np.arange(np.shape(missing_fraction)[0])

    order = self.imputation_order
    if order in ("ascending", "descending"):
        # The stable sort puts the skipped features (fraction 0) first, so
        # dropping that many leading entries leaves the candidates only.
        n_skipped = len(missing_fraction) - len(candidate_idx)
        ordered_idx = np.argsort(missing_fraction, kind="mergesort")[n_skipped:]
        if order == "descending":
            ordered_idx = ordered_idx[::-1]
    elif order == "roman":
        ordered_idx = candidate_idx
    elif order == "arabic":
        ordered_idx = candidate_idx[::-1]
    elif order == "random":
        ordered_idx = candidate_idx
        self.random_state_.shuffle(ordered_idx)
    return ordered_idx
|
||||
|
||||
def _get_abs_corr_mat(self, X_filled, tolerance=1e-6):
    """Compute the absolute correlation matrix between features.

    Parameters
    ----------
    X_filled : ndarray, shape (n_samples, n_features)
        Input data with the most recent imputations.

    tolerance : float, default=1e-6
        Value substituted for NaN correlations and used as the lower bound
        for every entry before the diagonal is zeroed.

    Returns
    -------
    abs_corr_mat : ndarray, shape (n_features, n_features) or None
        Absolute correlation matrix of `X_filled` with a zeroed diagonal,
        L1-normalized along axis 0. `None` when `self.n_nearest_features`
        is None or not smaller than the number of features.
    """
    n_features = X_filled.shape[1]
    n_nearest = self.n_nearest_features
    if n_nearest is None or n_nearest >= n_features:
        return None

    # A feature with a single distinct value has zero standard deviation;
    # np.corrcoef then divides by zero, so silence the resulting warning.
    with np.errstate(invalid="ignore"):
        corr = np.abs(np.corrcoef(X_filled.T))

    # Correlations that came out as NaN are replaced by `tolerance`.
    corr[np.isnan(corr)] = tolerance
    # Floor every entry so each feature keeps some probability of being
    # sampled.
    np.clip(corr, tolerance, None, out=corr)
    # A feature is never its own neighbor.
    np.fill_diagonal(corr, 0)
    # Probabilities passed to the sampler need to sum to 1.
    return normalize(corr, norm="l1", axis=0, copy=False)
|
||||
|
||||
def _initial_imputation(self, X, in_fit=False):
    """Validate `X` and fill its missing values with the initial imputer.

    On the first call (while `self.initial_imputer_` is None) a
    `SimpleImputer` is created from this estimator's settings and fitted;
    later calls reuse it through `transform`.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Input data, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    in_fit : bool, default=False
        Whether function is called in :meth:`fit`. Forwarded as `reset`
        to the input validation.

    Returns
    -------
    Xt : ndarray
        The validated input. Unless `keep_empty_features` is True, only
        the columns whose initial-imputer statistic is not NaN are kept.

    X_filled : ndarray
        Output of the initial imputer for the validated input.

    mask_missing_values : ndarray
        Copy of the missing indicator matrix, restricted to (or modified
        on) the columns whose initial-imputer statistic is not NaN.

    X_missing_mask : ndarray, shape (n_samples, n_features)
        Input data's mask matrix indicating missing datapoints, where
        `n_samples` is the number of samples and `n_features` is the
        number of features.
    """
    # NaN entries are only accepted by validation when NaN is itself the
    # missing-value marker.
    force_all_finite = "allow-nan" if is_scalar_nan(self.missing_values) else True

    X = self._validate_data(
        X,
        dtype=FLOAT_DTYPES,
        order="F",
        reset=in_fit,
        force_all_finite=force_all_finite,
    )
    _check_inputs_dtype(X, self.missing_values)

    X_missing_mask = _get_mask(X, self.missing_values)
    mask_missing_values = X_missing_mask.copy()

    needs_fitting = self.initial_imputer_ is None
    if needs_fitting:
        self.initial_imputer_ = SimpleImputer(
            missing_values=self.missing_values,
            strategy=self.initial_strategy,
            fill_value=self.fill_value,
            keep_empty_features=self.keep_empty_features,
        ).set_output(transform="default")
        X_filled = self.initial_imputer_.fit_transform(X)
    else:
        X_filled = self.initial_imputer_.transform(X)

    # Indices of the features whose fitted statistic is not NaN.
    statistics = self.initial_imputer_.statistics_
    valid_mask = np.flatnonzero(~np.isnan(statistics))

    if self.keep_empty_features:
        # mark empty features as not missing and keep the original
        # imputation
        # NOTE(review): the assignment below sets the `valid_mask` columns
        # to True, which does not obviously match the comment above --
        # confirm the intended behavior against the callers.
        mask_missing_values[:, valid_mask] = True
        Xt = X
    else:
        # drop empty features
        Xt = X[:, valid_mask]
        mask_missing_values = mask_missing_values[:, valid_mask]

    return Xt, X_filled, mask_missing_values, X_missing_mask
|
||||
|
||||
@staticmethod
def _validate_limit(limit, limit_type, n_features):
    """Validate the limits (min/max) of the feature values.

    Converts scalar min/max limits to vectors of shape `(n_features,)`.

    Parameters
    ----------
    limit : scalar or array-like
        The user-specified limit (i.e, min_value or max_value). `None` is
        replaced by `np.inf` for a `'max'` limit and `-np.inf` otherwise.
    limit_type : {'max', 'min'}
        Type of limit to validate.
    n_features : int
        Number of features in the dataset.

    Returns
    -------
    limit : ndarray, shape(n_features,)
        Array of limits, one for each feature.

    Raises
    ------
    ValueError
        If the first dimension of the validated limit differs from
        `n_features`.
    """
    if limit is None:
        # No limit given: use the unbounded value for this side.
        limit = np.inf if limit_type == "max" else -np.inf

    if np.isscalar(limit):
        # Repeat a scalar limit once per feature.
        limit = np.full(n_features, limit)

    limit = check_array(limit, force_all_finite=False, copy=False, ensure_2d=False)

    if limit.shape[0] != n_features:
        raise ValueError(
            f"'{limit_type}_value' should be of "
            f"shape ({n_features},) when an array-like "
            f"is provided. Got {limit.shape}, instead."
        )
    return limit
|
||||
|
||||
@_fit_context(
    # The nested estimator of IterativeImputer is not validated yet.
    prefer_skip_nested_validation=False
)
def fit_transform(self, X, y=None, **params):
    """Fit the imputer on `X` and return the imputed version of `X`.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Input data, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    y : Ignored
        Not used, present for API consistency by convention.

    **params : dict
        Parameters routed to the `fit` method of the sub-estimator via the
        metadata routing API.

        .. versionadded:: 1.5
          Only available if
          `sklearn.set_config(enable_metadata_routing=True)` is set. See
          :ref:`Metadata Routing User Guide <metadata_routing>` for more
          details.

    Returns
    -------
    Xt : array-like, shape (n_samples, n_features)
        The imputed input data.
    """
    _raise_for_params(params, self, "fit")
    routed_params = process_routing(self, "fit", **params)

    # Keep an existing `random_state_` if a previous call already set one.
    self.random_state_ = getattr(
        self, "random_state_", check_random_state(self.random_state)
    )

    if self.estimator is not None:
        self._estimator = clone(self.estimator)
    else:
        from ..linear_model import BayesianRidge

        self._estimator = BayesianRidge()

    self.imputation_sequence_ = []
    self.initial_imputer_ = None

    X, Xt, mask_missing_values, complete_mask = self._initial_imputation(
        X, in_fit=True
    )

    super()._fit_indicator(complete_mask)
    X_indicator = super()._transform_indicator(complete_mask)

    # No iterative refinement is possible when iterations are disabled, when
    # the whole mask is set, or when there is a single feature: the initial
    # imputation is returned as is.
    if self.max_iter == 0 or np.all(mask_missing_values) or Xt.shape[1] == 1:
        self.n_iter_ = 0
        return super()._concatenate_indicator(Xt, X_indicator)

    n_limit_features = X.shape[1]
    self._min_value = self._validate_limit(self.min_value, "min", n_limit_features)
    self._max_value = self._validate_limit(self.max_value, "max", n_limit_features)

    if not np.all(np.greater(self._max_value, self._min_value)):
        raise ValueError("One (or more) features have min_value >= max_value.")

    # order in which to impute
    # note this is probably too slow for large feature data (d > 100000)
    # and a better way would be good.
    # see: https://goo.gl/KyCNwj and subsequent comments
    ordered_idx = self._get_ordered_idx(mask_missing_values)
    self.n_features_with_missing_ = len(ordered_idx)

    abs_corr_mat = self._get_abs_corr_mat(Xt)
    n_features = Xt.shape[1]

    if self.verbose > 0:
        print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,))
    t_start = time()

    # The stopping criterion is only evaluated when not sampling from the
    # posterior.
    track_convergence = not self.sample_posterior
    if track_convergence:
        Xt_previous = Xt.copy()
        normalized_tol = self.tol * np.max(np.abs(X[~mask_missing_values]))

    # `self.n_iter_` is the loop variable on purpose: after the loop it holds
    # the number of rounds that were actually run.
    for self.n_iter_ in range(1, self.max_iter + 1):
        if self.imputation_order == "random":
            ordered_idx = self._get_ordered_idx(mask_missing_values)

        for feat_idx in ordered_idx:
            neighbor_feat_idx = self._get_neighbor_feat_idx(
                n_features, feat_idx, abs_corr_mat
            )
            Xt, fitted_estimator = self._impute_one_feature(
                Xt,
                mask_missing_values,
                feat_idx,
                neighbor_feat_idx,
                estimator=None,
                fit_mode=True,
                params=routed_params.estimator.fit,
            )
            # Remember every fitted estimator so `transform` can replay them.
            self.imputation_sequence_.append(
                _ImputerTriplet(feat_idx, neighbor_feat_idx, fitted_estimator)
            )

        if self.verbose > 1:
            print(
                "[IterativeImputer] Ending imputation round "
                "%d/%d, elapsed time %0.2f"
                % (self.n_iter_, self.max_iter, time() - t_start)
            )

        if not track_convergence:
            continue

        change_norm = np.linalg.norm(Xt - Xt_previous, ord=np.inf, axis=None)
        if self.verbose > 0:
            print(
                "[IterativeImputer] Change: {}, scaled tolerance: {} ".format(
                    change_norm, normalized_tol
                )
            )
        if change_norm < normalized_tol:
            if self.verbose > 0:
                print("[IterativeImputer] Early stopping criterion reached.")
            break
        Xt_previous = Xt.copy()
    else:
        # Reached only when the loop ran out of rounds without a `break`.
        if track_convergence:
            warnings.warn(
                "[IterativeImputer] Early stopping criterion not reached.",
                ConvergenceWarning,
            )

    # Write the values of `X` back wherever the mask is not set.
    _assign_where(Xt, X, cond=~mask_missing_values)

    return super()._concatenate_indicator(Xt, X_indicator)
|
||||
|
||||
def transform(self, X):
    """Impute all missing values in `X`.

    Note that this is stochastic, and that if `random_state` is not fixed,
    repeated calls, or permuted input, results will differ.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The input data to complete.

    Returns
    -------
    Xt : array-like, shape (n_samples, n_features)
         The imputed input data.
    """
    check_is_fitted(self)

    X, Xt, mask_missing_values, complete_mask = self._initial_imputation(
        X, in_fit=False
    )
    X_indicator = super()._transform_indicator(complete_mask)

    # Without fitted rounds, or when the whole mask is set, the initial
    # imputation is the final answer.
    if self.n_iter_ == 0 or np.all(mask_missing_values):
        return super()._concatenate_indicator(Xt, X_indicator)

    imputations_per_round = len(self.imputation_sequence_) // self.n_iter_

    if self.verbose > 0:
        print("[IterativeImputer] Completing matrix with shape %s" % (X.shape,))
    t_start = time()

    # Replay the estimators stored during fit, in the same order.
    for step, triplet in enumerate(self.imputation_sequence_, start=1):
        Xt, _ = self._impute_one_feature(
            Xt,
            mask_missing_values,
            triplet.feat_idx,
            triplet.neighbor_feat_idx,
            estimator=triplet.estimator,
            fit_mode=False,
        )
        round_finished = step % imputations_per_round == 0
        if round_finished and self.verbose > 1:
            print(
                "[IterativeImputer] Ending imputation round "
                "%d/%d, elapsed time %0.2f"
                % (step // imputations_per_round, self.n_iter_, time() - t_start)
            )

    # Write the values of `X` back wherever the mask is not set.
    _assign_where(Xt, X, cond=~mask_missing_values)

    return super()._concatenate_indicator(Xt, X_indicator)
|
||||
|
||||
def fit(self, X, y=None, **fit_params):
    """Fit the imputer on `X` and return the fitted instance.

    This delegates to `fit_transform` and discards the transformed data.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Input data, where `n_samples` is the number of samples and
        `n_features` is the number of features.

    y : Ignored
        Not used, present for API consistency by convention.

    **fit_params : dict
        Parameters routed to the `fit` method of the sub-estimator via the
        metadata routing API.

        .. versionadded:: 1.5
          Only available if
          `sklearn.set_config(enable_metadata_routing=True)` is set. See
          :ref:`Metadata Routing User Guide <metadata_routing>` for more
          details.

    Returns
    -------
    self : object
        Fitted estimator.
    """
    # Only the fitted state matters here; the imputed array is thrown away.
    _ = self.fit_transform(X, **fit_params)
    return self
|
||||
|
||||
def get_feature_names_out(self, input_features=None):
    """Get output feature names for transformation.

    Parameters
    ----------
    input_features : array-like of str or None, default=None
        Input features.

        - If `input_features` is `None`, then `feature_names_in_` is
          used as feature names in. If `feature_names_in_` is not defined,
          then the following input feature names are generated:
          `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
        - If `input_features` is an array-like, then `input_features` must
          match `feature_names_in_` if `feature_names_in_` is defined.

    Returns
    -------
    feature_names_out : ndarray of str objects
        Transformed feature names.
    """
    check_is_fitted(self, "n_features_in_")
    validated_features = _check_feature_names_in(self, input_features)
    # The initial imputer decides which input features are kept; the
    # indicator names are appended by the base-class helper.
    imputed_names = self.initial_imputer_.get_feature_names_out(validated_features)
    return self._concatenate_indicator_feature_names_out(
        imputed_names, validated_features
    )
|
||||
|
||||
def get_metadata_routing(self):
    """Get metadata routing of this object.

    Please check :ref:`User Guide <metadata_routing>` on how the routing
    mechanism works.

    .. versionadded:: 1.5

    Returns
    -------
    routing : MetadataRouter
        A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating
        routing information.
    """
    # Metadata passed to this object's `fit` is forwarded to the
    # sub-estimator's `fit`.
    fit_to_fit = MethodMapping().add(callee="fit", caller="fit")
    owner_name = self.__class__.__name__
    return MetadataRouter(owner=owner_name).add(
        estimator=self.estimator,
        method_mapping=fit_to_fit,
    )
|
||||
402
.venv/lib/python3.12/site-packages/sklearn/impute/_knn.py
Normal file
402
.venv/lib/python3.12/site-packages/sklearn/impute/_knn.py
Normal file
@@ -0,0 +1,402 @@
|
||||
# Authors: Ashim Bhattarai <ashimb9@gmail.com>
|
||||
# Thomas J Fan <thomasjpfan@gmail.com>
|
||||
# License: BSD 3 clause
|
||||
|
||||
from numbers import Integral
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..base import _fit_context
|
||||
from ..metrics import pairwise_distances_chunked
|
||||
from ..metrics.pairwise import _NAN_METRICS
|
||||
from ..neighbors._base import _get_weights
|
||||
from ..utils._mask import _get_mask
|
||||
from ..utils._missing import is_scalar_nan
|
||||
from ..utils._param_validation import Hidden, Interval, StrOptions
|
||||
from ..utils.validation import FLOAT_DTYPES, _check_feature_names_in, check_is_fitted
|
||||
from ._base import _BaseImputer
|
||||
|
||||
|
||||
class KNNImputer(_BaseImputer):
    """Imputation for completing missing values using k-Nearest Neighbors.

    Each sample's missing values are imputed using the mean value from
    `n_neighbors` nearest neighbors found in the training set. Two samples are
    close if the features that neither is missing are close.

    Read more in the :ref:`User Guide <knnimpute>`.

    .. versionadded:: 0.22

    Parameters
    ----------
    missing_values : int, float, str, np.nan or None, default=np.nan
        The placeholder for the missing values. All occurrences of
        `missing_values` will be imputed. For pandas' dataframes with
        nullable integer dtypes with missing values, `missing_values`
        should be set to np.nan, since `pd.NA` will be converted to np.nan.

    n_neighbors : int, default=5
        Number of neighboring samples to use for imputation.

    weights : {'uniform', 'distance'} or callable, default='uniform'
        Weight function used in prediction.  Possible values:

        - 'uniform' : uniform weights. All points in each neighborhood are
          weighted equally.
        - 'distance' : weight points by the inverse of their distance.
          In this case, closer neighbors of a query point will have a
          greater influence than neighbors which are further away.
        - callable : a user-defined function which accepts an
          array of distances, and returns an array of the same shape
          containing the weights.

    metric : {'nan_euclidean'} or callable, default='nan_euclidean'
        Distance metric for searching neighbors. Possible values:

        - 'nan_euclidean'
        - callable : a user-defined function which conforms to the definition
          of ``func_metric(x, y, *, missing_values=np.nan)``. `x` and `y`
          correspond to a row (i.e. 1-D arrays) of `X` and `Y`, respectively.
          The callable should return a scalar distance value.

    copy : bool, default=True
        If True, a copy of X will be created. If False, imputation will
        be done in-place whenever possible.

    add_indicator : bool, default=False
        If True, a :class:`MissingIndicator` transform will stack onto the
        output of the imputer's transform. This allows a predictive estimator
        to account for missingness despite imputation. If a feature has no
        missing values at fit/train time, the feature won't appear on the
        missing indicator even if there are missing values at transform/test
        time.

    keep_empty_features : bool, default=False
        If True, features that consist exclusively of missing values when
        `fit` is called are returned in results when `transform` is called.
        The imputed value is always `0`.

        .. versionadded:: 1.2

    Attributes
    ----------
    indicator_ : :class:`~sklearn.impute.MissingIndicator`
        Indicator used to add binary indicators for missing values.
        ``None`` if add_indicator is False.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    SimpleImputer : Univariate imputer for completing missing values
        with simple strategies.
    IterativeImputer : Multivariate imputer that estimates values to impute for
        each feature with missing values from all the others.

    References
    ----------
    * `Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor
      Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing
      value estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17
      no. 6, 2001 Pages 520-525.
      <https://academic.oup.com/bioinformatics/article/17/6/520/272365>`_

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.impute import KNNImputer
    >>> X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]
    >>> imputer = KNNImputer(n_neighbors=2)
    >>> imputer.fit_transform(X)
    array([[1. , 2. , 4. ],
           [3. , 4. , 3. ],
           [5.5, 6. , 5. ],
           [8. , 8. , 7. ]])

    For a more detailed example see
    :ref:`sphx_glr_auto_examples_impute_plot_missing_values.py`.
    """

    _parameter_constraints: dict = {
        **_BaseImputer._parameter_constraints,
        "n_neighbors": [Interval(Integral, 1, None, closed="left")],
        "weights": [StrOptions({"uniform", "distance"}), callable, Hidden(None)],
        "metric": [StrOptions(set(_NAN_METRICS)), callable],
        "copy": ["boolean"],
    }

    def __init__(
        self,
        *,
        missing_values=np.nan,
        n_neighbors=5,
        weights="uniform",
        metric="nan_euclidean",
        copy=True,
        add_indicator=False,
        keep_empty_features=False,
    ):
        super().__init__(
            missing_values=missing_values,
            add_indicator=add_indicator,
            keep_empty_features=keep_empty_features,
        )
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.metric = metric
        self.copy = copy

    def _calc_impute(self, dist_pot_donors, n_neighbors, fit_X_col, mask_fit_X_col):
        """Helper function to impute a single column.

        Parameters
        ----------
        dist_pot_donors : ndarray of shape (n_receivers, n_potential_donors)
            Distance matrix between the receivers and potential donors from
            training set. There must be at least one non-nan distance between
            a receiver and a potential donor.

        n_neighbors : int
            Number of neighbors to consider.

        fit_X_col : ndarray of shape (n_potential_donors,)
            Column of potential donors from training set.

        mask_fit_X_col : ndarray of shape (n_potential_donors,)
            Missing mask for fit_X_col.

        Returns
        -------
        imputed_values: ndarray of shape (n_receivers,)
            Imputed values for receiver.
        """
        # Get donors: column indices of the `n_neighbors` selected donors
        # for every receiver.
        donors_idx = np.argpartition(dist_pot_donors, n_neighbors - 1, axis=1)[
            :, :n_neighbors
        ]

        # Get weight matrix from distance matrix
        donors_dist = dist_pot_donors[
            np.arange(donors_idx.shape[0])[:, None], donors_idx
        ]

        weight_matrix = _get_weights(donors_dist, self.weights)

        # A donor whose distance to the receiver is nan must not contribute
        # to the average.
        if weight_matrix is not None:
            # fill nans with zeros
            weight_matrix[np.isnan(weight_matrix)] = 0.0
        else:
            # No weight matrix was returned (the uniform case): build explicit
            # 0/1 weights so that donors with a nan distance are excluded here
            # as well instead of being averaged in with full weight.
            weight_matrix = np.ones_like(donors_dist)
            weight_matrix[np.isnan(donors_dist)] = 0.0

        # Retrieve donor values and calculate kNN average
        donors = fit_X_col.take(donors_idx)
        donors_mask = mask_fit_X_col.take(donors_idx)
        donors = np.ma.array(donors, mask=donors_mask)

        return np.ma.average(donors, axis=1, weights=weight_matrix).data

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the imputer on X.

        Parameters
        ----------
        X : array-like shape of (n_samples, n_features)
            Input data, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            The fitted `KNNImputer` class instance.
        """
        # Check data integrity and calling arguments.
        # NaN is only tolerated in the input when NaN is the missing marker.
        if not is_scalar_nan(self.missing_values):
            force_all_finite = True
        else:
            force_all_finite = "allow-nan"

        X = self._validate_data(
            X,
            accept_sparse=False,
            dtype=FLOAT_DTYPES,
            force_all_finite=force_all_finite,
            copy=self.copy,
        )

        # The training data itself is stored: it provides the donors used
        # by `transform`.
        self._fit_X = X
        self._mask_fit_X = _get_mask(self._fit_X, self.missing_values)
        # True for the columns that have at least one observed value.
        self._valid_mask = ~np.all(self._mask_fit_X, axis=0)

        super()._fit_indicator(self._mask_fit_X)

        return self

    def transform(self, X):
        """Impute all missing values in X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input data to complete.

        Returns
        -------
        X : array-like of shape (n_samples, n_output_features)
            The imputed dataset. `n_output_features` is the number of features
            that is not always missing during `fit`.
        """

        check_is_fitted(self)
        if not is_scalar_nan(self.missing_values):
            force_all_finite = True
        else:
            force_all_finite = "allow-nan"
        X = self._validate_data(
            X,
            accept_sparse=False,
            dtype=FLOAT_DTYPES,
            force_writeable=True,
            force_all_finite=force_all_finite,
            copy=self.copy,
            reset=False,
        )

        mask = _get_mask(X, self.missing_values)
        mask_fit_X = self._mask_fit_X
        valid_mask = self._valid_mask

        X_indicator = super()._transform_indicator(mask)

        # Removes columns where the training data is all nan
        if not np.any(mask):
            # No missing values in X
            if self.keep_empty_features:
                Xc = X
                Xc[:, ~valid_mask] = 0
            else:
                Xc = X[:, valid_mask]

            # Even if there are no missing values in X, we still concatenate Xc
            # with the missing value indicator matrix, X_indicator.
            # This is to ensure that the output maintains consistency in terms
            # of columns, regardless of whether missing values exist in X or not.
            return super()._concatenate_indicator(Xc, X_indicator)

        # Rows of X that contain at least one missing value.
        row_missing_idx = np.flatnonzero(mask.any(axis=1))

        non_missing_fix_X = np.logical_not(mask_fit_X)

        # Maps from indices from X to indices in dist matrix
        dist_idx_map = np.zeros(X.shape[0], dtype=int)
        dist_idx_map[row_missing_idx] = np.arange(row_missing_idx.shape[0])

        def process_chunk(dist_chunk, start):
            # Rows of X covered by this chunk of the distance matrix.
            row_missing_chunk = row_missing_idx[start : start + len(dist_chunk)]

            # Find and impute missing by column
            for col in range(X.shape[1]):
                if not valid_mask[col]:
                    # column was all missing during training
                    continue

                col_mask = mask[row_missing_chunk, col]
                if not np.any(col_mask):
                    # column has no missing values
                    continue

                (potential_donors_idx,) = np.nonzero(non_missing_fix_X[:, col])

                # receivers_idx are indices in X
                receivers_idx = row_missing_chunk[np.flatnonzero(col_mask)]

                # distances for samples that needed imputation for column
                dist_subset = dist_chunk[dist_idx_map[receivers_idx] - start][
                    :, potential_donors_idx
                ]

                # receivers with all nan distances impute with mean
                all_nan_dist_mask = np.isnan(dist_subset).all(axis=1)
                all_nan_receivers_idx = receivers_idx[all_nan_dist_mask]

                if all_nan_receivers_idx.size:
                    col_mean = np.ma.array(
                        self._fit_X[:, col], mask=mask_fit_X[:, col]
                    ).mean()
                    X[all_nan_receivers_idx, col] = col_mean

                    if len(all_nan_receivers_idx) == len(receivers_idx):
                        # all receivers imputed with mean
                        continue

                    # receivers with at least one defined distance
                    receivers_idx = receivers_idx[~all_nan_dist_mask]
                    dist_subset = dist_chunk[dist_idx_map[receivers_idx] - start][
                        :, potential_donors_idx
                    ]

                # Never ask for more neighbors than there are donors.
                n_neighbors = min(self.n_neighbors, len(potential_donors_idx))
                value = self._calc_impute(
                    dist_subset,
                    n_neighbors,
                    self._fit_X[potential_donors_idx, col],
                    mask_fit_X[potential_donors_idx, col],
                )
                X[receivers_idx, col] = value

        # process in fixed-memory chunks
        gen = pairwise_distances_chunked(
            X[row_missing_idx, :],
            self._fit_X,
            metric=self.metric,
            missing_values=self.missing_values,
            force_all_finite=force_all_finite,
            reduce_func=process_chunk,
        )
        for chunk in gen:
            # process_chunk modifies X in place. No return value.
            pass

        if self.keep_empty_features:
            Xc = X
            Xc[:, ~valid_mask] = 0
        else:
            Xc = X[:, valid_mask]

        return super()._concatenate_indicator(Xc, X_indicator)

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self, "n_features_in_")
        input_features = _check_feature_names_in(self, input_features)
        # Only the columns selected by `_valid_mask` are named.
        names = input_features[self._valid_mask]
        return self._concatenate_indicator_feature_names_out(names, input_features)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,107 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.impute._base import _BaseImputer
|
||||
from sklearn.impute._iterative import _assign_where
|
||||
from sklearn.utils._mask import _get_mask
|
||||
from sklearn.utils._testing import _convert_container, assert_allclose
|
||||
|
||||
|
||||
@pytest.fixture
def data():
    """Return a random (10, 2) array whose even-indexed rows are all NaN."""
    samples = np.random.randn(10, 2)
    samples[0::2, :] = np.nan
    return samples
|
||||
|
||||
|
||||
class NoFitIndicatorImputer(_BaseImputer):
    """Imputer whose `fit` never calls `_fit_indicator`."""

    def fit(self, X, y=None):
        # Deliberately leaves the indicator unfitted.
        return self

    def transform(self, X, y=None):
        indicator_columns = self._transform_indicator(X)
        return self._concatenate_indicator(X, indicator_columns)
|
||||
|
||||
|
||||
class NoTransformIndicatorImputer(_BaseImputer):
    """Imputer that fits the indicator but passes `None` when concatenating."""

    def fit(self, X, y=None):
        nan_mask = _get_mask(X, value_to_mask=np.nan)
        super()._fit_indicator(nan_mask)
        return self

    def transform(self, X, y=None):
        # Deliberately skips `_transform_indicator`.
        indicator_columns = None
        return self._concatenate_indicator(X, indicator_columns)
|
||||
|
||||
|
||||
class NoPrecomputedMaskFit(_BaseImputer):
    """Imputer that hands the raw data, not a mask, to `_fit_indicator`."""

    def fit(self, X, y=None):
        # Deliberately passes `X` itself instead of a precomputed mask.
        self._fit_indicator(X)
        return self

    def transform(self, X):
        indicator_columns = self._transform_indicator(X)
        return self._concatenate_indicator(X, indicator_columns)
|
||||
|
||||
|
||||
class NoPrecomputedMaskTransform(_BaseImputer):
    """Imputer that fits on a mask but transforms with the raw data."""

    def fit(self, X, y=None):
        nan_mask = _get_mask(X, value_to_mask=np.nan)
        self._fit_indicator(nan_mask)
        return self

    def transform(self, X):
        # Deliberately passes `X` itself instead of a precomputed mask.
        indicator_columns = self._transform_indicator(X)
        return self._concatenate_indicator(X, indicator_columns)
|
||||
|
||||
|
||||
def test_base_imputer_not_fit(data):
    """Transforming the indicator before fitting it must raise."""
    imputer = NoFitIndicatorImputer(add_indicator=True)
    expected_error = "Make sure to call _fit_indicator before _transform_indicator"

    call_paths = (
        lambda: imputer.fit(data).transform(data),
        lambda: imputer.fit_transform(data),
    )
    for run in call_paths:
        with pytest.raises(ValueError, match=expected_error):
            run()
|
||||
|
||||
|
||||
def test_base_imputer_not_transform(data):
    """Concatenating without a transformed indicator must raise."""
    imputer = NoTransformIndicatorImputer(add_indicator=True)
    expected_error = (
        "Call _fit_indicator and _transform_indicator in the imputer implementation"
    )

    call_paths = (
        lambda: imputer.fit(data).transform(data),
        lambda: imputer.fit_transform(data),
    )
    for run in call_paths:
        with pytest.raises(ValueError, match=expected_error):
            run()
|
||||
|
||||
|
||||
def test_base_no_precomputed_mask_fit(data):
    """Fitting the indicator on non-mask data must raise."""
    imputer = NoPrecomputedMaskFit(add_indicator=True)
    expected_error = "precomputed is True but the input data is not a mask"

    for method_name in ("fit", "fit_transform"):
        with pytest.raises(ValueError, match=expected_error):
            getattr(imputer, method_name)(data)
|
||||
|
||||
|
||||
def test_base_no_precomputed_mask_transform(data):
    """Transforming the indicator with non-mask data must raise."""
    imputer = NoPrecomputedMaskTransform(add_indicator=True)
    expected_error = "precomputed is True but the input data is not a mask"

    # Fitting on its own is expected to succeed.
    imputer.fit(data)

    for method_name in ("transform", "fit_transform"):
        with pytest.raises(ValueError, match=expected_error):
            getattr(imputer, method_name)(data)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("X1_type", ["array", "dataframe"])
def test_assign_where(X1_type):
    """Check the behaviour of the private helpers `_assign_where`."""
    rng = np.random.RandomState(0)
    shape = (10, 5)

    # Draw order matters for reproducibility: target, source, then mask.
    target = _convert_container(rng.randn(*shape), constructor_name=X1_type)
    source = rng.randn(*shape)
    where = rng.randint(0, 2, size=shape).astype(bool)

    _assign_where(target, source, where)

    target_values = target.to_numpy() if X1_type == "dataframe" else target
    assert_allclose(target_values[where], source[where])
|
||||
@@ -0,0 +1,220 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.experimental import enable_iterative_imputer # noqa
|
||||
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
|
||||
from sklearn.utils._testing import (
|
||||
assert_allclose,
|
||||
assert_allclose_dense_sparse,
|
||||
assert_array_equal,
|
||||
)
|
||||
from sklearn.utils.fixes import CSR_CONTAINERS
|
||||
|
||||
|
||||
def imputers():
    """Return one fresh instance of every imputer under test."""
    iterative = IterativeImputer(tol=0.1)
    knn = KNNImputer()
    simple = SimpleImputer()
    return [iterative, knn, simple]
|
||||
|
||||
|
||||
def sparse_imputers():
    """Return fresh instances of the imputers used in the sparse tests."""
    simple = SimpleImputer()
    return [simple]
|
||||
|
||||
|
||||
# ConvergenceWarning will be raised by the IterativeImputer
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
def test_imputation_missing_value_in_test_array(imputer):
    # [Non Regression Test for issue #13968] Missing value in test set should
    # not throw an error and return a finite dataset
    X_train = [[1], [2]]
    X_test = [[3], [np.nan]]
    imputer.set_params(add_indicator=True)
    fitted = imputer.fit(X_train)
    fitted.transform(X_test)
|
||||
|
||||
|
||||
# ConvergenceWarning will be raised by the IterativeImputer
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("marker", [np.nan, -1, 0])
@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
def test_imputers_add_indicator(marker, imputer):
    """Check the indicator columns appended when `add_indicator=True`."""
    rows = [
        [marker, 1, 5, marker, 1],
        [2, marker, 1, marker, 2],
        [6, 3, marker, marker, 3],
        [1, 2, 9, marker, 4],
    ]
    X = np.array(rows)
    # One indicator column per feature that contains the marker (0 to 3).
    X_true_indicator = np.array(
        [
            [1.0, 0.0, 0.0, 1.0],
            [0.0, 1.0, 0.0, 1.0],
            [0.0, 0.0, 1.0, 1.0],
            [0.0, 0.0, 0.0, 1.0],
        ]
    )
    n_indicator_cols = X_true_indicator.shape[1]

    imputer.set_params(missing_values=marker, add_indicator=True)
    X_with_indicator = imputer.fit_transform(X)
    assert_allclose(X_with_indicator[:, -n_indicator_cols:], X_true_indicator)
    assert_array_equal(imputer.indicator_.features_, np.array([0, 1, 2, 3]))

    # The leading columns must match the output obtained without indicator.
    imputer.set_params(add_indicator=False)
    X_without_indicator = imputer.fit_transform(X)
    assert_allclose(X_with_indicator[:, :-n_indicator_cols], X_without_indicator)
|
||||
|
||||
|
||||
# ConvergenceWarning will be raised by the IterativeImputer
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("marker", [np.nan, -1])
@pytest.mark.parametrize(
    "imputer", sparse_imputers(), ids=lambda x: x.__class__.__name__
)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_imputers_add_indicator_sparse(imputer, marker, csr_container):
    """Sparse counterpart of the `add_indicator=True` check."""
    rows = [
        [marker, 1, 5, marker, 1],
        [2, marker, 1, marker, 2],
        [6, 3, marker, marker, 3],
        [1, 2, 9, marker, 4],
    ]
    X = csr_container(rows)
    # One indicator column per feature that contains the marker (0 to 3).
    X_true_indicator = csr_container(
        [
            [1.0, 0.0, 0.0, 1.0],
            [0.0, 1.0, 0.0, 1.0],
            [0.0, 0.0, 1.0, 1.0],
            [0.0, 0.0, 0.0, 1.0],
        ]
    )

    imputer.set_params(missing_values=marker, add_indicator=True)
    X_with_indicator = imputer.fit_transform(X)
    assert_allclose_dense_sparse(X_with_indicator[:, -4:], X_true_indicator)
    assert_array_equal(imputer.indicator_.features_, np.array([0, 1, 2, 3]))

    # The leading columns must match the output obtained without indicator.
    imputer.set_params(add_indicator=False)
    X_without_indicator = imputer.fit_transform(X)
    assert_allclose_dense_sparse(X_with_indicator[:, :-4], X_without_indicator)
|
||||
|
||||
|
||||
# ConvergenceWarning will be raised by the IterativeImputer
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
@pytest.mark.parametrize("add_indicator", [True, False])
def test_imputers_pandas_na_integer_array_support(imputer, add_indicator):
    # Test pandas IntegerArray with pd.NA
    pd = pytest.importorskip("pandas")
    marker = np.nan
    imputer = imputer.set_params(add_indicator=add_indicator, missing_values=marker)

    rows = [
        [marker, 1, 5, marker, 1],
        [2, marker, 1, marker, 2],
        [6, 3, marker, marker, 3],
        [1, 2, 9, marker, 4],
    ]
    X = np.array(rows)

    # Reference result: fit on the plain numpy array.
    X_trans_expected = imputer.fit_transform(X)

    # Same data as a dataframe of IntegerArrays holding pd.NA.
    column_names = ["a", "b", "c", "d", "e"]
    X_df = pd.DataFrame(X, dtype="Int16", columns=column_names)
    X_trans_from_df = imputer.fit_transform(X_df)

    assert_allclose(X_trans_expected, X_trans_from_df)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
@pytest.mark.parametrize("add_indicator", [True, False])
def test_imputers_feature_names_out_pandas(imputer, add_indicator):
    """Check feature names out for imputers."""
    pd = pytest.importorskip("pandas")
    marker = np.nan
    imputer = imputer.set_params(add_indicator=add_indicator, missing_values=marker)

    # Column "e" holds only the marker; "a", "b" and "d" hold it in one row.
    rows = [
        [marker, 1, 5, 3, marker, 1],
        [2, marker, 1, 4, marker, 2],
        [6, 3, 7, marker, marker, 3],
        [1, 2, 9, 8, marker, 4],
    ]
    X_df = pd.DataFrame(np.array(rows), columns=["a", "b", "c", "d", "e", "f"])
    imputer.fit(X_df)

    names = imputer.get_feature_names_out()

    expected_names = ["a", "b", "c", "d", "f"]
    if add_indicator:
        expected_names += [
            "missingindicator_a",
            "missingindicator_b",
            "missingindicator_d",
            "missingindicator_e",
        ]
    assert_array_equal(expected_names, names)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("keep_empty_features", [True, False])
@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
def test_keep_empty_features(imputer, keep_empty_features):
    """Check that the imputer keeps features with only missing values."""
    X = np.array([[np.nan, 1], [np.nan, 2], [np.nan, 3]])
    imputer = imputer.set_params(
        add_indicator=False, keep_empty_features=keep_empty_features
    )

    # The first column is entirely missing: it is either kept or dropped.
    n_samples, n_features = X.shape
    if keep_empty_features:
        expected_shape = (n_samples, n_features)
    else:
        expected_shape = (n_samples, n_features - 1)

    for method_name in ("fit_transform", "transform"):
        X_imputed = getattr(imputer, method_name)(X)
        assert X_imputed.shape == expected_shape
|
||||
|
||||
|
||||
@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
@pytest.mark.parametrize("missing_value_test", [np.nan, 1])
def test_imputation_adds_missing_indicator_if_add_indicator_is_true(
    imputer, missing_value_test
):
    """Check that missing indicator always exists when add_indicator=True.

    Non-regression test for gh-26590.
    """
    X_train = np.array([[0, np.nan], [1, 2]])
    # The test data has either a missing entry (np.nan) or a regular value (1)
    # at the position that was missing during fit.
    X_test = np.array([[0, missing_value_test], [1, 2]])

    imputer.set_params(add_indicator=True)
    imputer.fit(X_train)
    with_indicator = imputer.transform(X_test)
    assert with_indicator.shape == (2, 3)

    imputer.set_params(add_indicator=False)
    imputer.fit(X_train)
    without_indicator = imputer.transform(X_test)
    assert without_indicator.shape == (2, 2)

    # Apart from the trailing indicator column, both outputs must agree.
    assert_allclose(with_indicator[:, :-1], without_indicator)

    expected_missing_indicator = [1, 0] if np.isnan(missing_value_test) else [0, 0]
    assert_allclose(with_indicator[:, -1], expected_missing_indicator)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,547 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn import config_context
|
||||
from sklearn.impute import KNNImputer
|
||||
from sklearn.metrics.pairwise import nan_euclidean_distances, pairwise_distances
|
||||
from sklearn.neighbors import KNeighborsRegressor
|
||||
from sklearn.utils._testing import assert_allclose
|
||||
|
||||
|
||||
@pytest.mark.parametrize("weights", ["uniform", "distance"])
@pytest.mark.parametrize("n_neighbors", range(1, 6))
def test_knn_imputer_shape(weights, n_neighbors):
    """The imputed matrix keeps the input shape for all weights/neighbors."""
    expected_shape = (10, 2)
    X = np.random.rand(*expected_shape)
    X[0, 0] = np.nan

    X_imputed = KNNImputer(n_neighbors=n_neighbors, weights=weights).fit_transform(X)
    assert X_imputed.shape == expected_shape
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_default_with_invalid_input(na):
    """Check that KNNImputer raises a ValueError on invalid input values.

    Three situations are exercised:

    * ``fit`` on a matrix containing ``np.inf``;
    * ``transform`` on a matrix containing ``np.inf`` after fitting on valid
      data;
    * ``fit`` with ``missing_values=0`` on a matrix containing ``np.nan``.
    """
    inf_or_nan_msg = "Input X contains (infinity|NaN)"

    # The same inf-containing matrix is used for both the fit and the
    # transform checks.
    X_with_inf = np.array(
        [
            [np.inf, 1, 1, 2, na],
            [2, 1, 2, 2, 3],
            [3, 2, 3, 3, 8],
            [na, 6, 0, 5, 13],
            [na, 7, 0, 7, 8],
            [6, 6, 2, 5, 7],
        ]
    )

    # Test with inf present
    with pytest.raises(ValueError, match=inf_or_nan_msg):
        KNNImputer(missing_values=na).fit(X_with_inf)

    # Test with inf present in matrix passed in transform()
    X_fit = np.array(
        [
            [0, 1, 1, 2, na],
            [2, 1, 2, 2, 3],
            [3, 2, 3, 3, 8],
            [na, 6, 0, 5, 13],
            [na, 7, 0, 7, 8],
            [6, 6, 2, 5, 7],
        ]
    )
    imputer = KNNImputer(missing_values=na).fit(X_fit)
    with pytest.raises(ValueError, match=inf_or_nan_msg):
        imputer.transform(X_with_inf)

    # Test with missing_values=0 when NaN present
    imputer = KNNImputer(missing_values=0, n_neighbors=2, weights="uniform")
    X = np.array(
        [
            [np.nan, 0, 0, 0, 5],
            [np.nan, 1, 0, np.nan, 3],
            [np.nan, 2, 0, 0, 0],
            [np.nan, 6, 0, 5, 13],
        ]
    )
    with pytest.raises(ValueError, match="Input X contains NaN"):
        imputer.fit(X)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_removes_all_na_features(na):
    """A feature that is entirely missing during fit is dropped by transform."""
    # The third column (index 2) only contains missing values.
    X = np.array(
        [
            [1, 1, na, 1, 1, 1.0],
            [2, 3, na, 2, 2, 2],
            [3, 4, na, 3, 3, na],
            [6, 4, na, na, 6, 6],
        ]
    )
    knn = KNNImputer(missing_values=na, n_neighbors=2).fit(X)

    X_out = knn.transform(X)
    assert not np.isnan(X_out).any()
    assert X_out.shape == (4, 5)

    # Data without missing values passes through, minus the dropped column.
    X_test = np.arange(0, 12).reshape(2, 6)
    kept_columns = [0, 1, 3, 4, 5]
    X_out = knn.transform(X_test)
    assert_allclose(X_test[:, kept_columns], X_out)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_zero_nan_imputes_the_same(na):
    """Imputation results do not depend on the chosen missing-value marker."""
    # Same matrix twice: once with 0 as the marker, once with `na`.
    X_zero = np.array(
        [
            [1, 0, 1, 1, 1.0],
            [2, 2, 2, 2, 2],
            [3, 3, 3, 3, 0],
            [6, 6, 0, 6, 6],
        ]
    )
    X_nan = np.array(
        [
            [1, na, 1, 1, 1.0],
            [2, 2, 2, 2, 2],
            [3, 3, 3, 3, na],
            [6, 6, na, 6, 6],
        ]
    )
    X_imputed = np.array(
        [
            [1, 2.5, 1, 1, 1.0],
            [2, 2, 2, 2, 2],
            [3, 3, 3, 3, 1.5],
            [6, 6, 2.5, 6, 6],
        ]
    )

    shared_params = {"n_neighbors": 2, "weights": "uniform"}
    imputer_zero = KNNImputer(missing_values=0, **shared_params)
    imputer_nan = KNNImputer(missing_values=na, **shared_params)

    assert_allclose(imputer_zero.fit_transform(X_zero), X_imputed)
    assert_allclose(
        imputer_zero.fit_transform(X_zero), imputer_nan.fit_transform(X_nan)
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_verify(na):
    """Check KNNImputer with default parameters against hand-computed values."""
    # Case 1: an imputable matrix.
    X = np.array(
        [
            [1, 0, 0, 1],
            [2, 1, 2, na],
            [3, 2, 3, na],
            [na, 4, 5, 5],
            [6, na, 6, 7],
            [8, 8, 8, 8],
            [16, 15, 18, 19],
        ]
    )
    expected = np.array(
        [
            [1, 0, 0, 1],
            [2, 1, 2, 8],
            [3, 2, 3, 8],
            [4, 4, 5, 5],
            [6, 3, 6, 7],
            [8, 8, 8, 8],
            [16, 15, 18, 19],
        ]
    )
    assert_allclose(KNNImputer(missing_values=na).fit_transform(X), expected)

    # Case 2: not enough neighbors for the last column.
    X = np.array(
        [
            [1, 0, 0, na],
            [2, 1, 2, na],
            [3, 2, 3, na],
            [4, 4, 5, na],
            [6, 7, 6, na],
            [8, 8, 8, na],
            [20, 20, 20, 20],
            [22, 22, 22, 22],
        ]
    )
    # Not enough neighbors, use column mean from training
    column_mean = (20 + 22) / 2
    expected = np.array(
        [
            [1, 0, 0, column_mean],
            [2, 1, 2, column_mean],
            [3, 2, 3, column_mean],
            [4, 4, 5, column_mean],
            [6, 7, 6, column_mean],
            [8, 8, 8, column_mean],
            [20, 20, 20, 20],
            [22, 22, 22, 22],
        ]
    )
    assert_allclose(KNNImputer(missing_values=na).fit_transform(X), expected)

    # Case 3: the data passed to fit() and transform() differ.
    X_fit = np.array([[0, 0], [na, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 16]])
    X_new = np.array([[1, 0], [3, 2], [4, na]])

    imputed_2_1 = (0 + 3 + 6 + 7 + 8) / 5
    expected_new = np.array([[1, 0], [3, 2], [4, imputed_2_1]])

    fitted = KNNImputer(missing_values=na).fit(X_fit)
    assert_allclose(fitted.transform(X_new), expected_new)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_one_n_neighbors(na):
    """Check the imputed values when a single neighbor is used."""
    data = np.array([[0, 0], [na, 2], [4, 3], [5, na], [7, 7], [na, 8], [14, 13]])
    expected = np.array([[0, 0], [4, 2], [4, 3], [5, 3], [7, 7], [7, 8], [14, 13]])

    result = KNNImputer(n_neighbors=1, missing_values=na).fit_transform(data)
    assert_allclose(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_all_samples_are_neighbors(na):
    """Check imputation when n_neighbors is n_samples - 1, then n_samples."""
    X = np.array([[0, 0], [na, 2], [4, 3], [5, na], [7, 7], [na, 8], [14, 13]])
    X_imputed = np.array([[0, 0], [6, 2], [4, 3], [5, 5.5], [7, 7], [6, 8], [14, 13]])
    n_samples = X.shape[0]

    # Every other sample is a neighbor.
    imputer = KNNImputer(n_neighbors=n_samples - 1, missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    # One more neighbor is requested; the same result is expected.
    imputer_plus1 = KNNImputer(n_neighbors=n_samples, missing_values=na)
    assert_allclose(imputer_plus1.fit_transform(X), X_imputed)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_weight_uniform(na):
    """Uniform weighting gives the same result however it is specified."""
    X = np.array([[0, 0], [na, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]])
    X_imputed_uniform = np.array(
        [[0, 0], [5, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]]
    )

    def no_weight(dist):
        # Callable weight that returns None.
        return None

    def uniform_weight(dist):
        # Callable weight giving every neighbor the same weight.
        return np.ones_like(dist)

    # "uniform" (or unweighted), then the two callable variants, in that order.
    for weights in ("uniform", no_weight, uniform_weight):
        imputer = KNNImputer(weights=weights, missing_values=na)
        assert_allclose(imputer.fit_transform(X), X_imputed_uniform)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [np.nan, -1])
def test_knn_imputer_weight_distance(na):
    """Compare distance-weighted KNN imputation with hand-computed values."""
    # ------------------------------------------------------------------
    # Single missing entry, default number of neighbors
    # ------------------------------------------------------------------
    X = np.array([[0, 0], [na, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]])

    # Reference value from a distance-weighted KNeighborsRegressor fitted on
    # the rows whose first feature is observed.
    observed_rows = [0, 2, 3, 4, 5, 6]
    regressor = KNeighborsRegressor(metric="euclidean", weights="distance")
    regressor.fit(X[observed_rows, 1:], X[observed_rows, 0])
    regressor_value = regressor.predict(X[1:2, 1:])[0]

    # Reference value from a manual inverse-distance weighted average.
    donor_rows = [0, 2, 3, 4, 5]
    dist = nan_euclidean_distances(X[1:2, :], X, missing_values=na)
    inverse_dist = 1 / dist[:, donor_rows].ravel()
    manual_value = np.average(X[donor_rows, 0], weights=inverse_dist)

    def expected_with(value):
        # Expected output with `value` filled in at the missing position.
        return np.array(
            [[0, 0], [value, 2], [4, 3], [5, 6], [7, 7], [9, 8], [11, 10]]
        )

    imputer = KNNImputer(weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), expected_with(manual_value))
    assert_allclose(imputer.fit_transform(X), expected_with(regressor_value))

    # ------------------------------------------------------------------
    # weights="distance" and n_neighbors=2
    # ------------------------------------------------------------------
    X = np.array(
        [
            [na, 0, 0],
            [2, 1, 2],
            [3, 2, 3],
            [4, 5, 5],
        ]
    )

    # neighbors are rows 1, 2, the nan_euclidean_distances are:
    dist_0_1 = np.sqrt((3 / 2) * ((1 - 0) ** 2 + (2 - 0) ** 2))
    dist_0_2 = np.sqrt((3 / 2) * ((2 - 0) ** 2 + (3 - 0) ** 2))
    imputed_value = np.average([2, 3], weights=[1 / dist_0_1, 1 / dist_0_2])

    X_imputed = np.array(
        [
            [imputed_value, 0, 0],
            [2, 1, 2],
            [3, 2, 3],
            [4, 5, 5],
        ]
    )

    imputer = KNNImputer(n_neighbors=2, weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    # ------------------------------------------------------------------
    # Varying missingness patterns
    # ------------------------------------------------------------------
    X = np.array(
        [
            [1, 0, 0, 1],
            [0, na, 1, na],
            [1, 1, 1, na],
            [0, 1, 0, 0],
            [0, 0, 0, 0],
            [1, 0, 1, 1],
            [10, 10, 10, 10],
        ]
    )

    # Donor rows used for column 1 and column 3 respectively.
    col1_donor_rows = [0, 2, 3, 4, 5]
    col3_donor_rows = [0, 3, 4, 5, 6]

    # Inverse-distance weights of the donor neighbors.
    dist = nan_euclidean_distances(X, missing_values=na)
    r1c1_nbor_wt = 1 / dist[1, col1_donor_rows]
    r1c3_nbor_wt = 1 / dist[1, col3_donor_rows]
    r2c3_nbor_wt = 1 / dist[2, col3_donor_rows]

    # Donor values, with invalid entries masked out.
    col1_donor_values = np.ma.masked_invalid(X[col1_donor_rows, 1]).copy()
    col3_donor_values = np.ma.masked_invalid(X[col3_donor_rows, 3]).copy()

    # Final imputed values.
    r1c1_imp = np.ma.average(col1_donor_values, weights=r1c1_nbor_wt)
    r1c3_imp = np.ma.average(col3_donor_values, weights=r1c3_nbor_wt)
    r2c3_imp = np.ma.average(col3_donor_values, weights=r2c3_nbor_wt)

    X_imputed = np.array(
        [
            [1, 0, 0, 1],
            [0, r1c1_imp, 1, r1c3_imp],
            [1, 1, 1, r2c3_imp],
            [0, 1, 0, 0],
            [0, 0, 0, 0],
            [1, 0, 1, 1],
            [10, 10, 10, 10],
        ]
    )

    imputer = KNNImputer(weights="distance", missing_values=na)
    assert_allclose(imputer.fit_transform(X), X_imputed)

    # ------------------------------------------------------------------
    # Larger example, distances computed with pairwise_distances
    # ------------------------------------------------------------------
    X = np.array(
        [
            [0, 0, 0, na],
            [1, 1, 1, na],
            [2, 2, na, 2],
            [3, 3, 3, 3],
            [4, 4, 4, 4],
            [5, 5, 5, 5],
            [6, 6, 6, 6],
            [na, 7, 7, 7],
        ]
    )

    dist = pairwise_distances(
        X, metric="nan_euclidean", squared=False, missing_values=na
    )

    # Weighted averages of the donor values, weights being inverse distances.
    r2c2_donor_rows = [0, 1, 3, 4, 5]
    r0c3 = np.average(X[2:-1, -1], weights=1.0 / dist[0, 2:-1])
    r1c3 = np.average(X[2:-1, -1], weights=1.0 / dist[1, 2:-1])
    r2c2 = np.average(X[r2c2_donor_rows, 2], weights=1.0 / dist[2, r2c2_donor_rows])
    r7c0 = np.average(X[2:7, 0], weights=1.0 / dist[7, 2:7])

    X_imputed = np.array(
        [
            [0, 0, 0, r0c3],
            [1, 1, 1, r1c3],
            [2, 2, r2c2, 2],
            [3, 3, 3, 3],
            [4, 4, 4, 4],
            [5, 5, 5, 5],
            [6, 6, 6, 6],
            [r7c0, 7, 7, 7],
        ]
    )

    imputer_comp_wt = KNNImputer(missing_values=na, weights="distance")
    assert_allclose(imputer_comp_wt.fit_transform(X), X_imputed)
|
||||
|
||||
|
||||
def test_knn_imputer_callable_metric():
    """Check KNNImputer with a user-supplied callable metric."""

    # Define callable metric that returns the l1 norm:
    def custom_callable(x, y, missing_values=np.nan, squared=False):
        x = np.ma.array(x, mask=np.isnan(x))
        y = np.ma.array(y, mask=np.isnan(y))
        return np.nansum(np.abs(x - y))

    X = np.array([[4, 3, 3, np.nan], [6, 9, 6, 9], [4, 8, 6, 9], [np.nan, 9, 11, 10.0]])

    # Expected imputed values at positions (0, 3) and (3, 0).
    imputed_0_3 = (9 + 9) / 2
    imputed_3_0 = (6 + 4) / 2
    X_imputed = np.array(
        [[4, 3, 3, imputed_0_3], [6, 9, 6, 9], [4, 8, 6, 9], [imputed_3_0, 9, 11, 10.0]]
    )

    imputer = KNNImputer(n_neighbors=2, metric=custom_callable)
    assert_allclose(imputer.fit_transform(X), X_imputed)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("working_memory", [None, 0])
@pytest.mark.parametrize("na", [-1, np.nan])
# Note that we use working_memory=0 to ensure that chunking is tested, even
# for a small dataset. However, it should raise a UserWarning that we ignore.
@pytest.mark.filterwarnings("ignore:adhere to working_memory")
def test_knn_imputer_with_simple_example(na, working_memory):
    """Check default KNNImputer output against means of the donor values."""
    X = np.array(
        [
            [0, na, 0, na],
            [1, 1, 1, na],
            [2, 2, na, 2],
            [3, 3, 3, 3],
            [4, 4, 4, 4],
            [5, 5, 5, 5],
            [6, 6, 6, 6],
            [na, 7, 7, 7],
        ]
    )

    # Each expected value is the plain mean of the donor entries.
    last_col_mean = np.mean(X[2:-1, -1])
    r0c1 = np.mean(X[1:6, 1])
    r0c3 = last_col_mean
    r1c3 = last_col_mean
    r2c2 = np.mean(X[[0, 1, 3, 4, 5], 2])
    r7c0 = np.mean(X[2:-1, 0])

    X_imputed = np.array(
        [
            [0, r0c1, 0, r0c3],
            [1, 1, 1, r1c3],
            [2, 2, r2c2, 2],
            [3, 3, 3, 3],
            [4, 4, 4, 4],
            [5, 5, 5, 5],
            [6, 6, 6, 6],
            [r7c0, 7, 7, 7],
        ]
    )

    with config_context(working_memory=working_memory):
        result = KNNImputer(missing_values=na).fit_transform(X)
        assert_allclose(result, X_imputed)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [-1, np.nan])
@pytest.mark.parametrize("weights", ["uniform", "distance"])
def test_knn_imputer_not_enough_valid_distances(na, weights):
    """Check imputation when samples with the needed feature are too far."""
    # Samples with needed feature has nan distance
    X_fit = np.array([[na, 11], [na, 1], [3, na]])
    expected_fit = np.array([[3, 11], [3, 1], [3, 6]])

    knn = KNNImputer(missing_values=na, n_neighbors=1, weights=weights)
    assert_allclose(knn.fit_transform(X_fit), expected_fit)

    # Transform a new sample with the already fitted imputer.
    X_new = np.array([[4, na]])
    expected_new = np.array([[4, 6]])
    assert_allclose(knn.transform(X_new), expected_new)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [-1, np.nan])
def test_knn_imputer_drops_all_nan_features(na):
    """A column that is all-missing at fit time is absent from the output."""
    knn = KNNImputer(missing_values=na, n_neighbors=1)

    # The first column is entirely missing during fit.
    X_fit = np.array([[na, 1], [na, 2]])
    assert_allclose(knn.fit_transform(X_fit), np.array([[1], [2]]))

    # At transform time only the second column is returned.
    X_new = np.array([[1, 2], [3, na]])
    assert_allclose(knn.transform(X_new), np.array([[2], [1.5]]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("working_memory", [None, 0])
@pytest.mark.parametrize("na", [-1, np.nan])
def test_knn_imputer_distance_weighted_not_enough_neighbors(na, working_memory):
    """Check distance weighting with n_neighbors of 3 and of 4."""
    X = np.array([[3, na], [2, na], [na, 4], [5, 6], [6, 8], [na, 5]])

    dist = pairwise_distances(
        X, metric="nan_euclidean", squared=False, missing_values=na
    )

    def donor_average(row, col):
        # Inverse-distance weighted average of rows 3 and 4 in column `col`,
        # with distances measured from `row`.
        return np.average(X[3:5, col], weights=1 / dist[row, 3:5])

    X_expected = np.array(
        [
            [3, donor_average(0, 1)],
            [2, donor_average(1, 1)],
            [donor_average(2, 0), 4],
            [5, 6],
            [6, 8],
            [donor_average(5, 0), 5],
        ]
    )

    with config_context(working_memory=working_memory):
        for n_neighbors in (3, 4):
            knn = KNNImputer(
                missing_values=na, n_neighbors=n_neighbors, weights="distance"
            )
            assert_allclose(knn.fit_transform(X), X_expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na, allow_nan", [(-1, False), (np.nan, True)])
def test_knn_tags(na, allow_nan):
    """The "allow_nan" tag depends on the configured missing-value marker."""
    tags = KNNImputer(missing_values=na)._get_tags()
    assert tags["allow_nan"] == allow_nan
|
||||
Reference in New Issue
Block a user