library packages
This commit is contained in:
@@ -0,0 +1,19 @@
|
||||
"""Data embedding techniques."""
|
||||
|
||||
from ._isomap import Isomap
|
||||
from ._locally_linear import LocallyLinearEmbedding, locally_linear_embedding
|
||||
from ._mds import MDS, smacof
|
||||
from ._spectral_embedding import SpectralEmbedding, spectral_embedding
|
||||
from ._t_sne import TSNE, trustworthiness
|
||||
|
||||
__all__ = [
|
||||
"locally_linear_embedding",
|
||||
"LocallyLinearEmbedding",
|
||||
"Isomap",
|
||||
"MDS",
|
||||
"smacof",
|
||||
"SpectralEmbedding",
|
||||
"spectral_embedding",
|
||||
"TSNE",
|
||||
"trustworthiness",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,295 @@
|
||||
# Author: Christopher Moody <chrisemoody@gmail.com>
|
||||
# Author: Nick Travers <nickt@squareup.com>
|
||||
# Implementation by Chris Moody & Nick Travers
|
||||
# See http://homepage.tudelft.nl/19j49/t-SNE.html for reference
|
||||
# implementations and papers describing the technique
|
||||
|
||||
|
||||
import numpy as np
|
||||
cimport numpy as cnp
|
||||
from libc.stdio cimport printf
|
||||
from libc.math cimport log
|
||||
from libc.stdlib cimport malloc, free
|
||||
from libc.time cimport clock, clock_t
|
||||
from cython.parallel cimport prange, parallel
|
||||
|
||||
from ..neighbors._quad_tree cimport _QuadTree
|
||||
|
||||
cnp.import_array()
|
||||
|
||||
|
||||
cdef char* EMPTY_STRING = ""
|
||||
|
||||
# Smallest strictly positive value that can be represented by floating
|
||||
# point numbers for different precision levels. This is useful to avoid
|
||||
# taking the log of zero when computing the KL divergence.
|
||||
cdef float FLOAT32_TINY = np.finfo(np.float32).tiny
|
||||
|
||||
# Useful to void division by zero or divergence to +inf.
|
||||
cdef float FLOAT64_EPS = np.finfo(np.float64).eps
|
||||
|
||||
# This is effectively an ifdef statement in Cython
|
||||
# It allows us to write printf debugging lines
|
||||
# and remove them at compile time
|
||||
cdef enum:
|
||||
DEBUGFLAG = 0
|
||||
|
||||
cdef float compute_gradient(float[:] val_P,
                            float[:, :] pos_reference,
                            cnp.int64_t[:] neighbors,
                            cnp.int64_t[:] indptr,
                            float[:, :] tot_force,
                            _QuadTree qt,
                            float theta,
                            int dof,
                            long start,
                            bint compute_error,
                            int num_threads) noexcept nogil:
    # Having created the tree, calculate the gradient
    # in two components, the positive and negative forces.
    # The total force per point i is
    #   tot_force[i] = pos_f[i] - neg_f[i] / sum(Q)
    # and the returned value is the (optional) KL-divergence error computed
    # by the positive pass.  `start` skips the first points (used for early
    # exaggeration / partial updates by the caller).
    cdef:
        long i, coord
        int ax
        long n_samples = pos_reference.shape[0]
        int n_dimensions = qt.n_dimensions
        clock_t t1 = 0, t2 = 0
        double sQ
        float error
        # Timing printfs are only emitted at very high verbosity.
        int take_timing = 1 if qt.verbose > 15 else 0

    if qt.verbose > 11:
        printf("[t-SNE] Allocating %li elements in force arrays\n",
               n_samples * n_dimensions * 2)
    # Flat (n_samples * n_dimensions) scratch buffers for the two force
    # components; freed at the end of this function.
    # NOTE(review): malloc results are not checked for NULL here — presumably
    # acceptable upstream, but worth confirming.
    cdef float* neg_f = <float*> malloc(sizeof(float) * n_samples * n_dimensions)
    cdef float* pos_f = <float*> malloc(sizeof(float) * n_samples * n_dimensions)

    if take_timing:
        t1 = clock()
    # Negative (repulsive) forces via the Barnes-Hut quad tree; also returns
    # the normalization sum over Q.
    sQ = compute_gradient_negative(pos_reference, neg_f, qt, dof, theta, start,
                                   num_threads)
    if take_timing:
        t2 = clock()
        printf("[t-SNE] Computing negative gradient: %e ticks\n", ((float) (t2 - t1)))

    if take_timing:
        t1 = clock()
    # Positive (attractive) forces over the nearest-neighbor graph given by
    # (neighbors, indptr) in CSR layout; also accumulates the error if asked.
    error = compute_gradient_positive(val_P, pos_reference, neighbors, indptr,
                                      pos_f, n_dimensions, dof, sQ, start,
                                      qt.verbose, compute_error, num_threads)
    if take_timing:
        t2 = clock()
        printf("[t-SNE] Computing positive gradient: %e ticks\n",
               ((float) (t2 - t1)))
    # Combine both components into the caller-provided output array.
    for i in prange(start, n_samples, nogil=True, num_threads=num_threads,
                    schedule='static'):
        for ax in range(n_dimensions):
            coord = i * n_dimensions + ax
            tot_force[i, ax] = pos_f[coord] - (neg_f[coord] / sQ)

    free(neg_f)
    free(pos_f)
    return error
|
||||
|
||||
|
||||
cdef float compute_gradient_positive(float[:] val_P,
                                     float[:, :] pos_reference,
                                     cnp.int64_t[:] neighbors,
                                     cnp.int64_t[:] indptr,
                                     float* pos_f,
                                     int n_dimensions,
                                     int dof,
                                     double sum_Q,
                                     cnp.int64_t start,
                                     int verbose,
                                     bint compute_error,
                                     int num_threads) noexcept nogil:
    # Sum over the following expression for i not equal to j
    # grad_i = p_ij (1 + ||y_i - y_j||^2)^-1 (y_i - y_j)
    # This is equivalent to compute_edge_forces in the authors' code
    # It just goes over the nearest neighbors instead of all the data points
    # (unlike the non-nearest neighbors version of `compute_gradient_positive')
    #
    # val_P / neighbors / indptr form a CSR sparse matrix of the affinities
    # p_ij restricted to nearest neighbors.  The per-point attractive force is
    # written into the flat pos_f buffer; the return value is the KL
    # divergence C (0.0 when compute_error is false).
    cdef:
        int ax
        long i, j, k
        long n_samples = indptr.shape[0] - 1
        float C = 0.0
        float dij, qij, pij
        float exponent = (dof + 1.0) / 2.0
        float float_dof = (float) (dof)
        float* buff
        clock_t t1 = 0, t2 = 0
        float dt

    if verbose > 10:
        t1 = clock()

    with nogil, parallel(num_threads=num_threads):
        # Define private buffer variables
        # (one n_dimensions-sized scratch vector per thread).
        buff = <float *> malloc(sizeof(float) * n_dimensions)

        # `C` is accumulated with `+=` inside prange, which Cython treats as a
        # thread-safe reduction variable.
        for i in prange(start, n_samples, schedule='static'):
            # Init the gradient vector
            for ax in range(n_dimensions):
                pos_f[i * n_dimensions + ax] = 0.0
            # Compute the positive interaction for the nearest neighbors
            for k in range(indptr[i], indptr[i+1]):
                j = neighbors[k]
                dij = 0.0
                pij = val_P[k]
                for ax in range(n_dimensions):
                    buff[ax] = pos_reference[i, ax] - pos_reference[j, ax]
                    dij += buff[ax] * buff[ax]
                # Student-t kernel (1 + d^2/dof)^-(dof+1)/2, specialized to
                # avoid the pow() call in the common dof == 1 case.
                qij = float_dof / (float_dof + dij)
                if dof != 1:  # i.e. exponent != 1
                    qij = qij ** exponent
                # Reuse dij as the scalar weight p_ij * q_ij of this edge.
                dij = pij * qij

                # only compute the error when needed
                if compute_error:
                    qij = qij / sum_Q
                    # Clamp both terms away from zero so log() stays finite.
                    C += pij * log(max(pij, FLOAT32_TINY) / max(qij, FLOAT32_TINY))
                for ax in range(n_dimensions):
                    pos_f[i * n_dimensions + ax] += dij * buff[ax]

        free(buff)
    if verbose > 10:
        t2 = clock()
        dt = ((float) (t2 - t1))
        printf("[t-SNE] Computed error=%1.4f in %1.1e ticks\n", C, dt)
    return C
|
||||
|
||||
|
||||
cdef double compute_gradient_negative(float[:, :] pos_reference,
                                      float* neg_f,
                                      _QuadTree qt,
                                      int dof,
                                      float theta,
                                      long start,
                                      int num_threads) noexcept nogil:
    # Barnes-Hut approximation of the repulsive forces: for each point, the
    # quad tree summarizes far-away cells (controlled by theta) so the inner
    # loop runs over O(log n) summaries instead of all n points.  Writes the
    # unnormalized negative force into the flat neg_f buffer and returns the
    # accumulated normalization term sum_Q.
    cdef:
        int ax
        int n_dimensions = qt.n_dimensions
        # Each summary entry packs: n_dimensions deltas, squared distance,
        # and the cell size -> n_dimensions + 2 floats per node.
        int offset = n_dimensions + 2
        long i, j, idx
        long n_samples = pos_reference.shape[0]
        long n = n_samples - start
        long dta = 0
        long dtb = 0
        float size, dist2s, mult
        float exponent = (dof + 1.0) / 2.0
        float float_dof = (float) (dof)
        # sum_Q is accumulated with `+=` inside prange (Cython reduction).
        double qijZ, sum_Q = 0.0
        float* force
        float* neg_force
        float* pos
        clock_t t1 = 0, t2 = 0, t3 = 0
        int take_timing = 1 if qt.verbose > 20 else 0

    with nogil, parallel(num_threads=num_threads):
        # Define thread-local buffers
        # NOTE(review): malloc results are not NULL-checked — confirm this
        # matches the project's conventions for these scratch buffers.
        summary = <float*> malloc(sizeof(float) * n * offset)
        pos = <float *> malloc(sizeof(float) * n_dimensions)
        force = <float *> malloc(sizeof(float) * n_dimensions)
        neg_force = <float *> malloc(sizeof(float) * n_dimensions)

        for i in prange(start, n_samples, schedule='static'):
            # Clear the arrays
            for ax in range(n_dimensions):
                force[ax] = 0.0
                neg_force[ax] = 0.0
                pos[ax] = pos_reference[i, ax]

            # Find which nodes are summarizing and collect their centers of mass
            # deltas, and sizes, into vectorized arrays
            if take_timing:
                t1 = clock()
            # theta*theta: summarize() compares against squared threshold.
            idx = qt.summarize(pos, summary, theta*theta)
            if take_timing:
                t2 = clock()
            # Compute the t-SNE negative force
            # for the digits dataset, walking the tree
            # is about 10-15x more expensive than the
            # following for loop
            for j in range(idx // offset):

                dist2s = summary[j * offset + n_dimensions]
                size = summary[j * offset + n_dimensions + 1]
                qijZ = float_dof / (float_dof + dist2s)  # 1/(1+dist)
                if dof != 1:  # i.e. exponent != 1
                    qijZ = qijZ ** exponent

                sum_Q += size * qijZ   # size of the node * q
                mult = size * qijZ * qijZ
                for ax in range(n_dimensions):
                    neg_force[ax] += mult * summary[j * offset + ax]
            if take_timing:
                t3 = clock()
            for ax in range(n_dimensions):
                neg_f[i * n_dimensions + ax] = neg_force[ax]
            if take_timing:
                # dta/dtb: cumulative tree-walk vs force-loop timings.
                dta += t2 - t1
                dtb += t3 - t2
        free(pos)
        free(force)
        free(neg_force)
        free(summary)
    if take_timing:
        printf("[t-SNE] Tree: %li clock ticks | ", dta)
        printf("Force computation: %li clock ticks\n", dtb)

    # Put sum_Q to machine EPSILON to avoid divisions by 0
    sum_Q = max(sum_Q, FLOAT64_EPS)
    return sum_Q
|
||||
|
||||
|
||||
def gradient(float[:] val_P,
             float[:, :] pos_output,
             cnp.int64_t[:] neighbors,
             cnp.int64_t[:] indptr,
             float[:, :] forces,
             float theta,
             int n_dimensions,
             int verbose,
             int dof=1,
             long skip_num_points=0,
             bint compute_error=1,
             int num_threads=1):
    """Compute the Barnes-Hut t-SNE gradient in-place.

    Builds a quad tree over ``pos_output``, evaluates the attractive and
    repulsive forces, and writes the result into ``forces``.  Returns the
    KL-divergence error (``nan`` when ``compute_error`` is false).
    """
    # This function is designed to be called from external Python
    # it passes the 'forces' array by reference and fills that's array
    # up in-place
    cdef float C
    cdef int n
    n = pos_output.shape[0]
    # The kernels below assume 32-bit floats; guard against wrong dtypes.
    assert val_P.itemsize == 4
    assert pos_output.itemsize == 4
    assert forces.itemsize == 4
    m = "Forces array and pos_output shapes are incompatible"
    assert n == forces.shape[0], m
    m = "Pij and pos_output shapes are incompatible"
    assert n == indptr.shape[0] - 1, m
    if verbose > 10:
        printf("[t-SNE] Initializing tree of n_dimensions %i\n", n_dimensions)
    cdef _QuadTree qt = _QuadTree(pos_output.shape[1], verbose)
    if verbose > 10:
        printf("[t-SNE] Inserting %li points\n", pos_output.shape[0])
    qt.build_tree(pos_output)
    if verbose > 10:
        # XXX: format hack to workaround lack of `const char *` type
        # in the generated C code that triggers error with gcc 4.9
        # and -Werror=format-security
        printf("[t-SNE] Computing gradient\n%s", EMPTY_STRING)

    C = compute_gradient(val_P, pos_output, neighbors, indptr, forces,
                         qt, theta, dof, skip_num_points, compute_error,
                         num_threads)

    if verbose > 10:
        # XXX: format hack to workaround lack of `const char *` type
        # in the generated C code
        # and -Werror=format-security
        printf("[t-SNE] Checking tree consistency\n%s", EMPTY_STRING)
    m = "Tree consistency failed: unexpected number of points on the tree"
    assert qt.cells[0].cumulative_size == qt.n_points, m
    if not compute_error:
        # Error was not accumulated; signal that explicitly to the caller.
        C = np.nan
    return C
|
||||
438
.venv/lib/python3.12/site-packages/sklearn/manifold/_isomap.py
Normal file
438
.venv/lib/python3.12/site-packages/sklearn/manifold/_isomap.py
Normal file
@@ -0,0 +1,438 @@
|
||||
"""Isomap for manifold learning"""
|
||||
|
||||
# Author: Jake Vanderplas -- <vanderplas@astro.washington.edu>
|
||||
# License: BSD 3 clause (C) 2011
|
||||
import warnings
|
||||
from numbers import Integral, Real
|
||||
|
||||
import numpy as np
|
||||
from scipy.sparse import issparse
|
||||
from scipy.sparse.csgraph import connected_components, shortest_path
|
||||
|
||||
from ..base import (
|
||||
BaseEstimator,
|
||||
ClassNamePrefixFeaturesOutMixin,
|
||||
TransformerMixin,
|
||||
_fit_context,
|
||||
)
|
||||
from ..decomposition import KernelPCA
|
||||
from ..metrics.pairwise import _VALID_METRICS
|
||||
from ..neighbors import NearestNeighbors, kneighbors_graph, radius_neighbors_graph
|
||||
from ..preprocessing import KernelCenterer
|
||||
from ..utils._param_validation import Interval, StrOptions
|
||||
from ..utils.graph import _fix_connected_components
|
||||
from ..utils.validation import check_is_fitted
|
||||
|
||||
|
||||
class Isomap(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
    """Isomap Embedding.

    Non-linear dimensionality reduction through Isometric Mapping

    Read more in the :ref:`User Guide <isomap>`.

    Parameters
    ----------
    n_neighbors : int or None, default=5
        Number of neighbors to consider for each point. If `n_neighbors` is an int,
        then `radius` must be `None`.

    radius : float or None, default=None
        Limiting distance of neighbors to return. If `radius` is a float,
        then `n_neighbors` must be set to `None`.

        .. versionadded:: 1.1

    n_components : int, default=2
        Number of coordinates for the manifold.

    eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'
        'auto' : Attempt to choose the most efficient solver
        for the given problem.

        'arpack' : Use Arnoldi decomposition to find the eigenvalues
        and eigenvectors.

        'dense' : Use a direct solver (i.e. LAPACK)
        for the eigenvalue decomposition.

    tol : float, default=0
        Convergence tolerance passed to arpack or lobpcg.
        not used if eigen_solver == 'dense'.

    max_iter : int, default=None
        Maximum number of iterations for the arpack solver.
        not used if eigen_solver == 'dense'.

    path_method : {'auto', 'FW', 'D'}, default='auto'
        Method to use in finding shortest path.

        'auto' : attempt to choose the best algorithm automatically.

        'FW' : Floyd-Warshall algorithm.

        'D' : Dijkstra's algorithm.

    neighbors_algorithm : {'auto', 'brute', 'kd_tree', 'ball_tree'}, \
            default='auto'
        Algorithm to use for nearest neighbors search,
        passed to neighbors.NearestNeighbors instance.

    n_jobs : int or None, default=None
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    metric : str, or callable, default="minkowski"
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
        its metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. X may be a :term:`Glossary <sparse graph>`.

        .. versionadded:: 0.22

    p : float, default=2
        Parameter for the Minkowski metric from
        sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

        .. versionadded:: 0.22

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

        .. versionadded:: 0.22

    Attributes
    ----------
    embedding_ : array-like, shape (n_samples, n_components)
        Stores the embedding vectors.

    kernel_pca_ : object
        :class:`~sklearn.decomposition.KernelPCA` object used to implement the
        embedding.

    nbrs_ : sklearn.neighbors.NearestNeighbors instance
        Stores nearest neighbors instance, including BallTree or KDtree
        if applicable.

    dist_matrix_ : array-like, shape (n_samples, n_samples)
        Stores the geodesic distance matrix of training data.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    sklearn.decomposition.PCA : Principal component analysis that is a linear
        dimensionality reduction method.
    sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using
        kernels and PCA.
    MDS : Manifold learning using multidimensional scaling.
    TSNE : T-distributed Stochastic Neighbor Embedding.
    LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding.
    SpectralEmbedding : Spectral embedding for non-linear dimensionality.

    References
    ----------

    .. [1] Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global geometric
        framework for nonlinear dimensionality reduction. Science 290 (5500)

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import Isomap
    >>> X, _ = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> embedding = Isomap(n_components=2)
    >>> X_transformed = embedding.fit_transform(X[:100])
    >>> X_transformed.shape
    (100, 2)
    """

    # Declarative parameter validation consumed by `_fit_context`.
    _parameter_constraints: dict = {
        "n_neighbors": [Interval(Integral, 1, None, closed="left"), None],
        "radius": [Interval(Real, 0, None, closed="both"), None],
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "eigen_solver": [StrOptions({"auto", "arpack", "dense"})],
        "tol": [Interval(Real, 0, None, closed="left")],
        "max_iter": [Interval(Integral, 1, None, closed="left"), None],
        "path_method": [StrOptions({"auto", "FW", "D"})],
        "neighbors_algorithm": [StrOptions({"auto", "brute", "kd_tree", "ball_tree"})],
        "n_jobs": [Integral, None],
        "p": [Interval(Real, 1, None, closed="left")],
        "metric": [StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable],
        "metric_params": [dict, None],
    }

    def __init__(
        self,
        *,
        n_neighbors=5,
        radius=None,
        n_components=2,
        eigen_solver="auto",
        tol=0,
        max_iter=None,
        path_method="auto",
        neighbors_algorithm="auto",
        n_jobs=None,
        metric="minkowski",
        p=2,
        metric_params=None,
    ):
        # Per sklearn convention, __init__ only stores parameters; all
        # validation and computation happen at fit time.
        self.n_neighbors = n_neighbors
        self.radius = radius
        self.n_components = n_components
        self.eigen_solver = eigen_solver
        self.tol = tol
        self.max_iter = max_iter
        self.path_method = path_method
        self.neighbors_algorithm = neighbors_algorithm
        self.n_jobs = n_jobs
        self.metric = metric
        self.p = p
        self.metric_params = metric_params

    def _fit_transform(self, X):
        # Core fitting routine shared by fit and fit_transform: builds the
        # neighbors graph, turns it into geodesic distances, and embeds the
        # resulting kernel with KernelPCA.
        if self.n_neighbors is not None and self.radius is not None:
            raise ValueError(
                "Both n_neighbors and radius are provided. Use"
                f" Isomap(radius={self.radius}, n_neighbors=None) if intended to use"
                " radius-based neighbors"
            )

        self.nbrs_ = NearestNeighbors(
            n_neighbors=self.n_neighbors,
            radius=self.radius,
            algorithm=self.neighbors_algorithm,
            metric=self.metric,
            p=self.p,
            metric_params=self.metric_params,
            n_jobs=self.n_jobs,
        )
        self.nbrs_.fit(X)
        # Delegate input-validation bookkeeping to the fitted NearestNeighbors.
        self.n_features_in_ = self.nbrs_.n_features_in_
        if hasattr(self.nbrs_, "feature_names_in_"):
            self.feature_names_in_ = self.nbrs_.feature_names_in_

        # set_output(transform="default") keeps the internal embedding a plain
        # ndarray regardless of any global pandas-output configuration.
        self.kernel_pca_ = KernelPCA(
            n_components=self.n_components,
            kernel="precomputed",
            eigen_solver=self.eigen_solver,
            tol=self.tol,
            max_iter=self.max_iter,
            n_jobs=self.n_jobs,
        ).set_output(transform="default")

        if self.n_neighbors is not None:
            nbg = kneighbors_graph(
                self.nbrs_,
                self.n_neighbors,
                metric=self.metric,
                p=self.p,
                metric_params=self.metric_params,
                mode="distance",
                n_jobs=self.n_jobs,
            )
        else:
            nbg = radius_neighbors_graph(
                self.nbrs_,
                radius=self.radius,
                metric=self.metric,
                p=self.p,
                metric_params=self.metric_params,
                mode="distance",
                n_jobs=self.n_jobs,
            )

        # Compute the number of connected components, and connect the different
        # components to be able to compute a shortest path between all pairs
        # of samples in the graph.
        # Similar fix to cluster._agglomerative._fix_connectivity.
        n_connected_components, labels = connected_components(nbg)
        if n_connected_components > 1:
            if self.metric == "precomputed" and issparse(X):
                raise RuntimeError(
                    "The number of connected components of the neighbors graph"
                    f" is {n_connected_components} > 1. The graph cannot be "
                    "completed with metric='precomputed', and Isomap cannot be"
                    "fitted. Increase the number of neighbors to avoid this "
                    "issue, or precompute the full distance matrix instead "
                    "of passing a sparse neighbors graph."
                )
            warnings.warn(
                (
                    "The number of connected components of the neighbors graph "
                    f"is {n_connected_components} > 1. Completing the graph to fit"
                    " Isomap might be slow. Increase the number of neighbors to "
                    "avoid this issue."
                ),
                stacklevel=2,
            )

            # use array validated by NearestNeighbors
            nbg = _fix_connected_components(
                X=self.nbrs_._fit_X,
                graph=nbg,
                n_connected_components=n_connected_components,
                component_labels=labels,
                mode="distance",
                metric=self.nbrs_.effective_metric_,
                **self.nbrs_.effective_metric_params_,
            )

        # Geodesic distances: all-pairs shortest paths over the (undirected)
        # neighbors graph.
        self.dist_matrix_ = shortest_path(nbg, method=self.path_method, directed=False)

        # Preserve a float32 input dtype through the pipeline.
        if self.nbrs_._fit_X.dtype == np.float32:
            self.dist_matrix_ = self.dist_matrix_.astype(
                self.nbrs_._fit_X.dtype, copy=False
            )

        # Isomap kernel K = -0.5 * D^2 (centering is performed inside
        # KernelPCA); computed in-place to avoid an extra copy.
        G = self.dist_matrix_**2
        G *= -0.5

        self.embedding_ = self.kernel_pca_.fit_transform(G)
        self._n_features_out = self.embedding_.shape[1]

    def reconstruction_error(self):
        """Compute the reconstruction error for the embedding.

        Returns
        -------
        reconstruction_error : float
            Reconstruction error.

        Notes
        -----
        The cost function of an isomap embedding is

        ``E = frobenius_norm[K(D) - K(D_fit)] / n_samples``

        Where D is the matrix of distances for the input data X,
        D_fit is the matrix of distances for the output embedding X_fit,
        and K is the isomap kernel:

        ``K(D) = -0.5 * (I - 1/n_samples) * D^2 * (I - 1/n_samples)``
        """
        G = -0.5 * self.dist_matrix_**2
        G_center = KernelCenterer().fit_transform(G)
        evals = self.kernel_pca_.eigenvalues_
        # ||K(D)||_F^2 minus the retained spectrum gives the discarded energy.
        return np.sqrt(np.sum(G_center**2) - np.sum(evals**2)) / G.shape[0]

    @_fit_context(
        # Isomap.metric is not validated yet
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y=None):
        """Compute the embedding vectors for data X.

        Parameters
        ----------
        X : {array-like, sparse matrix, BallTree, KDTree, NearestNeighbors}
            Sample data, shape = (n_samples, n_features), in the form of a
            numpy array, sparse matrix, precomputed tree, or NearestNeighbors
            object.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns a fitted instance of self.
        """
        self._fit_transform(X)
        return self

    @_fit_context(
        # Isomap.metric is not validated yet
        prefer_skip_nested_validation=False
    )
    def fit_transform(self, X, y=None):
        """Fit the model from data in X and transform X.

        Parameters
        ----------
        X : {array-like, sparse matrix, BallTree, KDTree}
            Training vector, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        X_new : array-like, shape (n_samples, n_components)
            X transformed in the new space.
        """
        self._fit_transform(X)
        return self.embedding_

    def transform(self, X):
        """Transform X.

        This is implemented by linking the points X into the graph of geodesic
        distances of the training data. First the `n_neighbors` nearest
        neighbors of X are found in the training data, and from these the
        shortest geodesic distances from each point in X to each point in
        the training data are computed in order to construct the kernel.
        The embedding of X is the projection of this kernel onto the
        embedding vectors of the training set.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_queries, n_features)
            If neighbors_algorithm='precomputed', X is assumed to be a
            distance matrix or a sparse graph of shape
            (n_queries, n_samples_fit).

        Returns
        -------
        X_new : array-like, shape (n_queries, n_components)
            X transformed in the new space.
        """
        check_is_fitted(self)
        if self.n_neighbors is not None:
            distances, indices = self.nbrs_.kneighbors(X, return_distance=True)
        else:
            distances, indices = self.nbrs_.radius_neighbors(X, return_distance=True)

        # Create the graph of shortest distances from X to
        # training data via the nearest neighbors of X.
        # This can be done as a single array operation, but it potentially
        # takes a lot of memory. To avoid that, use a loop:

        n_samples_fit = self.nbrs_.n_samples_fit_
        n_queries = distances.shape[0]

        # Preserve float32 inputs; everything else is computed in float64.
        if hasattr(X, "dtype") and X.dtype == np.float32:
            dtype = np.float32
        else:
            dtype = np.float64

        G_X = np.zeros((n_queries, n_samples_fit), dtype)
        for i in range(n_queries):
            # Geodesic distance from query i to every training point: minimum
            # over its neighbors of (query->neighbor + neighbor->training).
            G_X[i] = np.min(self.dist_matrix_[indices[i]] + distances[i][:, None], 0)

        # Apply the isomap kernel transform (in place) before projecting.
        G_X **= 2
        G_X *= -0.5

        return self.kernel_pca_.transform(G_X)

    def _more_tags(self):
        # Tells the common test machinery this estimator keeps float32 inputs
        # in float32 instead of upcasting.
        return {"preserves_dtype": [np.float64, np.float32]}
|
||||
@@ -0,0 +1,880 @@
|
||||
"""Locally Linear Embedding"""
|
||||
|
||||
# Author: Fabian Pedregosa -- <fabian.pedregosa@inria.fr>
|
||||
# Jake Vanderplas -- <vanderplas@astro.washington.edu>
|
||||
# License: BSD 3 clause (C) INRIA 2011
|
||||
|
||||
from numbers import Integral, Real
|
||||
|
||||
import numpy as np
|
||||
from scipy.linalg import eigh, qr, solve, svd
|
||||
from scipy.sparse import csr_matrix, eye
|
||||
from scipy.sparse.linalg import eigsh
|
||||
|
||||
from ..base import (
|
||||
BaseEstimator,
|
||||
ClassNamePrefixFeaturesOutMixin,
|
||||
TransformerMixin,
|
||||
_fit_context,
|
||||
_UnstableArchMixin,
|
||||
)
|
||||
from ..neighbors import NearestNeighbors
|
||||
from ..utils import check_array, check_random_state
|
||||
from ..utils._arpack import _init_arpack_v0
|
||||
from ..utils._param_validation import Interval, StrOptions, validate_params
|
||||
from ..utils.extmath import stable_cumsum
|
||||
from ..utils.validation import FLOAT_DTYPES, check_is_fitted
|
||||
|
||||
|
||||
def barycenter_weights(X, Y, indices, reg=1e-3):
    """Compute barycenter weights of X from Y along the first axis

    We estimate the weights to assign to each point in Y[indices] to recover
    the point X[i]. The barycenter weights sum to 1.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_dim)

    Y : array-like, shape (n_samples, n_dim)

    indices : array-like, shape (n_samples, n_dim)
        Indices of the points in Y used to compute the barycenter

    reg : float, default=1e-3
        Amount of regularization to add for the problem to be
        well-posed in the case of n_neighbors > n_dim

    Returns
    -------
    B : array-like, shape (n_samples, n_neighbors)

    Notes
    -----
    See developers note for more information.
    """
    X = check_array(X, dtype=FLOAT_DTYPES)
    Y = check_array(Y, dtype=FLOAT_DTYPES)
    indices = check_array(indices, dtype=int)

    n_samples, n_neighbors = indices.shape
    assert X.shape[0] == n_samples

    weights = np.empty((n_samples, n_neighbors), dtype=X.dtype)
    ones_rhs = np.ones(n_neighbors, dtype=X.dtype)

    # Solving the local Gram system might raise a LinalgError when the Gram
    # matrix is singular and has trace zero (regularization is skipped then).
    for sample_idx in range(n_samples):
        # Differences between the neighborhood points and the target point
        # (row-wise broadcasting).
        diffs = Y[indices[sample_idx]] - X[sample_idx]
        gram = np.dot(diffs, diffs.T)
        gram_trace = np.trace(gram)
        # Ridge term: scale with the trace when positive, otherwise fall back
        # to the raw regularization amount.
        ridge = reg * gram_trace if gram_trace > 0 else reg
        # Add the ridge to the diagonal of the (square) Gram matrix.
        gram.flat[:: n_neighbors + 1] += ridge
        local_w = solve(gram, ones_rhs, assume_a="pos")
        # Normalize so the barycenter weights sum to one.
        weights[sample_idx, :] = local_w / np.sum(local_w)
    return weights
|
||||
|
||||
|
||||
def barycenter_kneighbors_graph(X, n_neighbors, reg=1e-3, n_jobs=None):
    """Computes the barycenter weighted graph of k-Neighbors for points in X

    Parameters
    ----------
    X : {array-like, NearestNeighbors}
        Sample data, shape = (n_samples, n_features), in the form of a
        numpy array or a NearestNeighbors object.

    n_neighbors : int
        Number of neighbors for each sample.

    reg : float, default=1e-3
        Amount of regularization when solving the least-squares
        problem. Only relevant if mode='barycenter'. If None, use the
        default.

    n_jobs : int or None, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Returns
    -------
    A : sparse matrix in CSR format, shape = [n_samples, n_samples]
        A[i, j] is assigned the weight of edge that connects i to j.

    See Also
    --------
    sklearn.neighbors.kneighbors_graph
    sklearn.neighbors.radius_neighbors_graph
    """
    # Query n_neighbors + 1 because each point's nearest neighbor is itself.
    knn = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs).fit(X)
    # Reuse the array already validated by NearestNeighbors.
    X = knn._fit_X
    n_samples = knn.n_samples_fit_
    # Drop the first column (each point itself).
    ind = knn.kneighbors(X, return_distance=False)[:, 1:]
    data = barycenter_weights(X, X, ind, reg=reg)
    # CSR row pointers: every row holds exactly n_neighbors entries.
    indptr = np.arange(0, n_samples * n_neighbors + 1, n_neighbors)
    return csr_matrix((data.ravel(), ind.ravel(), indptr), shape=(n_samples, n_samples))
|
||||
|
||||
|
||||
def null_space(
    M, k, k_skip=1, eigen_solver="arpack", tol=1e-6, max_iter=100, random_state=None
):
    """Find the null space of a matrix M.

    Parameters
    ----------
    M : {array, matrix, sparse matrix, LinearOperator}
        Input covariance matrix: should be symmetric positive semi-definite.

    k : int
        Number of eigenvalues/vectors to return.

    k_skip : int, default=1
        Number of low eigenvalues to skip.

    eigen_solver : {'auto', 'arpack', 'dense'}, default='arpack'
        auto : algorithm will attempt to choose the best method for input data
        arpack : use arnoldi iteration in shift-invert mode.
            For this method, M may be a dense matrix, sparse matrix,
            or general linear operator.
            Warning: ARPACK can be unstable for some problems. It is
            best to try several random seeds in order to check results.
        dense : use standard dense matrix operations for the eigenvalue
            decomposition. For this method, M must be an array
            or matrix type. This method should be avoided for
            large problems.

    tol : float, default=1e-6
        Tolerance for 'arpack' method.
        Not used if eigen_solver=='dense'.

    max_iter : int, default=100
        Maximum number of iterations for 'arpack' method.
        Not used if eigen_solver=='dense'

    random_state : int, RandomState instance, default=None
        Determines the random number generator when ``solver`` == 'arpack'.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    null_space : ndarray of shape (M.shape[0], k)
        Eigenvectors associated with the ``k`` smallest eigenvalues,
        after skipping the ``k_skip`` smallest ones.

    error : float
        Sum of the eigenvalues of the returned eigenvectors.
    """
    # Resolve 'auto': shift-invert ARPACK only pays off for larger matrices
    # when few eigenpairs are requested; otherwise a dense solve is cheaper.
    if eigen_solver == "auto":
        eigen_solver = (
            "arpack" if M.shape[0] > 200 and k + k_skip < 10 else "dense"
        )

    if eigen_solver == "arpack":
        # Deterministic starting vector so results are reproducible for a
        # given random_state.
        v0 = _init_arpack_v0(M.shape[0], random_state)
        try:
            vals, vecs = eigsh(
                M, k + k_skip, sigma=0.0, tol=tol, maxiter=max_iter, v0=v0
            )
        except RuntimeError as e:
            raise ValueError(
                "Error in determining null-space with ARPACK. Error message: "
                "'%s'. Note that eigen_solver='arpack' can fail when the "
                "weight matrix is singular or otherwise ill-behaved. In that "
                "case, eigen_solver='dense' is recommended. See online "
                "documentation for more information." % e
            ) from e

        # Drop the k_skip smallest eigenpairs (trivial null directions).
        return vecs[:, k_skip:], np.sum(vals[k_skip:])

    if eigen_solver == "dense":
        if hasattr(M, "toarray"):
            M = M.toarray()
        # Compute only the eigenpairs with indices k_skip..k+k_skip-1
        # (eigenvalues in ascending order).
        vals, vecs = eigh(
            M, subset_by_index=(k_skip, k + k_skip - 1), overwrite_a=True
        )
        # Order the selected eigenvectors by eigenvalue magnitude.
        order = np.argsort(np.abs(vals))
        return vecs[:, order], np.sum(vals)

    raise ValueError("Unrecognized eigen_solver '%s'" % eigen_solver)
|
||||
|
||||
|
||||
def _locally_linear_embedding(
    X,
    *,
    n_neighbors,
    n_components,
    reg=1e-3,
    eigen_solver="auto",
    tol=1e-6,
    max_iter=100,
    method="standard",
    hessian_tol=1e-4,
    modified_tol=1e-12,
    random_state=None,
    n_jobs=None,
):
    """Compute the locally linear embedding of X.

    Builds the cost matrix ``M`` for the requested LLE variant
    ('standard', 'hessian', 'modified' or 'ltsa'), then extracts the
    embedding as the (approximate) null space of ``M`` via
    :func:`null_space`.

    Returns the pair ``(embedding, reconstruction_error)`` produced by
    :func:`null_space`.
    """
    # n_neighbors + 1 because each sample is its own nearest neighbor
    # and is removed from the neighbor lists below.
    nbrs = NearestNeighbors(n_neighbors=n_neighbors + 1, n_jobs=n_jobs)
    nbrs.fit(X)
    X = nbrs._fit_X

    N, d_in = X.shape

    if n_components > d_in:
        raise ValueError(
            "output dimension must be less than or equal to input dimension"
        )
    if n_neighbors >= N:
        raise ValueError(
            "Expected n_neighbors <= n_samples, but n_samples = %d, n_neighbors = %d"
            % (N, n_neighbors)
        )

    # Sparse M is only worthwhile when the eigensolver can exploit it.
    M_sparse = eigen_solver != "dense"

    if method == "standard":
        # W[i, j]: barycentric weight of neighbor j in the reconstruction
        # of sample i.
        W = barycenter_kneighbors_graph(
            nbrs, n_neighbors=n_neighbors, reg=reg, n_jobs=n_jobs
        )

        # we'll compute M = (I-W)'(I-W)
        # depending on the solver, we'll do this differently
        if M_sparse:
            M = eye(*W.shape, format=W.format) - W
            M = (M.T * M).tocsr()
        else:
            M = (W.T * W - W.T - W).toarray()
            # Add the identity on the diagonal:
            # M = W'W - W' - W + I = (W - I)'(W - I) = (I - W)'(I - W)
            M.flat[:: M.shape[0] + 1] += 1

    elif method == "hessian":
        # dp: number of independent entries in a symmetric
        # n_components x n_components Hessian (upper triangle incl. diagonal).
        dp = n_components * (n_components + 1) // 2

        if n_neighbors <= n_components + dp:
            raise ValueError(
                "for method='hessian', n_neighbors must be "
                "greater than "
                "[n_components * (n_components + 3) / 2]"
            )

        neighbors = nbrs.kneighbors(
            X, n_neighbors=n_neighbors + 1, return_distance=False
        )
        # Drop column 0: each point is its own nearest neighbor.
        neighbors = neighbors[:, 1:]

        # Yi columns: [1 | local coordinates | quadratic (Hessian) terms].
        Yi = np.empty((n_neighbors, 1 + n_components + dp), dtype=np.float64)
        Yi[:, 0] = 1

        M = np.zeros((N, N), dtype=np.float64)

        # SVD of the (n_neighbors x d_in) neighborhood is cheaper than an
        # eigendecomposition of the (n_neighbors x n_neighbors) Gram matrix
        # only when n_neighbors > d_in.
        use_svd = n_neighbors > d_in

        for i in range(N):
            # Center the neighborhood of sample i.
            Gi = X[neighbors[i]]
            Gi -= Gi.mean(0)

            # build Hessian estimator
            if use_svd:
                U = svd(Gi, full_matrices=0)[0]
            else:
                Ci = np.dot(Gi, Gi.T)
                # eigh returns ascending eigenvalues; reverse for descending.
                U = eigh(Ci)[1][:, ::-1]

            Yi[:, 1 : 1 + n_components] = U[:, :n_components]

            # Fill the quadratic terms: all products U[:, k] * U[:, l]
            # for k <= l < n_components.
            j = 1 + n_components
            for k in range(n_components):
                Yi[:, j : j + n_components - k] = U[:, k : k + 1] * U[:, k:n_components]
                j += n_components - k

            # Orthonormalize; the trailing columns of Q span the local
            # null space used to estimate the Hessian.
            Q, R = qr(Yi)

            w = Q[:, n_components + 1 :]
            S = w.sum(0)

            # Avoid division by (near-)zero column sums.
            S[np.where(abs(S) < hessian_tol)] = 1
            w /= S

            # Accumulate the local contribution into the global matrix M.
            nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
            M[nbrs_x, nbrs_y] += np.dot(w, w.T)

        if M_sparse:
            M = csr_matrix(M)

    elif method == "modified":
        if n_neighbors < n_components:
            raise ValueError("modified LLE requires n_neighbors >= n_components")

        neighbors = nbrs.kneighbors(
            X, n_neighbors=n_neighbors + 1, return_distance=False
        )
        # Drop column 0: each point is its own nearest neighbor.
        neighbors = neighbors[:, 1:]

        # find the eigenvectors and eigenvalues of each local covariance
        # matrix. We want V[i] to be a [n_neighbors x n_neighbors] matrix,
        # where the columns are eigenvectors
        V = np.zeros((N, n_neighbors, n_neighbors))
        nev = min(d_in, n_neighbors)
        evals = np.zeros([N, nev])

        # choose the most efficient way to find the eigenvectors
        use_svd = n_neighbors > d_in

        if use_svd:
            for i in range(N):
                X_nbrs = X[neighbors[i]] - X[i]
                V[i], evals[i], _ = svd(X_nbrs, full_matrices=True)
            # Singular values squared == eigenvalues of the covariance.
            evals **= 2
        else:
            for i in range(N):
                X_nbrs = X[neighbors[i]] - X[i]
                C_nbrs = np.dot(X_nbrs, X_nbrs.T)
                evi, vi = eigh(C_nbrs)
                # Reverse to descending eigenvalue order.
                evals[i] = evi[::-1]
                V[i] = vi[:, ::-1]

        # find regularized weights: this is like normal LLE.
        # because we've already computed the SVD of each covariance matrix,
        # it's faster to use this rather than np.linalg.solve
        reg = 1e-3 * evals.sum(1)

        tmp = np.dot(V.transpose(0, 2, 1), np.ones(n_neighbors))
        tmp[:, :nev] /= evals + reg[:, None]
        # Columns past nev correspond to zero eigenvalues.
        tmp[:, nev:] /= reg[:, None]

        w_reg = np.zeros((N, n_neighbors))
        for i in range(N):
            w_reg[i] = np.dot(V[i], tmp[i])
        # Normalize weights so each row sums to one.
        w_reg /= w_reg.sum(1)[:, None]

        # calculate eta: the median of the ratio of small to large eigenvalues
        # across the points. This is used to determine s_i, below
        rho = evals[:, n_components:].sum(1) / evals[:, :n_components].sum(1)
        eta = np.median(rho)

        # find s_i, the size of the "almost null space" for each point:
        # this is the size of the largest set of eigenvalues
        # such that Sum[v; v in set]/Sum[v; v not in set] < eta
        s_range = np.zeros(N, dtype=int)
        evals_cumsum = stable_cumsum(evals, 1)
        eta_range = evals_cumsum[:, -1:] / evals_cumsum[:, :-1] - 1
        for i in range(N):
            s_range[i] = np.searchsorted(eta_range[i, ::-1], eta)
        s_range += n_neighbors - nev  # number of zero eigenvalues

        # Now calculate M.
        # This is the [N x N] matrix whose null space is the desired embedding
        M = np.zeros((N, N), dtype=np.float64)
        for i in range(N):
            s_i = s_range[i]

            # select bottom s_i eigenvectors and calculate alpha
            Vi = V[i, :, n_neighbors - s_i :]
            alpha_i = np.linalg.norm(Vi.sum(0)) / np.sqrt(s_i)

            # compute Householder matrix which satisfies
            #  Hi*Vi.T*ones(n_neighbors) = alpha_i*ones(s)
            # using prescription from paper
            h = np.full(s_i, alpha_i) - np.dot(Vi.T, np.ones(n_neighbors))

            norm_h = np.linalg.norm(h)
            if norm_h < modified_tol:
                # Degenerate reflection: use the identity (h = 0).
                h *= 0
            else:
                h /= norm_h

            # Householder matrix is
            #  >> Hi = np.identity(s_i) - 2*np.outer(h,h)
            # Then the weight matrix is
            #  >> Wi = np.dot(Vi,Hi) + (1-alpha_i) * w_reg[i,:,None]
            # We do this much more efficiently:
            Wi = Vi - 2 * np.outer(np.dot(Vi, h), h) + (1 - alpha_i) * w_reg[i, :, None]

            # Update M as follows:
            # >> W_hat = np.zeros( (N,s_i) )
            # >> W_hat[neighbors[i],:] = Wi
            # >> W_hat[i] -= 1
            # >> M += np.dot(W_hat,W_hat.T)
            # We can do this much more efficiently:
            nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
            M[nbrs_x, nbrs_y] += np.dot(Wi, Wi.T)
            Wi_sum1 = Wi.sum(1)
            M[i, neighbors[i]] -= Wi_sum1
            M[neighbors[i], i] -= Wi_sum1
            M[i, i] += s_i

        if M_sparse:
            M = csr_matrix(M)

    elif method == "ltsa":
        neighbors = nbrs.kneighbors(
            X, n_neighbors=n_neighbors + 1, return_distance=False
        )
        # Drop column 0: each point is its own nearest neighbor.
        neighbors = neighbors[:, 1:]

        M = np.zeros((N, N))

        use_svd = n_neighbors > d_in

        for i in range(N):
            # Center the neighborhood of sample i.
            Xi = X[neighbors[i]]
            Xi -= Xi.mean(0)

            # compute n_components largest eigenvalues of Xi * Xi^T
            if use_svd:
                v = svd(Xi, full_matrices=True)[0]
            else:
                Ci = np.dot(Xi, Xi.T)
                v = eigh(Ci)[1][:, ::-1]

            # Gi: constant column plus the top local tangent directions.
            Gi = np.zeros((n_neighbors, n_components + 1))
            Gi[:, 1:] = v[:, :n_components]
            Gi[:, 0] = 1.0 / np.sqrt(n_neighbors)

            GiGiT = np.dot(Gi, Gi.T)

            # Local alignment contribution: I - Gi Gi^T on the neighborhood.
            nbrs_x, nbrs_y = np.meshgrid(neighbors[i], neighbors[i])
            M[nbrs_x, nbrs_y] -= GiGiT
            M[neighbors[i], neighbors[i]] += 1

    # k_skip=1: skip the trivial constant eigenvector of M.
    return null_space(
        M,
        n_components,
        k_skip=1,
        eigen_solver=eigen_solver,
        tol=tol,
        max_iter=max_iter,
        random_state=random_state,
    )
|
||||
|
||||
|
||||
@validate_params(
    {
        "X": ["array-like", NearestNeighbors],
        "n_neighbors": [Interval(Integral, 1, None, closed="left")],
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "reg": [Interval(Real, 0, None, closed="left")],
        "eigen_solver": [StrOptions({"auto", "arpack", "dense"})],
        "tol": [Interval(Real, 0, None, closed="left")],
        "max_iter": [Interval(Integral, 1, None, closed="left")],
        "method": [StrOptions({"standard", "hessian", "modified", "ltsa"})],
        "hessian_tol": [Interval(Real, 0, None, closed="left")],
        "modified_tol": [Interval(Real, 0, None, closed="left")],
        "random_state": ["random_state"],
        "n_jobs": [None, Integral],
    },
    prefer_skip_nested_validation=True,
)
def locally_linear_embedding(
    X,
    *,
    n_neighbors,
    n_components,
    reg=1e-3,
    eigen_solver="auto",
    tol=1e-6,
    max_iter=100,
    method="standard",
    hessian_tol=1e-4,
    modified_tol=1e-12,
    random_state=None,
    n_jobs=None,
):
    """Perform a Locally Linear Embedding analysis on the data.

    Read more in the :ref:`User Guide <locally_linear_embedding>`.

    Parameters
    ----------
    X : {array-like, NearestNeighbors}
        Sample data, shape = (n_samples, n_features), given either as a
        numpy array or as an already-fitted NearestNeighbors object.

    n_neighbors : int
        Number of neighbors to consider for each point.

    n_components : int
        Number of coordinates for the manifold.

    reg : float, default=1e-3
        Regularization constant; multiplies the trace of the local
        covariance matrix of the distances.

    eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'
        auto : the algorithm attempts to choose the best method for the
            input data.

        arpack : use Arnoldi iteration in shift-invert mode. For this
            method, M may be a dense matrix, a sparse matrix, or a general
            linear operator.
            Warning: ARPACK can be unstable for some problems. It is best
            to try several random seeds in order to check results.

        dense : use standard dense matrix operations for the eigenvalue
            decomposition. For this method, M must be an array or matrix
            type. This method should be avoided for large problems.

    tol : float, default=1e-6
        Tolerance for the 'arpack' method.
        Not used if eigen_solver=='dense'.

    max_iter : int, default=100
        Maximum number of iterations for the arpack solver.

    method : {'standard', 'hessian', 'modified', 'ltsa'}, default='standard'
        standard : the standard locally linear embedding algorithm.
                   See reference [1]_.
        hessian  : the Hessian eigenmap method. Requires
                   n_neighbors > n_components * (1 + (n_components + 1) / 2.
                   See reference [2]_.
        modified : the modified locally linear embedding algorithm.
                   See reference [3]_.
        ltsa     : the local tangent space alignment algorithm.
                   See reference [4]_.

    hessian_tol : float, default=1e-4
        Tolerance for the Hessian eigenmapping method.
        Only used if method == 'hessian'.

    modified_tol : float, default=1e-12
        Tolerance for the modified LLE method.
        Only used if method == 'modified'.

    random_state : int, RandomState instance, default=None
        Determines the random number generator when ``solver`` == 'arpack'.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    n_jobs : int or None, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Returns
    -------
    Y : ndarray of shape (n_samples, n_components)
        Embedding vectors.

    squared_error : float
        Reconstruction error for the embedding vectors. Equivalent to
        ``norm(Y - W Y, 'fro')**2``, where W are the reconstruction weights.

    References
    ----------

    .. [1] Roweis, S. & Saul, L. Nonlinear dimensionality reduction
        by locally linear embedding.  Science 290:2323 (2000).
    .. [2] Donoho, D. & Grimes, C. Hessian eigenmaps: Locally
        linear embedding techniques for high-dimensional data.
        Proc Natl Acad Sci U S A.  100:5591 (2003).
    .. [3] `Zhang, Z. & Wang, J. MLLE: Modified Locally Linear
        Embedding Using Multiple Weights.
        <https://citeseerx.ist.psu.edu/doc_view/pid/0b060fdbd92cbcc66b383bcaa9ba5e5e624d7ee3>`_
    .. [4] Zhang, Z. & Zha, H. Principal manifolds and nonlinear
        dimensionality reduction via tangent space alignment.
        Journal of Shanghai Univ.  8:406 (2004)

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import locally_linear_embedding
    >>> X, _ = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> embedding, _ = locally_linear_embedding(X[:100],n_neighbors=5, n_components=2)
    >>> embedding.shape
    (100, 2)
    """
    # Parameter validation is handled by the decorator; this public entry
    # point only forwards the validated arguments to the private worker.
    options = dict(
        X=X,
        n_neighbors=n_neighbors,
        n_components=n_components,
        reg=reg,
        eigen_solver=eigen_solver,
        tol=tol,
        max_iter=max_iter,
        method=method,
        hessian_tol=hessian_tol,
        modified_tol=modified_tol,
        random_state=random_state,
        n_jobs=n_jobs,
    )
    return _locally_linear_embedding(**options)
|
||||
|
||||
|
||||
class LocallyLinearEmbedding(
    ClassNamePrefixFeaturesOutMixin,
    TransformerMixin,
    _UnstableArchMixin,
    BaseEstimator,
):
    """Locally Linear Embedding.

    Read more in the :ref:`User Guide <locally_linear_embedding>`.

    Parameters
    ----------
    n_neighbors : int, default=5
        Number of neighbors to consider for each point.

    n_components : int, default=2
        Number of coordinates for the manifold.

    reg : float, default=1e-3
        Regularization constant, multiplies the trace of the local covariance
        matrix of the distances.

    eigen_solver : {'auto', 'arpack', 'dense'}, default='auto'
        The solver used to compute the eigenvectors. The available options are:

        - `'auto'` : algorithm will attempt to choose the best method for input
          data.
        - `'arpack'` : use arnoldi iteration in shift-invert mode. For this
          method, M may be a dense matrix, sparse matrix, or general linear
          operator.
        - `'dense'`  : use standard dense matrix operations for the eigenvalue
          decomposition. For this method, M must be an array or matrix type.
          This method should be avoided for large problems.

        .. warning::
            ARPACK can be unstable for some problems.  It is best to try several
            random seeds in order to check results.

    tol : float, default=1e-6
        Tolerance for 'arpack' method
        Not used if eigen_solver=='dense'.

    max_iter : int, default=100
        Maximum number of iterations for the arpack solver.
        Not used if eigen_solver=='dense'.

    method : {'standard', 'hessian', 'modified', 'ltsa'}, default='standard'
        - `standard`: use the standard locally linear embedding algorithm. see
          reference [1]_
        - `hessian`: use the Hessian eigenmap method. This method requires
          ``n_neighbors > n_components * (1 + (n_components + 1) / 2``. see
          reference [2]_
        - `modified`: use the modified locally linear embedding algorithm.
          see reference [3]_
        - `ltsa`: use local tangent space alignment algorithm. see
          reference [4]_

    hessian_tol : float, default=1e-4
        Tolerance for Hessian eigenmapping method.
        Only used if ``method == 'hessian'``.

    modified_tol : float, default=1e-12
        Tolerance for modified LLE method.
        Only used if ``method == 'modified'``.

    neighbors_algorithm : {'auto', 'brute', 'kd_tree', 'ball_tree'}, \
            default='auto'
        Algorithm to use for nearest neighbors search, passed to
        :class:`~sklearn.neighbors.NearestNeighbors` instance.

    random_state : int, RandomState instance, default=None
        Determines the random number generator when
        ``eigen_solver`` == 'arpack'. Pass an int for reproducible results
        across multiple function calls. See :term:`Glossary <random_state>`.

    n_jobs : int or None, default=None
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    embedding_ : array-like, shape [n_samples, n_components]
        Stores the embedding vectors.

    reconstruction_error_ : float
        Reconstruction error associated with `embedding_`.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    nbrs_ : NearestNeighbors object
        Stores the nearest neighbors instance, including the BallTree or
        KDTree if applicable.

    See Also
    --------
    SpectralEmbedding : Spectral embedding for non-linear dimensionality
        reduction.
    TSNE : Distributed Stochastic Neighbor Embedding.

    References
    ----------

    .. [1] Roweis, S. & Saul, L. Nonlinear dimensionality reduction
        by locally linear embedding.  Science 290:2323 (2000).
    .. [2] Donoho, D. & Grimes, C. Hessian eigenmaps: Locally
        linear embedding techniques for high-dimensional data.
        Proc Natl Acad Sci U S A.  100:5591 (2003).
    .. [3] `Zhang, Z. & Wang, J. MLLE: Modified Locally Linear
        Embedding Using Multiple Weights.
        <https://citeseerx.ist.psu.edu/doc_view/pid/0b060fdbd92cbcc66b383bcaa9ba5e5e624d7ee3>`_
    .. [4] Zhang, Z. & Zha, H. Principal manifolds and nonlinear
        dimensionality reduction via tangent space alignment.
        Journal of Shanghai Univ.  8:406 (2004)

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import LocallyLinearEmbedding
    >>> X, _ = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> embedding = LocallyLinearEmbedding(n_components=2)
    >>> X_transformed = embedding.fit_transform(X[:100])
    >>> X_transformed.shape
    (100, 2)
    """

    _parameter_constraints: dict = {
        "n_neighbors": [Interval(Integral, 1, None, closed="left")],
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "reg": [Interval(Real, 0, None, closed="left")],
        "eigen_solver": [StrOptions({"auto", "arpack", "dense"})],
        "tol": [Interval(Real, 0, None, closed="left")],
        "max_iter": [Interval(Integral, 1, None, closed="left")],
        "method": [StrOptions({"standard", "hessian", "modified", "ltsa"})],
        "hessian_tol": [Interval(Real, 0, None, closed="left")],
        "modified_tol": [Interval(Real, 0, None, closed="left")],
        "neighbors_algorithm": [StrOptions({"auto", "brute", "kd_tree", "ball_tree"})],
        "random_state": ["random_state"],
        "n_jobs": [None, Integral],
    }

    def __init__(
        self,
        *,
        n_neighbors=5,
        n_components=2,
        reg=1e-3,
        eigen_solver="auto",
        tol=1e-6,
        max_iter=100,
        method="standard",
        hessian_tol=1e-4,
        modified_tol=1e-12,
        neighbors_algorithm="auto",
        random_state=None,
        n_jobs=None,
    ):
        self.n_neighbors = n_neighbors
        self.n_components = n_components
        self.reg = reg
        self.eigen_solver = eigen_solver
        self.tol = tol
        self.max_iter = max_iter
        self.method = method
        self.hessian_tol = hessian_tol
        self.modified_tol = modified_tol
        self.random_state = random_state
        self.neighbors_algorithm = neighbors_algorithm
        self.n_jobs = n_jobs

    def _fit_transform(self, X):
        """Validate X, fit the neighbors model and compute the embedding.

        Sets ``nbrs_``, ``embedding_``, ``reconstruction_error_`` and
        ``_n_features_out`` as side effects.
        """
        self.nbrs_ = NearestNeighbors(
            n_neighbors=self.n_neighbors,
            algorithm=self.neighbors_algorithm,
            n_jobs=self.n_jobs,
        )

        random_state = check_random_state(self.random_state)
        X = self._validate_data(X, dtype=float)
        self.nbrs_.fit(X)
        # Pass the fitted NearestNeighbors object (not X) so the worker can
        # reuse the neighbor index instead of rebuilding it.
        self.embedding_, self.reconstruction_error_ = _locally_linear_embedding(
            X=self.nbrs_,
            n_neighbors=self.n_neighbors,
            n_components=self.n_components,
            eigen_solver=self.eigen_solver,
            tol=self.tol,
            max_iter=self.max_iter,
            method=self.method,
            hessian_tol=self.hessian_tol,
            modified_tol=self.modified_tol,
            random_state=random_state,
            reg=self.reg,
            n_jobs=self.n_jobs,
        )
        self._n_features_out = self.embedding_.shape[1]

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Compute the embedding vectors for data X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training set.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            Fitted `LocallyLinearEmbedding` class instance.
        """
        self._fit_transform(X)
        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def fit_transform(self, X, y=None):
        """Compute the embedding vectors for data X and transform X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training set.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        X_new : array-like, shape (n_samples, n_components)
            Embedding of the training data in the low-dimensional space.
        """
        self._fit_transform(X)
        return self.embedding_

    def transform(self, X):
        """
        Transform new points into embedding space.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Sample points to project into the embedding space.

        Returns
        -------
        X_new : ndarray of shape (n_samples, n_components)
            Projection of the new points into the embedding space.

        Notes
        -----
        Because of scaling performed by this method, it is discouraged to use
        it together with methods that are not scale-invariant (like SVMs).
        """
        check_is_fitted(self)

        X = self._validate_data(X, reset=False)
        # Reconstruct each new point as a barycenter of its nearest training
        # neighbors, then apply the same weights to the training embedding.
        ind = self.nbrs_.kneighbors(
            X, n_neighbors=self.n_neighbors, return_distance=False
        )
        weights = barycenter_weights(X, self.nbrs_._fit_X, ind, reg=self.reg)
        X_new = np.empty((X.shape[0], self.n_components))
        for i in range(X.shape[0]):
            X_new[i] = np.dot(self.embedding_[ind[i]].T, weights[i])
        return X_new
|
||||
656
.venv/lib/python3.12/site-packages/sklearn/manifold/_mds.py
Normal file
656
.venv/lib/python3.12/site-packages/sklearn/manifold/_mds.py
Normal file
@@ -0,0 +1,656 @@
|
||||
"""
|
||||
Multi-dimensional Scaling (MDS).
|
||||
"""
|
||||
|
||||
# author: Nelle Varoquaux <nelle.varoquaux@gmail.com>
|
||||
# License: BSD
|
||||
|
||||
import warnings
|
||||
from numbers import Integral, Real
|
||||
|
||||
import numpy as np
|
||||
from joblib import effective_n_jobs
|
||||
|
||||
from ..base import BaseEstimator, _fit_context
|
||||
from ..isotonic import IsotonicRegression
|
||||
from ..metrics import euclidean_distances
|
||||
from ..utils import check_array, check_random_state, check_symmetric
|
||||
from ..utils._param_validation import Interval, StrOptions, validate_params
|
||||
from ..utils.parallel import Parallel, delayed
|
||||
|
||||
|
||||
def _smacof_single(
    dissimilarities,
    metric=True,
    n_components=2,
    init=None,
    max_iter=300,
    verbose=0,
    eps=1e-3,
    random_state=None,
    normalized_stress=False,
):
    """Computes multidimensional scaling using SMACOF algorithm.

    Parameters
    ----------
    dissimilarities : ndarray of shape (n_samples, n_samples)
        Pairwise dissimilarities between the points. Must be symmetric.

    metric : bool, default=True
        Compute metric or nonmetric SMACOF algorithm.
        When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as
        missing values.

    n_components : int, default=2
        Number of dimensions in which to immerse the dissimilarities. If an
        ``init`` array is provided, this option is overridden and the shape of
        ``init`` is used to determine the dimensionality of the embedding
        space.

    init : ndarray of shape (n_samples, n_components), default=None
        Starting configuration of the embedding to initialize the algorithm. By
        default, the algorithm is initialized with a randomly chosen array.

    max_iter : int, default=300
        Maximum number of iterations of the SMACOF algorithm for a single run.

    verbose : int, default=0
        Level of verbosity.

    eps : float, default=1e-3
        Relative tolerance with respect to stress at which to declare
        convergence. The value of `eps` should be tuned separately depending
        on whether or not `normalized_stress` is being used.

    random_state : int, RandomState instance or None, default=None
        Determines the random number generator used to initialize the centers.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    normalized_stress : bool, default=False
        Whether use and return normed stress value (Stress-1) instead of raw
        stress calculated by default. Only supported in non-metric MDS. The
        caller must ensure that if `normalized_stress=True` then `metric=False`

        .. versionadded:: 1.2

    Returns
    -------
    X : ndarray of shape (n_samples, n_components)
        Coordinates of the points in a ``n_components``-space.

    stress : float
        The final value of the stress (sum of squared distance of the
        disparities and the distances for all constrained points).
        If `normalized_stress=True`, and `metric=False` returns Stress-1.
        A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good,
        0.1 fair, and 0.2 poor [1]_.

    n_iter : int
        The number of iterations corresponding to the best stress.

    References
    ----------
    .. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
       Psychometrika, 29 (1964)

    .. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric
       hypothesis" Kruskal, J. Psychometrika, 29, (1964)

    .. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
       Groenen P. Springer Series in Statistics (1997)
    """
    dissimilarities = check_symmetric(dissimilarities, raise_exception=True)

    n_samples = dissimilarities.shape[0]
    random_state = check_random_state(random_state)

    # Flatten the strict upper triangle of the dissimilarity matrix; zeros
    # are treated as missing values in the non-metric case.
    sim_flat = ((1 - np.tri(n_samples)) * dissimilarities).ravel()
    sim_flat_w = sim_flat[sim_flat != 0]
    if init is None:
        # Randomly choose initial configuration
        X = random_state.uniform(size=n_samples * n_components)
        X = X.reshape((n_samples, n_components))
    else:
        # the shape of the provided init overrides n_components
        n_components = init.shape[1]
        if n_samples != init.shape[0]:
            raise ValueError(
                "init matrix should be of shape (%d, %d)" % (n_samples, n_components)
            )
        X = init

    old_stress = None
    ir = IsotonicRegression()
    for it in range(max_iter):
        # Compute distance and monotonic regression
        dis = euclidean_distances(X)

        if metric:
            # Metric MDS: fit distances to the raw dissimilarities directly.
            disparities = dissimilarities
        else:
            dis_flat = dis.ravel()
            # dissimilarities with 0 are considered as missing values
            dis_flat_w = dis_flat[sim_flat != 0]

            # Compute the disparities using a monotonic regression
            disparities_flat = ir.fit_transform(sim_flat_w, dis_flat_w)
            disparities = dis_flat.copy()
            disparities[sim_flat != 0] = disparities_flat
            disparities = disparities.reshape((n_samples, n_samples))
            # Rescale so the sum of squared disparities matches the number
            # of constrained pairs.
            disparities *= np.sqrt(
                (n_samples * (n_samples - 1) / 2) / (disparities**2).sum()
            )

        # Compute stress
        stress = ((dis.ravel() - disparities.ravel()) ** 2).sum() / 2
        if normalized_stress:
            # Stress-1 (Kruskal): sqrt(raw stress / sum of squared disparities).
            stress = np.sqrt(stress / ((disparities.ravel() ** 2).sum() / 2))
        # Update X using the Guttman transform
        # Guard against division by zero in the ratio below.
        dis[dis == 0] = 1e-5
        ratio = disparities / dis
        B = -ratio
        B[np.arange(len(B)), np.arange(len(B))] += ratio.sum(axis=1)
        X = 1.0 / n_samples * np.dot(B, X)

        # Scale factor for the convergence test: sum of the point norms of
        # the updated configuration.
        dis = np.sqrt((X**2).sum(axis=1)).sum()
        if verbose >= 2:
            print("it: %d, stress %s" % (it, stress))
        if old_stress is not None:
            # Converged when the scale-normalized stress decrease drops
            # below eps (old_stress already holds the previous stress/dis).
            if (old_stress - stress / dis) < eps:
                if verbose:
                    print("breaking at iteration %d with stress %s" % (it, stress))
                break
        old_stress = stress / dis

    return X, stress, it + 1
|
||||
|
||||
|
||||
@validate_params(
    {
        "dissimilarities": ["array-like"],
        "metric": ["boolean"],
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "init": ["array-like", None],
        "n_init": [Interval(Integral, 1, None, closed="left")],
        "n_jobs": [Integral, None],
        "max_iter": [Interval(Integral, 1, None, closed="left")],
        "verbose": ["verbose"],
        "eps": [Interval(Real, 0, None, closed="left")],
        "random_state": ["random_state"],
        "return_n_iter": ["boolean"],
        "normalized_stress": ["boolean", StrOptions({"auto"})],
    },
    prefer_skip_nested_validation=True,
)
def smacof(
    dissimilarities,
    *,
    metric=True,
    n_components=2,
    init=None,
    n_init=8,
    n_jobs=None,
    max_iter=300,
    verbose=0,
    eps=1e-3,
    random_state=None,
    return_n_iter=False,
    normalized_stress="auto",
):
    """Compute multidimensional scaling using the SMACOF algorithm.

    The SMACOF (Scaling by MAjorizing a COmplicated Function) algorithm is a
    multidimensional scaling algorithm which minimizes an objective function
    (the *stress*) using a majorization technique. Stress majorization, also
    known as the Guttman Transform, guarantees a monotone convergence of
    stress, and is more powerful than traditional techniques such as gradient
    descent.

    The SMACOF algorithm for metric MDS can be summarized by the following
    steps:

    1. Set an initial start configuration, randomly or not.
    2. Compute the stress
    3. Compute the Guttman Transform
    4. Iterate 2 and 3 until convergence.

    The nonmetric algorithm adds a monotonic regression step before computing
    the stress.

    Parameters
    ----------
    dissimilarities : array-like of shape (n_samples, n_samples)
        Pairwise dissimilarities between the points. Must be symmetric.

    metric : bool, default=True
        Compute metric or nonmetric SMACOF algorithm.
        When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as
        missing values.

    n_components : int, default=2
        Number of dimensions in which to immerse the dissimilarities. If an
        ``init`` array is provided, this option is overridden and the shape of
        ``init`` is used to determine the dimensionality of the embedding
        space.

    init : array-like of shape (n_samples, n_components), default=None
        Starting configuration of the embedding to initialize the algorithm. By
        default, the algorithm is initialized with a randomly chosen array.

    n_init : int, default=8
        Number of times the SMACOF algorithm will be run with different
        initializations. The final results will be the best output of the runs,
        determined by the run with the smallest final stress. If ``init`` is
        provided, this option is overridden and a single run is performed.

    n_jobs : int, default=None
        The number of jobs to use for the computation. If multiple
        initializations are used (``n_init``), each run of the algorithm is
        computed in parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    max_iter : int, default=300
        Maximum number of iterations of the SMACOF algorithm for a single run.

    verbose : int, default=0
        Level of verbosity.

    eps : float, default=1e-3
        Relative tolerance with respect to stress at which to declare
        convergence. The value of `eps` should be tuned separately depending
        on whether or not `normalized_stress` is being used.

    random_state : int, RandomState instance or None, default=None
        Determines the random number generator used to initialize the centers.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    return_n_iter : bool, default=False
        Whether or not to return the number of iterations.

    normalized_stress : bool or "auto", default="auto"
        Whether to use and return normed stress value (Stress-1) instead of raw
        stress calculated by default. Only supported in non-metric MDS.

        .. versionadded:: 1.2

        .. versionchanged:: 1.4
           The default value changed from `False` to `"auto"` in version 1.4.

    Returns
    -------
    X : ndarray of shape (n_samples, n_components)
        Coordinates of the points in a ``n_components``-space.

    stress : float
        The final value of the stress (sum of squared distance of the
        disparities and the distances for all constrained points).
        If `normalized_stress=True`, and `metric=False` returns Stress-1.
        A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good,
        0.1 fair, and 0.2 poor [1]_.

    n_iter : int
        The number of iterations corresponding to the best stress. Returned
        only if ``return_n_iter`` is set to ``True``.

    References
    ----------
    .. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
       Psychometrika, 29 (1964)

    .. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric
       hypothesis" Kruskal, J. Psychometrika, 29, (1964)

    .. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
       Groenen P. Springer Series in Statistics (1997)

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.manifold import smacof
    >>> from sklearn.metrics import euclidean_distances
    >>> X = np.array([[0, 1, 2], [1, 0, 3],[2, 3, 0]])
    >>> dissimilarities = euclidean_distances(X)
    >>> mds_result, stress = smacof(dissimilarities, n_components=2, random_state=42)
    >>> mds_result
    array([[ 0.05... -1.07... ],
           [ 1.74..., -0.75...],
           [-1.79..., 1.83...]])
    >>> stress
    np.float64(0.0012...)
    """
    dissimilarities = check_array(dissimilarities)
    random_state = check_random_state(random_state)

    # Resolve the "auto" sentinel: normalized (Stress-1) only makes sense for
    # non-metric MDS, so it follows `metric` by default.
    if normalized_stress == "auto":
        normalized_stress = not metric

    if normalized_stress and metric:
        raise ValueError(
            "Normalized stress is not supported for metric MDS. Either set"
            " `normalized_stress=False` or use `metric=False`."
        )
    if hasattr(init, "__array__"):
        # Copy so that repeated runs cannot mutate the caller's array.
        init = np.asarray(init).copy()
        if n_init != 1:
            warnings.warn(
                "Explicit initial positions passed: "
                "performing only one init of the MDS instead of %d" % n_init
            )
            # A fixed starting configuration makes extra restarts redundant:
            # every run would converge from the same point.
            n_init = 1

    best_pos, best_stress = None, None

    if effective_n_jobs(n_jobs) == 1:
        # Sequential path: reuse the single RandomState across restarts so the
        # runs differ in their random initializations.
        for it in range(n_init):
            pos, stress, n_iter_ = _smacof_single(
                dissimilarities,
                metric=metric,
                n_components=n_components,
                init=init,
                max_iter=max_iter,
                verbose=verbose,
                eps=eps,
                random_state=random_state,
                normalized_stress=normalized_stress,
            )
            # Keep the run with the lowest final stress.
            if best_stress is None or stress < best_stress:
                best_stress = stress
                best_pos = pos.copy()
                best_iter = n_iter_
    else:
        # Parallel path: derive one integer seed per run so results are
        # reproducible regardless of worker scheduling.
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=max(verbose - 1, 0))(
            delayed(_smacof_single)(
                dissimilarities,
                metric=metric,
                n_components=n_components,
                init=init,
                max_iter=max_iter,
                verbose=verbose,
                eps=eps,
                random_state=seed,
                normalized_stress=normalized_stress,
            )
            for seed in seeds
        )
        positions, stress, n_iters = zip(*results)
        best = np.argmin(stress)
        best_stress = stress[best]
        best_pos = positions[best]
        best_iter = n_iters[best]

    if return_n_iter:
        return best_pos, best_stress, best_iter
    else:
        return best_pos, best_stress
|
||||
|
||||
|
||||
class MDS(BaseEstimator):
    """Multidimensional scaling.

    Read more in the :ref:`User Guide <multidimensional_scaling>`.

    Parameters
    ----------
    n_components : int, default=2
        Number of dimensions in which to immerse the dissimilarities.

    metric : bool, default=True
        If ``True``, perform metric MDS; otherwise, perform nonmetric MDS.
        When ``False`` (i.e. non-metric MDS), dissimilarities with 0 are considered as
        missing values.

    n_init : int, default=4
        Number of times the SMACOF algorithm will be run with different
        initializations. The final results will be the best output of the runs,
        determined by the run with the smallest final stress.

    max_iter : int, default=300
        Maximum number of iterations of the SMACOF algorithm for a single run.

    verbose : int, default=0
        Level of verbosity.

    eps : float, default=1e-3
        Relative tolerance with respect to stress at which to declare
        convergence. The value of `eps` should be tuned separately depending
        on whether or not `normalized_stress` is being used.

    n_jobs : int, default=None
        The number of jobs to use for the computation. If multiple
        initializations are used (``n_init``), each run of the algorithm is
        computed in parallel.

        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    random_state : int, RandomState instance or None, default=None
        Determines the random number generator used to initialize the centers.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    dissimilarity : {'euclidean', 'precomputed'}, default='euclidean'
        Dissimilarity measure to use:

        - 'euclidean':
            Pairwise Euclidean distances between points in the dataset.

        - 'precomputed':
            Pre-computed dissimilarities are passed directly to ``fit`` and
            ``fit_transform``.

    normalized_stress : bool or "auto", default="auto"
        Whether to use and return normed stress value (Stress-1) instead of raw
        stress calculated by default. Only supported in non-metric MDS.

        .. versionadded:: 1.2

        .. versionchanged:: 1.4
           The default value changed from `False` to `"auto"` in version 1.4.

    Attributes
    ----------
    embedding_ : ndarray of shape (n_samples, n_components)
        Stores the position of the dataset in the embedding space.

    stress_ : float
        The final value of the stress (sum of squared distance of the
        disparities and the distances for all constrained points).
        If `normalized_stress=True`, and `metric=False` returns Stress-1.
        A value of 0 indicates "perfect" fit, 0.025 excellent, 0.05 good,
        0.1 fair, and 0.2 poor [1]_.

    dissimilarity_matrix_ : ndarray of shape (n_samples, n_samples)
        Pairwise dissimilarities between the points. Symmetric matrix that:

        - either uses a custom dissimilarity matrix by setting `dissimilarity`
          to 'precomputed';
        - or constructs a dissimilarity matrix from data using
          Euclidean distances.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_iter_ : int
        The number of iterations corresponding to the best stress.

    See Also
    --------
    sklearn.decomposition.PCA : Principal component analysis that is a linear
        dimensionality reduction method.
    sklearn.decomposition.KernelPCA : Non-linear dimensionality reduction using
        kernels and PCA.
    TSNE : T-distributed Stochastic Neighbor Embedding.
    Isomap : Manifold learning based on Isometric Mapping.
    LocallyLinearEmbedding : Manifold learning using Locally Linear Embedding.
    SpectralEmbedding : Spectral embedding for non-linear dimensionality.

    References
    ----------
    .. [1] "Nonmetric multidimensional scaling: a numerical method" Kruskal, J.
       Psychometrika, 29 (1964)

    .. [2] "Multidimensional scaling by optimizing goodness of fit to a nonmetric
       hypothesis" Kruskal, J. Psychometrika, 29, (1964)

    .. [3] "Modern Multidimensional Scaling - Theory and Applications" Borg, I.;
       Groenen P. Springer Series in Statistics (1997)

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import MDS
    >>> X, _ = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> embedding = MDS(n_components=2, normalized_stress='auto')
    >>> X_transformed = embedding.fit_transform(X[:100])
    >>> X_transformed.shape
    (100, 2)

    For a more detailed example of usage, see
    :ref:`sphx_glr_auto_examples_manifold_plot_mds.py`.

    For a comparison of manifold learning techniques, see
    :ref:`sphx_glr_auto_examples_manifold_plot_compare_methods.py`.
    """

    # Declarative constraints checked by `_fit_context` before `fit_transform`
    # runs; they mirror the `validate_params` spec on the `smacof` function.
    _parameter_constraints: dict = {
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "metric": ["boolean"],
        "n_init": [Interval(Integral, 1, None, closed="left")],
        "max_iter": [Interval(Integral, 1, None, closed="left")],
        "verbose": ["verbose"],
        "eps": [Interval(Real, 0.0, None, closed="left")],
        "n_jobs": [None, Integral],
        "random_state": ["random_state"],
        "dissimilarity": [StrOptions({"euclidean", "precomputed"})],
        "normalized_stress": ["boolean", StrOptions({"auto"})],
    }

    def __init__(
        self,
        n_components=2,
        *,
        metric=True,
        n_init=4,
        max_iter=300,
        verbose=0,
        eps=1e-3,
        n_jobs=None,
        random_state=None,
        dissimilarity="euclidean",
        normalized_stress="auto",
    ):
        # Per scikit-learn convention, hyper-parameters are stored unmodified;
        # validation happens later through `_fit_context`.
        self.n_components = n_components
        self.dissimilarity = dissimilarity
        self.metric = metric
        self.n_init = n_init
        self.max_iter = max_iter
        self.eps = eps
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.normalized_stress = normalized_stress

    def _more_tags(self):
        # With a precomputed dissimilarity, `fit` expects a square matrix of
        # pairwise values, which is what the "pairwise" estimator tag signals.
        return {"pairwise": self.dissimilarity == "precomputed"}

    def fit(self, X, y=None, init=None):
        """
        Compute the position of the points in the embedding space.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            Input data. If ``dissimilarity=='precomputed'``, the input should
            be the dissimilarity matrix.

        y : Ignored
            Not used, present for API consistency by convention.

        init : ndarray of shape (n_samples, n_components), default=None
            Starting configuration of the embedding to initialize the SMACOF
            algorithm. By default, the algorithm is initialized with a randomly
            chosen array.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # Delegate to fit_transform, which sets all fitted attributes; the
        # returned embedding is simply discarded here.
        self.fit_transform(X, init=init)
        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def fit_transform(self, X, y=None, init=None):
        """
        Fit the data from `X`, and returns the embedded coordinates.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            Input data. If ``dissimilarity=='precomputed'``, the input should
            be the dissimilarity matrix.

        y : Ignored
            Not used, present for API consistency by convention.

        init : ndarray of shape (n_samples, n_components), default=None
            Starting configuration of the embedding to initialize the SMACOF
            algorithm. By default, the algorithm is initialized with a randomly
            chosen array.

        Returns
        -------
        X_new : ndarray of shape (n_samples, n_components)
            X transformed in the new space.
        """
        X = self._validate_data(X)
        # A square input with dissimilarity='euclidean' often means the user
        # actually passed a precomputed matrix; warn instead of guessing.
        if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed":
            warnings.warn(
                "The MDS API has changed. ``fit`` now constructs an"
                " dissimilarity matrix from data. To use a custom "
                "dissimilarity matrix, set "
                "``dissimilarity='precomputed'``."
            )

        # Either take X as the dissimilarity matrix directly, or derive it
        # from the features; no other values pass parameter validation.
        if self.dissimilarity == "precomputed":
            self.dissimilarity_matrix_ = X
        elif self.dissimilarity == "euclidean":
            self.dissimilarity_matrix_ = euclidean_distances(X)

        # All of the optimization work is delegated to the functional API.
        self.embedding_, self.stress_, self.n_iter_ = smacof(
            self.dissimilarity_matrix_,
            metric=self.metric,
            n_components=self.n_components,
            init=init,
            n_init=self.n_init,
            n_jobs=self.n_jobs,
            max_iter=self.max_iter,
            verbose=self.verbose,
            eps=self.eps,
            random_state=self.random_state,
            return_n_iter=True,
            normalized_stress=self.normalized_stress,
        )

        return self.embedding_
|
||||
@@ -0,0 +1,778 @@
|
||||
"""Spectral Embedding."""
|
||||
|
||||
# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
|
||||
# Wei LI <kuantkid@gmail.com>
|
||||
# License: BSD 3 clause
|
||||
|
||||
|
||||
import warnings
|
||||
from numbers import Integral, Real
|
||||
|
||||
import numpy as np
|
||||
from scipy import sparse
|
||||
from scipy.linalg import eigh
|
||||
from scipy.sparse.csgraph import connected_components
|
||||
from scipy.sparse.linalg import eigsh, lobpcg
|
||||
|
||||
from ..base import BaseEstimator, _fit_context
|
||||
from ..metrics.pairwise import rbf_kernel
|
||||
from ..neighbors import NearestNeighbors, kneighbors_graph
|
||||
from ..utils import (
|
||||
check_array,
|
||||
check_random_state,
|
||||
check_symmetric,
|
||||
)
|
||||
from ..utils._arpack import _init_arpack_v0
|
||||
from ..utils._param_validation import Interval, StrOptions, validate_params
|
||||
from ..utils.extmath import _deterministic_vector_sign_flip
|
||||
from ..utils.fixes import laplacian as csgraph_laplacian
|
||||
from ..utils.fixes import parse_version, sp_version
|
||||
|
||||
|
||||
def _graph_connected_component(graph, node_id):
|
||||
"""Find the largest graph connected components that contains one
|
||||
given node.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
graph : array-like of shape (n_samples, n_samples)
|
||||
Adjacency matrix of the graph, non-zero weight means an edge
|
||||
between the nodes.
|
||||
|
||||
node_id : int
|
||||
The index of the query node of the graph.
|
||||
|
||||
Returns
|
||||
-------
|
||||
connected_components_matrix : array-like of shape (n_samples,)
|
||||
An array of bool value indicating the indexes of the nodes
|
||||
belonging to the largest connected components of the given query
|
||||
node.
|
||||
"""
|
||||
n_node = graph.shape[0]
|
||||
if sparse.issparse(graph):
|
||||
# speed up row-wise access to boolean connection mask
|
||||
graph = graph.tocsr()
|
||||
connected_nodes = np.zeros(n_node, dtype=bool)
|
||||
nodes_to_explore = np.zeros(n_node, dtype=bool)
|
||||
nodes_to_explore[node_id] = True
|
||||
for _ in range(n_node):
|
||||
last_num_component = connected_nodes.sum()
|
||||
np.logical_or(connected_nodes, nodes_to_explore, out=connected_nodes)
|
||||
if last_num_component >= connected_nodes.sum():
|
||||
break
|
||||
indices = np.where(nodes_to_explore)[0]
|
||||
nodes_to_explore.fill(False)
|
||||
for i in indices:
|
||||
if sparse.issparse(graph):
|
||||
# scipy not yet implemented 1D sparse slices; can be changed back to
|
||||
# `neighbors = graph[i].toarray().ravel()` once implemented
|
||||
neighbors = graph[[i], :].toarray().ravel()
|
||||
else:
|
||||
neighbors = graph[i]
|
||||
np.logical_or(nodes_to_explore, neighbors, out=nodes_to_explore)
|
||||
return connected_nodes
|
||||
|
||||
|
||||
def _graph_is_connected(graph):
    """Return whether the graph is connected (True) or Not (False).

    Parameters
    ----------
    graph : {array-like, sparse matrix} of shape (n_samples, n_samples)
        Adjacency matrix of the graph, non-zero weight means an edge
        between the nodes.

    Returns
    -------
    is_connected : bool
        True means the graph is fully connected and False means not.
    """
    if not sparse.issparse(graph):
        # Dense graph: flood-fill from node 0 and check whether every node
        # was reached.
        return _graph_connected_component(graph, 0).sum() == graph.shape[0]

    # Before Scipy 1.11.3, `connected_components` only supports 32-bit indices.
    # PR: https://github.com/scipy/scipy/pull/18913
    # First integration in 1.11.3: https://github.com/scipy/scipy/pull/19279
    # TODO(jjerphan): Once SciPy 1.11.3 is the minimum supported version, use
    # `accept_large_sparse=True`.
    accept_large_sparse = sp_version >= parse_version("1.11.3")
    graph = check_array(
        graph, accept_sparse=True, accept_large_sparse=accept_large_sparse
    )
    # Sparse graph: count the connected components directly.
    n_connected_components, _ = connected_components(graph)
    return n_connected_components == 1
|
||||
|
||||
|
||||
def _set_diag(laplacian, value, norm_laplacian):
|
||||
"""Set the diagonal of the laplacian matrix and convert it to a
|
||||
sparse format well suited for eigenvalue decomposition.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
laplacian : {ndarray, sparse matrix}
|
||||
The graph laplacian.
|
||||
|
||||
value : float
|
||||
The value of the diagonal.
|
||||
|
||||
norm_laplacian : bool
|
||||
Whether the value of the diagonal should be changed or not.
|
||||
|
||||
Returns
|
||||
-------
|
||||
laplacian : {array, sparse matrix}
|
||||
An array of matrix in a form that is well suited to fast
|
||||
eigenvalue decomposition, depending on the band width of the
|
||||
matrix.
|
||||
"""
|
||||
n_nodes = laplacian.shape[0]
|
||||
# We need all entries in the diagonal to values
|
||||
if not sparse.issparse(laplacian):
|
||||
if norm_laplacian:
|
||||
laplacian.flat[:: n_nodes + 1] = value
|
||||
else:
|
||||
laplacian = laplacian.tocoo()
|
||||
if norm_laplacian:
|
||||
diag_idx = laplacian.row == laplacian.col
|
||||
laplacian.data[diag_idx] = value
|
||||
# If the matrix has a small number of diagonals (as in the
|
||||
# case of structured matrices coming from images), the
|
||||
# dia format might be best suited for matvec products:
|
||||
n_diags = np.unique(laplacian.row - laplacian.col).size
|
||||
if n_diags <= 7:
|
||||
# 3 or less outer diagonals on each side
|
||||
laplacian = laplacian.todia()
|
||||
else:
|
||||
# csr has the fastest matvec and is thus best suited to
|
||||
# arpack
|
||||
laplacian = laplacian.tocsr()
|
||||
return laplacian
|
||||
|
||||
|
||||
@validate_params(
    {
        "adjacency": ["array-like", "sparse matrix"],
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None],
        "random_state": ["random_state"],
        "eigen_tol": [Interval(Real, 0, None, closed="left"), StrOptions({"auto"})],
        "norm_laplacian": ["boolean"],
        "drop_first": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def spectral_embedding(
    adjacency,
    *,
    n_components=8,
    eigen_solver=None,
    random_state=None,
    eigen_tol="auto",
    norm_laplacian=True,
    drop_first=True,
):
    """Project the sample on the first eigenvectors of the graph Laplacian.

    The adjacency matrix is used to compute a normalized graph Laplacian
    whose spectrum (especially the eigenvectors associated to the
    smallest eigenvalues) has an interpretation in terms of minimal
    number of cuts necessary to split the graph into comparably sized
    components.

    This embedding can also 'work' even if the ``adjacency`` variable is
    not strictly the adjacency matrix of a graph but more generally
    an affinity or similarity matrix between samples (for instance the
    heat kernel of a euclidean distance matrix or a k-NN matrix).

    However care must be taken to always make the affinity matrix symmetric
    so that the eigenvector decomposition works as expected.

    Note : Laplacian Eigenmaps is the actual algorithm implemented here.

    Read more in the :ref:`User Guide <spectral_embedding>`.

    Parameters
    ----------
    adjacency : {array-like, sparse graph} of shape (n_samples, n_samples)
        The adjacency matrix of the graph to embed.

    n_components : int, default=8
        The dimension of the projection subspace.

    eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities. If None, then ``'arpack'`` is
        used.

    random_state : int, RandomState instance or None, default=None
        A pseudo random number generator used for the initialization
        of the lobpcg eigen vectors decomposition when `eigen_solver ==
        'amg'`, and for the K-Means initialization. Use an int to make
        the results deterministic across calls (See
        :term:`Glossary <random_state>`).

        .. note::
            When using `eigen_solver == 'amg'`,
            it is necessary to also fix the global numpy seed with
            `np.random.seed(int)` to get deterministic results. See
            https://github.com/pyamg/pyamg/issues/139 for further
            information.

    eigen_tol : float, default="auto"
        Stopping criterion for eigendecomposition of the Laplacian matrix.
        If `eigen_tol="auto"` then the passed tolerance will depend on the
        `eigen_solver`:

        - If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
        - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
          `eigen_tol=None` which configures the underlying `lobpcg` solver to
          automatically resolve the value according to their heuristics. See,
          :func:`scipy.sparse.linalg.lobpcg` for details.

        Note that when using `eigen_solver="amg"` values of `tol<1e-5` may lead
        to convergence issues and should be avoided.

        .. versionadded:: 1.2
           Added 'auto' option.

    norm_laplacian : bool, default=True
        If True, then compute symmetric normalized Laplacian.

    drop_first : bool, default=True
        Whether to drop the first eigenvector. For spectral embedding, this
        should be True as the first eigenvector should be constant vector for
        connected graph, but for spectral clustering, this should be kept as
        False to retain the first eigenvector.

    Returns
    -------
    embedding : ndarray of shape (n_samples, n_components)
        The reduced samples.

    Notes
    -----
    Spectral Embedding (Laplacian Eigenmaps) is most useful when the graph
    has one connected component. If the graph has many components, the first
    few eigenvectors will simply uncover the connected components of the graph.

    References
    ----------
    * https://en.wikipedia.org/wiki/LOBPCG

    * :doi:`"Toward the Optimal Preconditioned Eigensolver: Locally Optimal
      Block Preconditioned Conjugate Gradient Method",
      Andrew V. Knyazev
      <10.1137/S1064827500366124>`

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.neighbors import kneighbors_graph
    >>> from sklearn.manifold import spectral_embedding
    >>> X, _ = load_digits(return_X_y=True)
    >>> X = X[:100]
    >>> affinity_matrix = kneighbors_graph(
    ...     X, n_neighbors=int(X.shape[0] / 10), include_self=True
    ... )
    >>> # make the matrix symmetric
    >>> affinity_matrix = 0.5 * (affinity_matrix + affinity_matrix.T)
    >>> embedding = spectral_embedding(affinity_matrix, n_components=2, random_state=42)
    >>> embedding.shape
    (100, 2)
    """
    # Resolve `random_state` here so that the private worker below can be
    # called internally with an already-validated RandomState instance.
    random_state = check_random_state(random_state)

    # Thin public wrapper: all of the actual work happens in the private
    # `_spectral_embedding` helper.
    return _spectral_embedding(
        adjacency,
        n_components=n_components,
        eigen_solver=eigen_solver,
        random_state=random_state,
        eigen_tol=eigen_tol,
        norm_laplacian=norm_laplacian,
        drop_first=drop_first,
    )
|
||||
|
||||
|
||||
def _spectral_embedding(
    adjacency,
    *,
    n_components=8,
    eigen_solver=None,
    random_state=None,
    eigen_tol="auto",
    norm_laplacian=True,
    drop_first=True,
):
    """Project a graph onto the first eigenvectors of its Laplacian.

    Internal worker shared by :func:`spectral_embedding` and
    :class:`SpectralEmbedding.fit`. Three eigensolver strategies are
    supported: ARPACK shift-invert (default), LOBPCG, and LOBPCG with an
    algebraic-multigrid (pyamg) preconditioner.

    Parameters
    ----------
    adjacency : {array-like, sparse matrix} of shape (n_nodes, n_nodes)
        Symmetric affinity/adjacency matrix of the graph.
    n_components : int, default=8
        Dimension of the projection subspace.
    eigen_solver : {'arpack', 'lobpcg', 'amg'} or None, default=None
        Eigenvalue decomposition strategy; None means 'arpack'.
    random_state : RandomState instance or None, default=None
        Used to seed the ARPACK starting vector / LOBPCG initial guess.
        NOTE: callers are expected to pass an already-validated
        RandomState (the public wrapper calls `check_random_state`).
    eigen_tol : float or 'auto', default='auto'
        Solver stopping tolerance; 'auto' maps to 0 for arpack and to
        None (solver heuristic) for lobpcg/amg.
    norm_laplacian : bool, default=True
        Use the symmetric normalized Laplacian; eigenvectors are then
        rescaled by 1/sqrt(degree) to recover the random-walk embedding.
    drop_first : bool, default=True
        Drop the first (trivial, constant) eigenvector.

    Returns
    -------
    embedding : ndarray of shape (n_nodes, n_components)
        The spectral embedding of the graph nodes.
    """
    adjacency = check_symmetric(adjacency)

    # Fail early (before any expensive work) if the optional pyamg
    # dependency is missing for the 'amg' strategy.
    if eigen_solver == "amg":
        try:
            from pyamg import smoothed_aggregation_solver
        except ImportError as e:
            raise ValueError(
                "The eigen_solver was set to 'amg', but pyamg is not available."
            ) from e

    if eigen_solver is None:
        eigen_solver = "arpack"

    n_nodes = adjacency.shape[0]
    # Whether to drop the first eigenvector: request one extra component
    # so n_components are left after the trivial one is discarded.
    if drop_first:
        n_components = n_components + 1

    if not _graph_is_connected(adjacency):
        warnings.warn(
            "Graph is not fully connected, spectral embedding may not work as expected."
        )

    # dd is the diagonal (degree) vector returned alongside the Laplacian;
    # it is used below to undo the D^1/2 scaling of the normalized case.
    laplacian, dd = csgraph_laplacian(
        adjacency, normed=norm_laplacian, return_diag=True
    )
    # NOTE: due to operator precedence this condition reads as
    # arpack OR (not-lobpcg AND small/dense problem), i.e. 'amg' is also
    # routed here for small or dense problems.
    if (
        eigen_solver == "arpack"
        or eigen_solver != "lobpcg"
        and (not sparse.issparse(laplacian) or n_nodes < 5 * n_components)
    ):
        # lobpcg used with eigen_solver='amg' has bugs for low number of nodes
        # for details see the source code in scipy:
        # https://github.com/scipy/scipy/blob/v0.11.0/scipy/sparse/linalg/eigen
        # /lobpcg/lobpcg.py#L237
        # or matlab:
        # https://www.mathworks.com/matlabcentral/fileexchange/48-lobpcg-m
        laplacian = _set_diag(laplacian, 1, norm_laplacian)

        # Here we'll use shift-invert mode for fast eigenvalues
        # (see https://docs.scipy.org/doc/scipy/reference/tutorial/arpack.html
        # for a short explanation of what this means)
        # Because the normalized Laplacian has eigenvalues between 0 and 2,
        # I - L has eigenvalues between -1 and 1. ARPACK is most efficient
        # when finding eigenvalues of largest magnitude (keyword which='LM')
        # and when these eigenvalues are very large compared to the rest.
        # For very large, very sparse graphs, I - L can have many, many
        # eigenvalues very near 1.0. This leads to slow convergence. So
        # instead, we'll use ARPACK's shift-invert mode, asking for the
        # eigenvalues near 1.0. This effectively spreads-out the spectrum
        # near 1.0 and leads to much faster convergence: potentially an
        # orders-of-magnitude speedup over simply using keyword which='LA'
        # in standard mode.
        try:
            # We are computing the opposite of the laplacian inplace so as
            # to spare a memory allocation of a possibly very large array
            tol = 0 if eigen_tol == "auto" else eigen_tol
            laplacian *= -1
            v0 = _init_arpack_v0(laplacian.shape[0], random_state)
            laplacian = check_array(
                laplacian, accept_sparse="csr", accept_large_sparse=False
            )
            _, diffusion_map = eigsh(
                laplacian, k=n_components, sigma=1.0, which="LM", tol=tol, v0=v0
            )
            # Reverse the column order so eigenvectors come out sorted from
            # the smallest Laplacian eigenvalue upwards.
            embedding = diffusion_map.T[n_components::-1]
            if norm_laplacian:
                # recover u = D^-1/2 x from the eigenvector output x
                embedding = embedding / dd
        except RuntimeError:
            # When submatrices are exactly singular, an LU decomposition
            # in arpack fails. We fallback to lobpcg
            eigen_solver = "lobpcg"
            # Revert the laplacian to its opposite to have lobpcg work
            laplacian *= -1

    elif eigen_solver == "amg":
        # Use AMG to get a preconditioner and speed up the eigenvalue
        # problem.
        if not sparse.issparse(laplacian):
            warnings.warn("AMG works better for sparse matrices")
        laplacian = check_array(
            laplacian, dtype=[np.float64, np.float32], accept_sparse=True
        )
        laplacian = _set_diag(laplacian, 1, norm_laplacian)

        # The Laplacian matrix is always singular, having at least one zero
        # eigenvalue, corresponding to the trivial eigenvector, which is a
        # constant. Using a singular matrix for preconditioning may result in
        # random failures in LOBPCG and is not supported by the existing
        # theory:
        #     see https://doi.org/10.1007/s10208-015-9297-1
        # Shift the Laplacian so its diagononal is not all ones. The shift
        # does change the eigenpairs however, so we'll feed the shifted
        # matrix to the solver and afterward set it back to the original.
        diag_shift = 1e-5 * sparse.eye(laplacian.shape[0])
        laplacian += diag_shift
        if hasattr(sparse, "csr_array") and isinstance(laplacian, sparse.csr_array):
            # `pyamg` does not work with `csr_array` and we need to convert it to a
            # `csr_matrix` object.
            laplacian = sparse.csr_matrix(laplacian)
        ml = smoothed_aggregation_solver(check_array(laplacian, accept_sparse="csr"))
        # Undo the shift: the *unshifted* Laplacian is what lobpcg solves.
        laplacian -= diag_shift

        M = ml.aspreconditioner()
        # Create initial approximation X to eigenvectors
        X = random_state.standard_normal(size=(laplacian.shape[0], n_components + 1))
        # Seed the first column with the known trivial eigenvector direction.
        X[:, 0] = dd.ravel()
        X = X.astype(laplacian.dtype)

        tol = None if eigen_tol == "auto" else eigen_tol
        _, diffusion_map = lobpcg(laplacian, X, M=M, tol=tol, largest=False)
        embedding = diffusion_map.T
        if norm_laplacian:
            # recover u = D^-1/2 x from the eigenvector output x
            embedding = embedding / dd
        if embedding.shape[0] == 1:
            raise ValueError

    # Reached either because it was requested, or as the ARPACK fallback
    # after a RuntimeError above (eigen_solver was reassigned there).
    if eigen_solver == "lobpcg":
        laplacian = check_array(
            laplacian, dtype=[np.float64, np.float32], accept_sparse=True
        )
        if n_nodes < 5 * n_components + 1:
            # see note above under arpack why lobpcg has problems with small
            # number of nodes
            # lobpcg will fallback to eigh, so we short circuit it
            if sparse.issparse(laplacian):
                laplacian = laplacian.toarray()
            _, diffusion_map = eigh(laplacian, check_finite=False)
            embedding = diffusion_map.T[:n_components]
            if norm_laplacian:
                # recover u = D^-1/2 x from the eigenvector output x
                embedding = embedding / dd
        else:
            laplacian = _set_diag(laplacian, 1, norm_laplacian)
            # We increase the number of eigenvectors requested, as lobpcg
            # doesn't behave well in low dimension and create initial
            # approximation X to eigenvectors
            X = random_state.standard_normal(
                size=(laplacian.shape[0], n_components + 1)
            )
            X[:, 0] = dd.ravel()
            X = X.astype(laplacian.dtype)
            tol = None if eigen_tol == "auto" else eigen_tol
            _, diffusion_map = lobpcg(
                laplacian, X, tol=tol, largest=False, maxiter=2000
            )
            embedding = diffusion_map.T[:n_components]
            if norm_laplacian:
                # recover u = D^-1/2 x from the eigenvector output x
                embedding = embedding / dd
            if embedding.shape[0] == 1:
                raise ValueError

    # Fix the sign of each eigenvector deterministically so repeated runs
    # with the same seed produce identical embeddings.
    embedding = _deterministic_vector_sign_flip(embedding)
    if drop_first:
        return embedding[1:n_components].T
    else:
        return embedding[:n_components].T
|
||||
|
||||
|
||||
class SpectralEmbedding(BaseEstimator):
    """Spectral embedding for non-linear dimensionality reduction.

    Forms an affinity matrix given by the specified function and
    applies spectral decomposition to the corresponding graph laplacian.
    The resulting transformation is given by the value of the
    eigenvectors for each data point.

    Note : Laplacian Eigenmaps is the actual algorithm implemented here.

    Read more in the :ref:`User Guide <spectral_embedding>`.

    Parameters
    ----------
    n_components : int, default=2
        The dimension of the projected subspace.

    affinity : {'nearest_neighbors', 'rbf', 'precomputed', \
            'precomputed_nearest_neighbors'} or callable, \
            default='nearest_neighbors'
        How to construct the affinity matrix.
         - 'nearest_neighbors' : construct the affinity matrix by computing a
           graph of nearest neighbors.
         - 'rbf' : construct the affinity matrix by computing a radial basis
           function (RBF) kernel.
         - 'precomputed' : interpret ``X`` as a precomputed affinity matrix.
         - 'precomputed_nearest_neighbors' : interpret ``X`` as a sparse graph
           of precomputed nearest neighbors, and constructs the affinity matrix
           by selecting the ``n_neighbors`` nearest neighbors.
         - callable : use passed in function as affinity
           the function takes in data matrix (n_samples, n_features)
           and return affinity matrix (n_samples, n_samples).

    gamma : float, default=None
        Kernel coefficient for rbf kernel. If None, gamma will be set to
        1/n_features.

    random_state : int, RandomState instance or None, default=None
        A pseudo random number generator used for the initialization
        of the lobpcg eigen vectors decomposition when `eigen_solver ==
        'amg'`, and for the K-Means initialization. Use an int to make
        the results deterministic across calls (See
        :term:`Glossary <random_state>`).

        .. note::
            When using `eigen_solver == 'amg'`,
            it is necessary to also fix the global numpy seed with
            `np.random.seed(int)` to get deterministic results. See
            https://github.com/pyamg/pyamg/issues/139 for further
            information.

    eigen_solver : {'arpack', 'lobpcg', 'amg'}, default=None
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems.
        If None, then ``'arpack'`` is used.

    eigen_tol : float, default="auto"
        Stopping criterion for eigendecomposition of the Laplacian matrix.
        If `eigen_tol="auto"` then the passed tolerance will depend on the
        `eigen_solver`:

        - If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
        - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
          `eigen_tol=None` which configures the underlying `lobpcg` solver to
          automatically resolve the value according to their heuristics. See,
          :func:`scipy.sparse.linalg.lobpcg` for details.

        Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`
        values of `tol<1e-5` may lead to convergence issues and should be
        avoided.

        .. versionadded:: 1.2

    n_neighbors : int, default=None
        Number of nearest neighbors for nearest_neighbors graph building.
        If None, n_neighbors will be set to max(n_samples/10, 1).

    n_jobs : int, default=None
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    embedding_ : ndarray of shape (n_samples, n_components)
        Spectral embedding of the training matrix.

    affinity_matrix_ : ndarray of shape (n_samples, n_samples)
        Affinity_matrix constructed from samples or precomputed.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_neighbors_ : int
        Number of nearest neighbors effectively used.

    See Also
    --------
    Isomap : Non-linear dimensionality reduction through Isometric Mapping.

    References
    ----------

    - :doi:`A Tutorial on Spectral Clustering, 2007
      Ulrike von Luxburg
      <10.1007/s11222-007-9033-z>`

    - `On Spectral Clustering: Analysis and an algorithm, 2001
      Andrew Y. Ng, Michael I. Jordan, Yair Weiss
      <https://citeseerx.ist.psu.edu/doc_view/pid/796c5d6336fc52aa84db575fb821c78918b65f58>`_

    - :doi:`Normalized cuts and image segmentation, 2000
      Jianbo Shi, Jitendra Malik
      <10.1109/34.868688>`

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.manifold import SpectralEmbedding
    >>> X, _ = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> embedding = SpectralEmbedding(n_components=2)
    >>> X_transformed = embedding.fit_transform(X[:100])
    >>> X_transformed.shape
    (100, 2)
    """

    # Declarative parameter validation, enforced by the `_fit_context`
    # decorator on `fit` before any computation happens.
    _parameter_constraints: dict = {
        "n_components": [Interval(Integral, 1, None, closed="left")],
        "affinity": [
            StrOptions(
                {
                    "nearest_neighbors",
                    "rbf",
                    "precomputed",
                    "precomputed_nearest_neighbors",
                },
            ),
            callable,
        ],
        "gamma": [Interval(Real, 0, None, closed="left"), None],
        "random_state": ["random_state"],
        "eigen_solver": [StrOptions({"arpack", "lobpcg", "amg"}), None],
        "eigen_tol": [Interval(Real, 0, None, closed="left"), StrOptions({"auto"})],
        "n_neighbors": [Interval(Integral, 1, None, closed="left"), None],
        "n_jobs": [None, Integral],
    }

    def __init__(
        self,
        n_components=2,
        *,
        affinity="nearest_neighbors",
        gamma=None,
        random_state=None,
        eigen_solver=None,
        eigen_tol="auto",
        n_neighbors=None,
        n_jobs=None,
    ):
        # Parameters are stored verbatim (sklearn convention): all
        # validation and derived values are deferred to `fit`.
        self.n_components = n_components
        self.affinity = affinity
        self.gamma = gamma
        self.random_state = random_state
        self.eigen_solver = eigen_solver
        self.eigen_tol = eigen_tol
        self.n_neighbors = n_neighbors
        self.n_jobs = n_jobs

    def _more_tags(self):
        # The 'pairwise' estimator tag is True when X is interpreted as a
        # precomputed (n_samples, n_samples) matrix rather than features.
        return {
            "pairwise": self.affinity
            in [
                "precomputed",
                "precomputed_nearest_neighbors",
            ]
        }

    def _get_affinity_matrix(self, X, Y=None):
        """Calculate the affinity matrix from data
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples
            and `n_features` is the number of features.

            If affinity is "precomputed"
            X : array-like of shape (n_samples, n_samples),
            Interpret X as precomputed adjacency graph computed from
            samples.

        Y: Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        affinity_matrix of shape (n_samples, n_samples)
            Also stored on the estimator as ``affinity_matrix_``.
        """
        if self.affinity == "precomputed":
            self.affinity_matrix_ = X
            return self.affinity_matrix_
        if self.affinity == "precomputed_nearest_neighbors":
            estimator = NearestNeighbors(
                n_neighbors=self.n_neighbors, n_jobs=self.n_jobs, metric="precomputed"
            ).fit(X)
            connectivity = estimator.kneighbors_graph(X=X, mode="connectivity")
            # Symmetrize: keep an edge if it appears in either direction.
            self.affinity_matrix_ = 0.5 * (connectivity + connectivity.T)
            return self.affinity_matrix_
        if self.affinity == "nearest_neighbors":
            if sparse.issparse(X):
                warnings.warn(
                    "Nearest neighbors affinity currently does "
                    "not support sparse input, falling back to "
                    "rbf affinity"
                )
                # NOTE: this overwrites the public `affinity` parameter on
                # the instance so the `rbf` branch below is taken.
                self.affinity = "rbf"
            else:
                self.n_neighbors_ = (
                    self.n_neighbors
                    if self.n_neighbors is not None
                    else max(int(X.shape[0] / 10), 1)
                )
                self.affinity_matrix_ = kneighbors_graph(
                    X, self.n_neighbors_, include_self=True, n_jobs=self.n_jobs
                )
                # currently only symmetric affinity_matrix supported
                self.affinity_matrix_ = 0.5 * (
                    self.affinity_matrix_ + self.affinity_matrix_.T
                )
                return self.affinity_matrix_
        if self.affinity == "rbf":
            # Default bandwidth 1/n_features when gamma is not given.
            self.gamma_ = self.gamma if self.gamma is not None else 1.0 / X.shape[1]
            self.affinity_matrix_ = rbf_kernel(X, gamma=self.gamma_)
            return self.affinity_matrix_
        # Remaining case: `affinity` is a user-supplied callable.
        self.affinity_matrix_ = self.affinity(X)
        return self.affinity_matrix_

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the model from data in X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples
            and `n_features` is the number of features.

            If affinity is "precomputed"
            X : {array-like, sparse matrix}, shape (n_samples, n_samples),
            Interpret X as precomputed adjacency graph computed from
            samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        # A spectral embedding needs at least two samples to be meaningful.
        X = self._validate_data(X, accept_sparse="csr", ensure_min_samples=2)

        random_state = check_random_state(self.random_state)

        affinity_matrix = self._get_affinity_matrix(X)
        self.embedding_ = _spectral_embedding(
            affinity_matrix,
            n_components=self.n_components,
            eigen_solver=self.eigen_solver,
            eigen_tol=self.eigen_tol,
            random_state=random_state,
        )
        return self

    def fit_transform(self, X, y=None):
        """Fit the model from data in X and transform X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples
            and `n_features` is the number of features.

            If affinity is "precomputed"
            X : {array-like, sparse matrix} of shape (n_samples, n_samples),
            Interpret X as precomputed adjacency graph computed from
            samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        X_new : array-like of shape (n_samples, n_components)
            Spectral embedding of the training matrix.
        """
        self.fit(X)
        return self.embedding_
|
||||
1214
.venv/lib/python3.12/site-packages/sklearn/manifold/_t_sne.py
Normal file
1214
.venv/lib/python3.12/site-packages/sklearn/manifold/_t_sne.py
Normal file
File diff suppressed because it is too large
Load Diff
Binary file not shown.
120
.venv/lib/python3.12/site-packages/sklearn/manifold/_utils.pyx
Normal file
120
.venv/lib/python3.12/site-packages/sklearn/manifold/_utils.pyx
Normal file
@@ -0,0 +1,120 @@
|
||||
import numpy as np
|
||||
|
||||
from libc cimport math
|
||||
from libc.math cimport INFINITY
|
||||
|
||||
from ..utils._typedefs cimport float32_t, float64_t
|
||||
|
||||
|
||||
cdef float EPSILON_DBL = 1e-8
|
||||
cdef float PERPLEXITY_TOLERANCE = 1e-5
|
||||
|
||||
|
||||
# TODO: have this function support float32 and float64 and preserve inputs' dtypes.
|
||||
def _binary_search_perplexity(
        const float32_t[:, :] sqdistances,
        float desired_perplexity,
        int verbose):
    """Binary search for sigmas of conditional Gaussians.

    For each sample i, finds the Gaussian precision ``beta_i`` such that the
    conditional distribution p_{j|i} over its neighbors has (approximately)
    the desired perplexity, i.e. entropy log(desired_perplexity).

    This approximation reduces the computational complexity from O(N^2) to
    O(uN) when only the u nearest neighbors are supplied.

    Parameters
    ----------
    sqdistances : ndarray of shape (n_samples, n_neighbors), dtype=np.float32
        Distances between training samples and their k nearest neighbors.
        When using the exact method, this is a square (n_samples, n_samples)
        distance matrix. The TSNE default metric is "euclidean" which is
        interpreted as squared euclidean distance.

    desired_perplexity : float
        Desired perplexity (2^entropy) of the conditional Gaussians.

    verbose : int
        Verbosity level.

    Returns
    -------
    P : ndarray of shape (n_samples, n_samples), dtype=np.float64
        Probabilities of conditional Gaussian distributions p_i|j.
    """
    # Maximum number of binary search steps
    cdef long n_steps = 100

    cdef long n_samples = sqdistances.shape[0]
    cdef long n_neighbors = sqdistances.shape[1]
    # True when a k-NN matrix (not the full square matrix) was passed; in
    # that case column j is never sample i itself, so no self-exclusion.
    cdef int using_neighbors = n_neighbors < n_samples
    # Precisions of conditional Gaussian distributions
    cdef double beta
    cdef double beta_min
    cdef double beta_max
    cdef double beta_sum = 0.0

    # Use log scale
    cdef double desired_entropy = math.log(desired_perplexity)
    cdef double entropy_diff

    cdef double entropy
    cdef double sum_Pi
    cdef double sum_disti_Pi
    cdef long i, j, l

    # This array is later used as a 32bit array. It has multiple intermediate
    # floating point additions that benefit from the extra precision
    cdef float64_t[:, :] P = np.zeros(
        (n_samples, n_neighbors), dtype=np.float64)

    for i in range(n_samples):
        # Bisection bounds start unbounded; they are tightened as the
        # search brackets the target entropy.
        beta_min = -INFINITY
        beta_max = INFINITY
        beta = 1.0

        # Binary search of precision for i-th conditional distribution
        for l in range(n_steps):
            # Compute current entropy and corresponding probabilities
            # computed just over the nearest neighbors or over all data
            # if we're not using neighbors
            sum_Pi = 0.0
            for j in range(n_neighbors):
                if j != i or using_neighbors:
                    P[i, j] = math.exp(-sqdistances[i, j] * beta)
                    sum_Pi += P[i, j]

            # Guard the log/division below against an all-zero row (all
            # neighbors extremely far away at this beta).
            if sum_Pi == 0.0:
                sum_Pi = EPSILON_DBL
            sum_disti_Pi = 0.0

            for j in range(n_neighbors):
                P[i, j] /= sum_Pi
                sum_disti_Pi += sqdistances[i, j] * P[i, j]

            # Shannon entropy of the normalized row, expressed in closed
            # form: H = log(Z) + beta * <d^2>.
            entropy = math.log(sum_Pi) + beta * sum_disti_Pi
            entropy_diff = entropy - desired_entropy

            if math.fabs(entropy_diff) <= PERPLEXITY_TOLERANCE:
                break

            if entropy_diff > 0.0:
                # Entropy too high -> distribution too flat -> sharpen it
                # by increasing beta (doubling until an upper bound exists).
                beta_min = beta
                if beta_max == INFINITY:
                    beta *= 2.0
                else:
                    beta = (beta + beta_max) / 2.0
            else:
                # Entropy too low -> decrease beta symmetrically.
                beta_max = beta
                if beta_min == -INFINITY:
                    beta /= 2.0
                else:
                    beta = (beta + beta_min) / 2.0

        beta_sum += beta

        if verbose and ((i + 1) % 1000 == 0 or i + 1 == n_samples):
            print("[t-SNE] Computed conditional probabilities for sample "
                  "%d / %d" % (i + 1, n_samples))

    if verbose:
        # sigma = 1/sqrt(beta); report the aggregate scale of the kernels.
        print("[t-SNE] Mean sigma: %f"
              % np.mean(math.sqrt(n_samples / beta_sum)))
    return np.asarray(P)
|
||||
@@ -0,0 +1,16 @@
|
||||
# Cython extension with t-SNE helper routines (the binary search for
# per-sample Gaussian precisions lives in _utils.pyx).
py.extension_module(
  '_utils',
  ['_utils.pyx', utils_cython_tree],
  cython_args: cython_args,
  subdir: 'sklearn/manifold',
  install: true
)

# Barnes-Hut t-SNE gradient kernel; depends on the NumPy headers and
# OpenMP (np_dep / openmp_dep are defined at a higher build level).
py.extension_module(
  '_barnes_hut_tsne',
  '_barnes_hut_tsne.pyx',
  dependencies: [np_dep, openmp_dep],
  cython_args: cython_args,
  subdir: 'sklearn/manifold',
  install: true
)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,348 @@
|
||||
import math
|
||||
from itertools import product
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy.sparse import rand as sparse_rand
|
||||
|
||||
from sklearn import clone, datasets, manifold, neighbors, pipeline, preprocessing
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.metrics.pairwise import pairwise_distances
|
||||
from sklearn.utils._testing import (
|
||||
assert_allclose,
|
||||
assert_allclose_dense_sparse,
|
||||
assert_array_equal,
|
||||
)
|
||||
from sklearn.utils.fixes import CSR_CONTAINERS
|
||||
|
||||
eigen_solvers = ["auto", "dense", "arpack"]
|
||||
path_methods = ["auto", "FW", "D"]
|
||||
|
||||
|
||||
def create_sample_data(dtype, n_pts=25, add_noise=False):
    """Return a square 2D grid of equidistant points, optionally with a
    small deterministic noise column appended as a third dimension."""
    side = int(math.sqrt(n_pts))
    # Cartesian product of the grid coordinates -> n_pts points in 2D.
    coords = np.array(list(product(range(side), repeat=2)))
    data = coords.astype(dtype, copy=False)
    if add_noise:
        # Seeded generator keeps the perturbation reproducible.
        rng = np.random.RandomState(0)
        extra = 0.1 * rng.randn(n_pts, 1).astype(dtype, copy=False)
        data = np.concatenate((data, extra), 1)
    return data
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_neighbors, radius", [(24, None), (None, np.inf)])
@pytest.mark.parametrize("eigen_solver", eigen_solvers)
@pytest.mark.parametrize("path_method", path_methods)
def test_isomap_simple_grid(
    global_dtype, n_neighbors, radius, eigen_solver, path_method
):
    # Isomap should preserve distances when all neighbors are used:
    # with a flat grid and a full neighborhood, the embedding must be an
    # isometry up to rigid motion, so the neighbor-distance graphs of the
    # input and of the embedding should match.
    n_pts = 25
    X = create_sample_data(global_dtype, n_pts=n_pts, add_noise=False)

    # distances from each point to all others (both parametrizations —
    # 24 neighbors or infinite radius — cover the whole dataset)
    if n_neighbors is not None:
        G = neighbors.kneighbors_graph(X, n_neighbors, mode="distance")
    else:
        G = neighbors.radius_neighbors_graph(X, radius, mode="distance")

    clf = manifold.Isomap(
        n_neighbors=n_neighbors,
        radius=radius,
        n_components=2,
        eigen_solver=eigen_solver,
        path_method=path_method,
    )
    clf.fit(X)

    # Same distance graph, computed on the embedded points.
    if n_neighbors is not None:
        G_iso = neighbors.kneighbors_graph(clf.embedding_, n_neighbors, mode="distance")
    else:
        G_iso = neighbors.radius_neighbors_graph(
            clf.embedding_, radius, mode="distance"
        )
    # float32 accumulates rounding error; allow a small absolute tolerance.
    atol = 1e-5 if global_dtype == np.float32 else 0
    assert_allclose_dense_sparse(G, G_iso, atol=atol)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_neighbors, radius", [(24, None), (None, np.inf)])
@pytest.mark.parametrize("eigen_solver", eigen_solvers)
@pytest.mark.parametrize("path_method", path_methods)
def test_isomap_reconstruction_error(
    global_dtype, n_neighbors, radius, eigen_solver, path_method
):
    # `Isomap.reconstruction_error()` must agree with the error recomputed
    # by hand from the centered input/output distance kernels.
    if global_dtype is np.float32:
        pytest.skip(
            "Skipping test due to numerical instabilities on float32 data"
            "from KernelCenterer used in the reconstruction_error method"
        )

    # Same setup as in test_isomap_simple_grid, with an added dimension
    n_pts = 25
    X = create_sample_data(global_dtype, n_pts=n_pts, add_noise=True)

    # compute input kernel: centered -0.5 * D^2 (classical MDS kernel)
    if n_neighbors is not None:
        G = neighbors.kneighbors_graph(X, n_neighbors, mode="distance").toarray()
    else:
        G = neighbors.radius_neighbors_graph(X, radius, mode="distance").toarray()
    centerer = preprocessing.KernelCenterer()
    K = centerer.fit_transform(-0.5 * G**2)

    clf = manifold.Isomap(
        n_neighbors=n_neighbors,
        radius=radius,
        n_components=2,
        eigen_solver=eigen_solver,
        path_method=path_method,
    )
    clf.fit(X)

    # compute output kernel on the embedded points
    if n_neighbors is not None:
        G_iso = neighbors.kneighbors_graph(clf.embedding_, n_neighbors, mode="distance")
    else:
        G_iso = neighbors.radius_neighbors_graph(
            clf.embedding_, radius, mode="distance"
        )
    G_iso = G_iso.toarray()
    K_iso = centerer.fit_transform(-0.5 * G_iso**2)

    # make sure error agrees: Frobenius norm of the kernel difference,
    # normalized by the number of points.
    reconstruction_error = np.linalg.norm(K - K_iso) / n_pts
    atol = 1e-5 if global_dtype == np.float32 else 0
    assert_allclose(reconstruction_error, clf.reconstruction_error(), atol=atol)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_neighbors, radius", [(2, None), (None, 0.5)])
def test_transform(global_dtype, n_neighbors, radius):
    """Out-of-sample `transform` on slightly perturbed points must land
    close to the training embedding (RMS error comparable to the noise)."""
    n_samples = 200
    n_components = 10
    noise_scale = 0.01

    # S-curve dataset cast to the dtype under test.
    samples, _ = datasets.make_s_curve(n_samples, random_state=0)
    samples = samples.astype(global_dtype, copy=False)

    # Fit the Isomap embedding on the clean points.
    model = manifold.Isomap(
        n_components=n_components, n_neighbors=n_neighbors, radius=radius
    )
    embedded = model.fit_transform(samples)

    # Re-embed a perturbed copy of the same points.
    rng = np.random.RandomState(0)
    perturbation = noise_scale * rng.randn(*samples.shape)
    re_embedded = model.transform(samples + perturbation)

    # The re-embedding error should stay on the order of the noise itself.
    rms_error = np.sqrt(np.mean((embedded - re_embedded) ** 2))
    assert rms_error < 2 * noise_scale
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_neighbors, radius", [(2, None), (None, 10.0)])
def test_pipeline(n_neighbors, radius, global_dtype):
    """Smoke test: Isomap must be usable as a Pipeline transformer.

    Only checks that no error is raised and the downstream classifier
    reaches a reasonable training score.
    """
    # TODO check that it actually does something useful
    data, labels = datasets.make_blobs(random_state=0)
    data = data.astype(global_dtype, copy=False)
    estimator = pipeline.Pipeline(
        [
            ("isomap", manifold.Isomap(n_neighbors=n_neighbors, radius=radius)),
            ("clf", neighbors.KNeighborsClassifier()),
        ]
    )
    estimator.fit(data, labels)
    assert estimator.score(data, labels) > 0.9
|
||||
|
||||
|
||||
def test_pipeline_with_nearest_neighbors_transformer(global_dtype):
    # Test chaining NearestNeighborsTransformer and Isomap with
    # neighbors_algorithm='precomputed': both routes must produce the same
    # embedding, for fit_transform and for out-of-sample transform.
    algorithm = "auto"
    n_neighbors = 10

    X, _ = datasets.make_blobs(random_state=0)
    X2, _ = datasets.make_blobs(random_state=1)

    X = X.astype(global_dtype, copy=False)
    X2 = X2.astype(global_dtype, copy=False)

    # compare the chained version and the compact version
    est_chain = pipeline.make_pipeline(
        neighbors.KNeighborsTransformer(
            n_neighbors=n_neighbors, algorithm=algorithm, mode="distance"
        ),
        manifold.Isomap(n_neighbors=n_neighbors, metric="precomputed"),
    )
    est_compact = manifold.Isomap(
        n_neighbors=n_neighbors, neighbors_algorithm=algorithm
    )

    Xt_chain = est_chain.fit_transform(X)
    Xt_compact = est_compact.fit_transform(X)
    assert_allclose(Xt_chain, Xt_compact)

    # Out-of-sample data (different blobs) must also agree between routes.
    Xt_chain = est_chain.transform(X2)
    Xt_compact = est_compact.transform(X2)
    assert_allclose(Xt_chain, Xt_compact)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "metric, p, is_euclidean",
    [
        ("euclidean", 2, True),
        ("manhattan", 1, False),
        ("minkowski", 1, False),
        ("minkowski", 2, True),
        (lambda x1, x2: np.sqrt(np.sum(x1**2 + x2**2)), 2, False),
    ],
)
def test_different_metric(global_dtype, metric, p, is_euclidean):
    # Isomap must accept various metric parameters and must default to
    # euclidean: euclidean-equivalent metrics reproduce the default
    # embedding, non-euclidean ones must differ from it.
    X, _ = datasets.make_blobs(random_state=0)
    X = X.astype(global_dtype, copy=False)

    reference = manifold.Isomap().fit_transform(X)
    embedding = manifold.Isomap(metric=metric, p=p).fit_transform(X)

    if is_euclidean:
        assert_allclose(embedding, reference)
    else:
        # The inner assert_allclose is *expected* to fail here.
        with pytest.raises(AssertionError, match="Not equal to tolerance"):
            assert_allclose(embedding, reference)
|
||||
|
||||
|
||||
def test_isomap_clone_bug():
    """Non-regression test for #6062: ``set_params`` must take effect on
    subsequent fits of the same estimator instance."""
    model = manifold.Isomap()
    for k in (10, 15, 20):
        model.set_params(n_neighbors=k)
        model.fit(np.random.rand(50, 2))
        # The fitted neighbors estimator must reflect the new setting.
        assert model.nbrs_.n_neighbors == k
|
||||
|
||||
|
||||
@pytest.mark.parametrize("eigen_solver", eigen_solvers)
@pytest.mark.parametrize("path_method", path_methods)
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_sparse_input(
    global_dtype, eigen_solver, path_method, global_random_seed, csr_container
):
    # Sparse and dense input must produce (numerically) the same embedding.
    # TODO: compare results on dense and sparse data as proposed in:
    # https://github.com/scikit-learn/scikit-learn/pull/23585#discussion_r968388186
    X = csr_container(
        sparse_rand(
            100,
            3,
            density=0.1,
            format="csr",
            dtype=global_dtype,
            random_state=global_random_seed,
        )
    )

    iso_dense = manifold.Isomap(
        n_components=2,
        eigen_solver=eigen_solver,
        path_method=path_method,
        n_neighbors=8,
    )
    # clone() gives an unfitted estimator with identical parameters.
    iso_sparse = clone(iso_dense)

    X_trans_dense = iso_dense.fit_transform(X.toarray())
    X_trans_sparse = iso_sparse.fit_transform(X)

    assert_allclose(X_trans_sparse, X_trans_dense, rtol=1e-4, atol=1e-4)
|
||||
|
||||
|
||||
def test_isomap_fit_precomputed_radius_graph(global_dtype):
    # Isomap.fit_transform must yield a similar result when using
    # a precomputed distance matrix.
    X, y = datasets.make_s_curve(200, random_state=0)
    X = X.astype(global_dtype, copy=False)
    radius = 10

    g = neighbors.radius_neighbors_graph(X, radius=radius, mode="distance")
    isomap = manifold.Isomap(n_neighbors=None, radius=radius, metric="precomputed")
    isomap.fit(g)
    precomputed_result = isomap.embedding_

    isomap = manifold.Isomap(n_neighbors=None, radius=radius, metric="minkowski")
    result = isomap.fit_transform(X)
    # float32 accumulates more rounding error, hence the looser tolerance.
    atol = 1e-5 if global_dtype == np.float32 else 0
    assert_allclose(precomputed_result, result, atol=atol)
def test_isomap_fitted_attributes_dtype(global_dtype):
    """Fitted attributes must be stored with the dtype of the training data."""
    X = np.array([[1, 2], [3, 4], [5, 6]], dtype=global_dtype)

    iso = manifold.Isomap(n_neighbors=2).fit(X)

    assert iso.dist_matrix_.dtype == global_dtype
    assert iso.embedding_.dtype == global_dtype
def test_isomap_dtype_equivalence():
    """Check the equivalence of the results with 32 and 64 bits input."""
    iso_32 = manifold.Isomap(n_neighbors=2)
    X_32 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32)
    iso_32.fit(X_32)

    iso_64 = manifold.Isomap(n_neighbors=2)
    X_64 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float64)
    iso_64.fit(X_64)

    # NOTE(review): only the geodesic distance matrices are compared here;
    # presumably the embeddings are skipped because of sign ambiguity —
    # confirm before extending.
    assert_allclose(iso_32.dist_matrix_, iso_64.dist_matrix_)
def test_isomap_raise_error_when_neighbor_and_radius_both_set():
    # Isomap.fit_transform must raise a ValueError if both
    # radius and n_neighbors are provided.
    X, _ = datasets.load_digits(return_X_y=True)
    isomap = manifold.Isomap(n_neighbors=3, radius=5.5)
    msg = "Both n_neighbors and radius are provided"
    with pytest.raises(ValueError, match=msg):
        isomap.fit_transform(X)
def test_multiple_connected_components():
    # Test that a warning is raised when the neighbors graph has multiple
    # connected components.
    X = np.array([0, 1, 2, 5, 6, 7])[:, None]
    with pytest.warns(UserWarning, match="number of connected components"):
        manifold.Isomap(n_neighbors=2).fit(X)
def test_multiple_connected_components_metric_precomputed(global_dtype):
    # With multiple connected components, a precomputed *dense* distance
    # matrix only warns, while a precomputed *sparse* neighbors graph must
    # raise an error.
    X = np.array([0, 1, 2, 5, 6, 7])[:, None].astype(global_dtype, copy=False)

    # works with a precomputed distance matrix (dense)
    X_distances = pairwise_distances(X)
    with pytest.warns(UserWarning, match="number of connected components"):
        manifold.Isomap(n_neighbors=1, metric="precomputed").fit(X_distances)

    # does not work with a precomputed neighbors graph (sparse)
    X_graph = neighbors.kneighbors_graph(X, n_neighbors=2, mode="distance")
    with pytest.raises(RuntimeError, match="number of connected components"):
        manifold.Isomap(n_neighbors=1, metric="precomputed").fit(X_graph)
def test_get_feature_names_out():
    """Check get_feature_names_out for Isomap."""
    X, y = make_blobs(random_state=0, n_features=4)
    n_components = 2

    iso = manifold.Isomap(n_components=n_components)
    iso.fit_transform(X)
    names = iso.get_feature_names_out()
    # Output feature names follow the lowercase "<estimator><i>" convention.
    assert_array_equal([f"isomap{i}" for i in range(n_components)], names)
@@ -0,0 +1,171 @@
|
||||
from itertools import product
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy import linalg
|
||||
|
||||
from sklearn import manifold, neighbors
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.manifold._locally_linear import barycenter_kneighbors_graph
|
||||
from sklearn.utils._testing import (
|
||||
assert_allclose,
|
||||
assert_array_equal,
|
||||
ignore_warnings,
|
||||
)
|
||||
|
||||
eigen_solvers = ["dense", "arpack"]
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------
|
||||
# Test utility routines
|
||||
def test_barycenter_kneighbors_graph(global_dtype):
    """Check barycenter weights graph: dtype preservation, expected weights
    with k=1, and the affine (sum-to-one) constraint with k=2."""
    X = np.array([[0, 1], [1.01, 1.0], [2, 0]], dtype=global_dtype)

    graph = barycenter_kneighbors_graph(X, 1)
    expected_graph = np.array(
        [[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype=global_dtype
    )

    assert graph.dtype == global_dtype

    assert_allclose(graph.toarray(), expected_graph)

    graph = barycenter_kneighbors_graph(X, 2)
    # check that rows sum to one (axis=1): barycenter weights form an
    # affine combination of the neighbors
    assert_allclose(np.sum(graph.toarray(), axis=1), np.ones(3))
    pred = np.dot(graph.toarray(), X)
    assert linalg.norm(pred - X) / X.shape[0] < 1
# ----------------------------------------------------------------------
|
||||
# Test LLE by computing the reconstruction error on some manifolds.
|
||||
|
||||
|
||||
def test_lle_simple_grid(global_dtype):
    # note: ARPACK is numerically unstable, so this test will fail for
    # some random seeds. We choose 42 because the tests pass.
    # for arm64 platforms 2 makes the test fail.
    # TODO: rewrite this test to make less sensitive to the random seed,
    # irrespective of the platform.
    rng = np.random.RandomState(42)

    # grid of equidistant points in 2D, n_components = n_dim
    X = np.array(list(product(range(5), repeat=2)))
    # tiny jitter to break the exact degeneracy of the grid
    X = X + 1e-10 * rng.uniform(size=X.shape)
    X = X.astype(global_dtype, copy=False)

    n_components = 2
    clf = manifold.LocallyLinearEmbedding(
        n_neighbors=5, n_components=n_components, random_state=rng
    )
    tol = 0.1

    # The barycenter weights must reconstruct X almost perfectly.
    N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray()
    reconstruction_error = linalg.norm(np.dot(N, X) - X, "fro")
    assert reconstruction_error < tol

    for solver in eigen_solvers:
        clf.set_params(eigen_solver=solver)
        clf.fit(X)
        assert clf.embedding_.shape[1] == n_components
        reconstruction_error = (
            linalg.norm(np.dot(N, clf.embedding_) - clf.embedding_, "fro") ** 2
        )

        assert reconstruction_error < tol
        assert_allclose(clf.reconstruction_error_, reconstruction_error, atol=1e-1)

    # re-embed a noisy version of X using the transform method
    noise = rng.randn(*X.shape).astype(global_dtype, copy=False) / 100
    X_reembedded = clf.transform(X + noise)
    assert linalg.norm(X_reembedded - clf.embedding_) < tol
@pytest.mark.parametrize("method", ["standard", "hessian", "modified", "ltsa"])
@pytest.mark.parametrize("solver", eigen_solvers)
def test_lle_manifold(global_dtype, method, solver):
    """Check the LLE reconstruction error on a curved 3D manifold for every
    method/solver combination."""
    rng = np.random.RandomState(0)
    # similar test on a slightly more complex manifold
    X = np.array(list(product(np.arange(18), repeat=2)))
    # lift the grid onto a parabolic surface to make it genuinely 3D
    X = np.c_[X, X[:, 0] ** 2 / 18]
    X = X + 1e-10 * rng.uniform(size=X.shape)
    X = X.astype(global_dtype, copy=False)
    n_components = 2

    clf = manifold.LocallyLinearEmbedding(
        n_neighbors=6, n_components=n_components, method=method, random_state=0
    )
    # non-standard variants are less accurate, hence the looser tolerance
    tol = 1.5 if method == "standard" else 3

    N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray()
    reconstruction_error = linalg.norm(np.dot(N, X) - X)
    assert reconstruction_error < tol

    clf.set_params(eigen_solver=solver)
    clf.fit(X)
    assert clf.embedding_.shape[1] == n_components
    reconstruction_error = (
        linalg.norm(np.dot(N, clf.embedding_) - clf.embedding_, "fro") ** 2
    )
    details = "solver: %s, method: %s" % (solver, method)
    assert reconstruction_error < tol, details
    assert (
        np.abs(clf.reconstruction_error_ - reconstruction_error)
        < tol * reconstruction_error
    ), details
def test_pipeline():
    # check that LocallyLinearEmbedding works fine as a Pipeline
    # only checks that no error is raised.
    # TODO check that it actually does something useful
    from sklearn import datasets, pipeline

    X, y = datasets.make_blobs(random_state=0)
    clf = pipeline.Pipeline(
        [
            ("filter", manifold.LocallyLinearEmbedding(random_state=0)),
            ("clf", neighbors.KNeighborsClassifier()),
        ]
    )
    clf.fit(X, y)
    # the embedding must retain enough structure for a good KNN score
    assert 0.9 < clf.score(X, y)
# Test the error raised when the weight matrix is singular
def test_singular_matrix():
    # All-identical samples make the local weight matrix singular, which the
    # ARPACK null-space computation cannot handle.
    M = np.ones((200, 3))
    f = ignore_warnings
    # NOTE(review): `f` is applied to the *result* of the call, so warnings
    # emitted before the ValueError are not actually suppressed — confirm
    # whether `f(...)` should wrap the callable instead.
    with pytest.raises(ValueError, match="Error in determining null-space with ARPACK"):
        f(
            manifold.locally_linear_embedding(
                M,
                n_neighbors=2,
                n_components=1,
                method="standard",
                eigen_solver="arpack",
            )
        )
# regression test for #6033
def test_integer_input():
    # Integer-dtype input must be accepted by every LLE method.
    rand = np.random.RandomState(0)
    X = rand.randint(0, 100, size=(20, 3))

    for method in ["standard", "hessian", "modified", "ltsa"]:
        clf = manifold.LocallyLinearEmbedding(method=method, n_neighbors=10)
        clf.fit(X)  # this previously raised a TypeError
def test_get_feature_names_out():
    """Check get_feature_names_out for LocallyLinearEmbedding."""
    X, y = make_blobs(random_state=0, n_features=4)
    n_components = 2

    iso = manifold.LocallyLinearEmbedding(n_components=n_components)
    iso.fit(X)
    names = iso.get_feature_names_out()
    # Output feature names follow the lowercase "<estimator><i>" convention.
    assert_array_equal(
        [f"locallylinearembedding{i}" for i in range(n_components)], names
    )
@@ -0,0 +1,87 @@
|
||||
from unittest.mock import Mock
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose, assert_array_almost_equal
|
||||
|
||||
from sklearn.manifold import _mds as mds
|
||||
from sklearn.metrics import euclidean_distances
|
||||
|
||||
|
||||
def test_smacof():
    # test metric smacof using the data of "Modern Multidimensional Scaling",
    # Borg & Groenen, p 154
    sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
    Z = np.array([[-0.266, -0.539], [0.451, 0.252], [0.016, -0.238], [-0.200, 0.524]])
    # A single majorization iteration from the given init must reproduce the
    # textbook configuration.
    X, _ = mds.smacof(sim, init=Z, n_components=2, max_iter=1, n_init=1)
    X_true = np.array(
        [[-1.415, -2.471], [1.633, 1.107], [0.249, -0.067], [-0.468, 1.431]]
    )
    assert_array_almost_equal(X, X_true, decimal=3)
def test_smacof_error():
    # Not symmetric similarity matrix:
    sim = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])

    with pytest.raises(ValueError):
        mds.smacof(sim)

    # Not squared similarity matrix:
    sim = np.array([[0, 5, 9, 4], [5, 0, 2, 2], [4, 2, 1, 0]])

    with pytest.raises(ValueError):
        mds.smacof(sim)

    # init not None and not correct format:
    # (3 rows for a 4-sample dissimilarity matrix)
    sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])

    Z = np.array([[-0.266, -0.539], [0.016, -0.238], [-0.200, 0.524]])
    with pytest.raises(ValueError):
        mds.smacof(sim, init=Z, n_init=1)
def test_MDS():
    # Smoke test: non-metric MDS with a precomputed dissimilarity matrix and
    # parallel SMACOF runs must fit without error.
    sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
    mds_clf = mds.MDS(metric=False, n_jobs=3, dissimilarity="precomputed")
    mds_clf.fit(sim)
@pytest.mark.parametrize("k", [0.5, 1.5, 2])
def test_normed_stress(k):
    """Test that non-metric MDS normalized stress is scale-invariant."""
    sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])

    X1, stress1 = mds.smacof(sim, metric=False, max_iter=5, random_state=0)
    # Scaling the dissimilarities by `k` must not change the solution.
    X2, stress2 = mds.smacof(k * sim, metric=False, max_iter=5, random_state=0)

    assert_allclose(stress1, stress2, rtol=1e-5)
    assert_allclose(X1, X2, rtol=1e-5)
def test_normalize_metric_warning():
    """
    Test that a ValueError is raised when normalized stress is requested
    together with metric-MDS (the combination is not supported).
    """
    msg = "Normalized stress is not supported"
    sim = np.array([[0, 5, 3, 4], [5, 0, 2, 2], [3, 2, 0, 1], [4, 2, 1, 0]])
    with pytest.raises(ValueError, match=msg):
        mds.smacof(sim, metric=True, normalized_stress=True)
@pytest.mark.parametrize("metric", [True, False])
def test_normalized_stress_auto(metric, monkeypatch):
    # `normalized_stress="auto"` must resolve to the opposite of `metric`:
    # the normalization is only applied in the non-metric case.
    rng = np.random.RandomState(0)
    X = rng.randn(4, 3)
    dist = euclidean_distances(X)

    # Spy on the internal solver to inspect the resolved keyword argument.
    mock = Mock(side_effect=mds._smacof_single)
    monkeypatch.setattr("sklearn.manifold._mds._smacof_single", mock)

    est = mds.MDS(metric=metric, normalized_stress="auto", random_state=rng)
    est.fit_transform(X)
    assert mock.call_args[1]["normalized_stress"] != metric

    mds.smacof(dist, metric=metric, normalized_stress="auto", random_state=rng)
    assert mock.call_args[1]["normalized_stress"] != metric
@@ -0,0 +1,541 @@
|
||||
from unittest.mock import Mock
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy import sparse
|
||||
from scipy.linalg import eigh
|
||||
from scipy.sparse.linalg import eigsh, lobpcg
|
||||
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.manifold import SpectralEmbedding, _spectral_embedding, spectral_embedding
|
||||
from sklearn.manifold._spectral_embedding import (
|
||||
_graph_connected_component,
|
||||
_graph_is_connected,
|
||||
)
|
||||
from sklearn.metrics import normalized_mutual_info_score, pairwise_distances
|
||||
from sklearn.metrics.pairwise import rbf_kernel
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal
|
||||
from sklearn.utils.extmath import _deterministic_vector_sign_flip
|
||||
from sklearn.utils.fixes import (
|
||||
COO_CONTAINERS,
|
||||
CSC_CONTAINERS,
|
||||
CSR_CONTAINERS,
|
||||
parse_version,
|
||||
sp_version,
|
||||
)
|
||||
from sklearn.utils.fixes import laplacian as csgraph_laplacian
|
||||
|
||||
try:
|
||||
from pyamg import smoothed_aggregation_solver # noqa
|
||||
|
||||
pyamg_available = True
|
||||
except ImportError:
|
||||
pyamg_available = False
|
||||
skip_if_no_pyamg = pytest.mark.skipif(
|
||||
not pyamg_available, reason="PyAMG is required for the tests in this function."
|
||||
)
|
||||
|
||||
# Non-centered, sparse cluster centers for the shared module-level dataset
# (the original comment was truncated — presumably "to check the handling of
# non-centered data"; confirm against upstream history).
centers = np.array(
    [
        [0.0, 5.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 4.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 5.0, 1.0],
    ]
)
n_samples = 1000
n_clusters, n_features = centers.shape
# Shared blob dataset: `S` is the sample matrix, `true_labels` the cluster ids.
S, true_labels = make_blobs(
    n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42
)
def _assert_equal_with_sign_flipping(A, B, tol=0.0):
|
||||
"""Check array A and B are equal with possible sign flipping on
|
||||
each columns"""
|
||||
tol_squared = tol**2
|
||||
for A_col, B_col in zip(A.T, B.T):
|
||||
assert (
|
||||
np.max((A_col - B_col) ** 2) <= tol_squared
|
||||
or np.max((A_col + B_col) ** 2) <= tol_squared
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_sparse_graph_connected_component(coo_container):
    """Build a sparse affinity matrix with four known connected components
    and check that ``_graph_connected_component`` recovers each one."""
    rng = np.random.RandomState(42)
    n_samples = 300
    # Component boundaries in the permuted sample ordering.
    boundaries = [0, 42, 121, 200, n_samples]
    p = rng.permutation(n_samples)
    connections = []

    for start, stop in zip(boundaries[:-1], boundaries[1:]):
        group = p[start:stop]
        # Connect all elements within the group at least once via an
        # arbitrary path that spans the group.
        for i in range(len(group) - 1):
            connections.append((group[i], group[i + 1]))

        # Add some more random connections within the group
        min_idx, max_idx = 0, len(group) - 1
        n_random_connections = 1000
        source = rng.randint(min_idx, max_idx, size=n_random_connections)
        target = rng.randint(min_idx, max_idx, size=n_random_connections)
        connections.extend(zip(group[source], group[target]))

    # Build a symmetric affinity matrix
    row_idx, column_idx = tuple(np.array(connections).T)
    data = rng.uniform(0.1, 42, size=len(connections))
    affinity = coo_container((data, (row_idx, column_idx)))
    affinity = 0.5 * (affinity + affinity.T)

    for start, stop in zip(boundaries[:-1], boundaries[1:]):
        component_1 = _graph_connected_component(affinity, p[start])
        component_size = stop - start
        assert component_1.sum() == component_size

        # We should retrieve the same component mask by starting by both ends
        # of the group
        component_2 = _graph_connected_component(affinity, p[stop - 1])
        assert component_2.sum() == component_size
        assert_array_equal(component_1, component_2)
# TODO: investigate why this test is seed-sensitive on 32-bit Python
# runtimes. Is this revealing a numerical stability problem ? Or is it
# expected from the test numerical design ? In the latter case the test
# should be made less seed-sensitive instead.
@pytest.mark.parametrize(
    "eigen_solver",
    [
        "arpack",
        "lobpcg",
        pytest.param("amg", marks=skip_if_no_pyamg),
    ],
)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_spectral_embedding_two_components(eigen_solver, dtype, seed=0):
    # Test spectral embedding with two components
    random_state = np.random.RandomState(seed)
    n_sample = 100
    affinity = np.zeros(shape=[n_sample * 2, n_sample * 2])
    # first component
    affinity[0:n_sample, 0:n_sample] = (
        np.abs(random_state.randn(n_sample, n_sample)) + 2
    )
    # second component
    affinity[n_sample::, n_sample::] = (
        np.abs(random_state.randn(n_sample, n_sample)) + 2
    )

    # Test of internal _graph_connected_component before connection
    component = _graph_connected_component(affinity, 0)
    assert component[:n_sample].all()
    assert not component[n_sample:].any()
    component = _graph_connected_component(affinity, -1)
    assert not component[:n_sample].any()
    assert component[n_sample:].all()

    # connect the two blocks with a single symmetric edge
    affinity[0, n_sample + 1] = 1
    affinity[n_sample + 1, 0] = 1
    # zero the diagonal, then symmetrize
    affinity.flat[:: 2 * n_sample + 1] = 0
    affinity = 0.5 * (affinity + affinity.T)

    true_label = np.zeros(shape=2 * n_sample)
    true_label[0:n_sample] = 1

    se_precomp = SpectralEmbedding(
        n_components=1,
        affinity="precomputed",
        random_state=np.random.RandomState(seed),
        eigen_solver=eigen_solver,
    )

    embedded_coordinate = se_precomp.fit_transform(affinity.astype(dtype))
    # thresholding on the first components using 0.
    label_ = np.array(embedded_coordinate.ravel() < 0, dtype=np.int64)
    assert normalized_mutual_info_score(true_label, label_) == pytest.approx(1.0)
@pytest.mark.parametrize("sparse_container", [None, *CSR_CONTAINERS])
@pytest.mark.parametrize(
    "eigen_solver",
    [
        "arpack",
        "lobpcg",
        pytest.param("amg", marks=skip_if_no_pyamg),
    ],
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_spectral_embedding_precomputed_affinity(
    sparse_container, eigen_solver, dtype, seed=36
):
    # Test spectral embedding with precomputed kernel: feeding the RBF kernel
    # matrix as "precomputed" must match the built-in "rbf" affinity.
    gamma = 1.0
    X = S if sparse_container is None else sparse_container(S)

    se_precomp = SpectralEmbedding(
        n_components=2,
        affinity="precomputed",
        random_state=np.random.RandomState(seed),
        eigen_solver=eigen_solver,
    )
    se_rbf = SpectralEmbedding(
        n_components=2,
        affinity="rbf",
        gamma=gamma,
        random_state=np.random.RandomState(seed),
        eigen_solver=eigen_solver,
    )
    embed_precomp = se_precomp.fit_transform(rbf_kernel(X.astype(dtype), gamma=gamma))
    embed_rbf = se_rbf.fit_transform(X.astype(dtype))
    assert_array_almost_equal(se_precomp.affinity_matrix_, se_rbf.affinity_matrix_)
    # Embeddings are only defined up to a per-column sign flip.
    _assert_equal_with_sign_flipping(embed_precomp, embed_rbf, 0.05)
def test_precomputed_nearest_neighbors_filtering():
    # Test precomputed graph filtering when containing too many neighbors:
    # extra neighbors in the precomputed graph must be filtered down to
    # `n_neighbors`, leaving the embedding unchanged.
    n_neighbors = 2
    results = []
    for additional_neighbors in [0, 10]:
        nn = NearestNeighbors(n_neighbors=n_neighbors + additional_neighbors).fit(S)
        graph = nn.kneighbors_graph(S, mode="connectivity")
        embedding = (
            SpectralEmbedding(
                random_state=0,
                n_components=2,
                affinity="precomputed_nearest_neighbors",
                n_neighbors=n_neighbors,
            )
            .fit(graph)
            .embedding_
        )
        results.append(embedding)

    assert_array_equal(results[0], results[1])
@pytest.mark.parametrize("sparse_container", [None, *CSR_CONTAINERS])
def test_spectral_embedding_callable_affinity(sparse_container, seed=36):
    # Test spectral embedding with callable affinity: a callable computing
    # the RBF kernel must behave exactly like the built-in "rbf" affinity.
    gamma = 0.9
    kern = rbf_kernel(S, gamma=gamma)
    X = S if sparse_container is None else sparse_container(S)

    se_callable = SpectralEmbedding(
        n_components=2,
        affinity=(lambda x: rbf_kernel(x, gamma=gamma)),
        gamma=gamma,
        random_state=np.random.RandomState(seed),
    )
    se_rbf = SpectralEmbedding(
        n_components=2,
        affinity="rbf",
        gamma=gamma,
        random_state=np.random.RandomState(seed),
    )
    embed_rbf = se_rbf.fit_transform(X)
    embed_callable = se_callable.fit_transform(X)
    assert_array_almost_equal(se_callable.affinity_matrix_, se_rbf.affinity_matrix_)
    assert_array_almost_equal(kern, se_rbf.affinity_matrix_)
    # Embeddings are only defined up to a per-column sign flip.
    _assert_equal_with_sign_flipping(embed_rbf, embed_callable, 0.05)
# TODO: Remove when pyamg does replaces sp.rand call with np.random.rand
# https://github.com/scikit-learn/scikit-learn/issues/15913
@pytest.mark.filterwarnings(
    "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of np.float
@pytest.mark.filterwarnings(
    "ignore:`np.float` is a deprecated alias:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of pinv2
@pytest.mark.filterwarnings(
    "ignore:scipy.linalg.pinv2 is deprecated:DeprecationWarning:pyamg.*"
)
@pytest.mark.filterwarnings(
    "ignore:np.find_common_type is deprecated:DeprecationWarning:pyamg.*"
)
@pytest.mark.skipif(
    not pyamg_available, reason="PyAMG is required for the tests in this function."
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
def test_spectral_embedding_amg_solver(dtype, coo_container, seed=36):
    # The "amg" eigen solver must agree with "arpack" up to sign flips.
    se_amg = SpectralEmbedding(
        n_components=2,
        affinity="nearest_neighbors",
        eigen_solver="amg",
        n_neighbors=5,
        random_state=np.random.RandomState(seed),
    )
    se_arpack = SpectralEmbedding(
        n_components=2,
        affinity="nearest_neighbors",
        eigen_solver="arpack",
        n_neighbors=5,
        random_state=np.random.RandomState(seed),
    )
    embed_amg = se_amg.fit_transform(S.astype(dtype))
    embed_arpack = se_arpack.fit_transform(S.astype(dtype))
    _assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5)

    # same with special case in which amg is not actually used
    # regression test for #10715
    # affinity between nodes
    row = np.array([0, 0, 1, 2, 3, 3, 4], dtype=np.int32)
    col = np.array([1, 2, 2, 3, 4, 5, 5], dtype=np.int32)
    val = np.array([100, 100, 100, 1, 100, 100, 100], dtype=np.int64)

    affinity = coo_container(
        (np.hstack([val, val]), (np.hstack([row, col]), np.hstack([col, row]))),
        shape=(6, 6),
    )
    se_amg.affinity = "precomputed"
    se_arpack.affinity = "precomputed"
    embed_amg = se_amg.fit_transform(affinity.astype(dtype))
    embed_arpack = se_arpack.fit_transform(affinity.astype(dtype))
    _assert_equal_with_sign_flipping(embed_amg, embed_arpack, 1e-5)

    # Check that passing a sparse matrix with `np.int64` indices dtype raises an error
    # or is successful based on the version of SciPy which is installed.
    # Use a CSR matrix to avoid any conversion during the validation
    affinity = affinity.tocsr()
    affinity.indptr = affinity.indptr.astype(np.int64)
    affinity.indices = affinity.indices.astype(np.int64)

    # PR: https://github.com/scipy/scipy/pull/18913
    # First integration in 1.11.3: https://github.com/scipy/scipy/pull/19279
    scipy_graph_traversal_supports_int64_index = sp_version >= parse_version("1.11.3")
    if scipy_graph_traversal_supports_int64_index:
        se_amg.fit_transform(affinity)
    else:
        err_msg = "Only sparse matrices with 32-bit integer indices are accepted"
        with pytest.raises(ValueError, match=err_msg):
            se_amg.fit_transform(affinity)
# TODO: Remove filterwarnings when pyamg does replaces sp.rand call with
# np.random.rand:
# https://github.com/scikit-learn/scikit-learn/issues/15913
@pytest.mark.filterwarnings(
    "ignore:scipy.rand is deprecated:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of np.float
@pytest.mark.filterwarnings(
    "ignore:`np.float` is a deprecated alias:DeprecationWarning:pyamg.*"
)
# TODO: Remove when pyamg removes the use of pinv2
@pytest.mark.filterwarnings(
    "ignore:scipy.linalg.pinv2 is deprecated:DeprecationWarning:pyamg.*"
)
@pytest.mark.skipif(
    not pyamg_available, reason="PyAMG is required for the tests in this function."
)
# TODO: Remove when pyamg removes the use of np.find_common_type
@pytest.mark.filterwarnings(
    "ignore:np.find_common_type is deprecated:DeprecationWarning:pyamg.*"
)
@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_spectral_embedding_amg_solver_failure(dtype, seed=36):
    # Non-regression test for amg solver failure (issue #13393 on github)
    num_nodes = 100
    X = sparse.rand(num_nodes, num_nodes, density=0.1, random_state=seed)
    X = X.astype(dtype)
    # symmetrize while keeping a zero diagonal
    upper = sparse.triu(X) - sparse.diags(X.diagonal())
    sym_matrix = upper + upper.T
    embedding = spectral_embedding(
        sym_matrix, n_components=10, eigen_solver="amg", random_state=0
    )

    # Check that the learned embedding is stable w.r.t. random solver init:
    for i in range(3):
        new_embedding = spectral_embedding(
            sym_matrix, n_components=10, eigen_solver="amg", random_state=i + 1
        )
        _assert_equal_with_sign_flipping(embedding, new_embedding, tol=0.05)
@pytest.mark.filterwarnings("ignore:the behavior of nmi will change in version 0.22")
def test_pipeline_spectral_clustering(seed=36):
    # Test using pipeline to do spectral clustering: KMeans on the spectral
    # embedding must recover the true blob labels.
    random_state = np.random.RandomState(seed)
    se_rbf = SpectralEmbedding(
        n_components=n_clusters, affinity="rbf", random_state=random_state
    )
    se_knn = SpectralEmbedding(
        n_components=n_clusters,
        affinity="nearest_neighbors",
        n_neighbors=5,
        random_state=random_state,
    )
    for se in [se_rbf, se_knn]:
        km = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
        km.fit(se.fit_transform(S))
        assert_array_almost_equal(
            normalized_mutual_info_score(km.labels_, true_labels), 1.0, 2
        )
def test_connectivity(seed=36):
    """Check ``_graph_is_connected`` on dense, CSR and CSC adjacency matrices."""
    # Node 0 has no edge to the rest of the graph.
    disconnected = np.array(
        [
            [1, 0, 0, 0, 0],
            [0, 1, 1, 0, 0],
            [0, 1, 1, 1, 0],
            [0, 0, 1, 1, 1],
            [0, 0, 0, 1, 1],
        ]
    )
    assert not _graph_is_connected(disconnected)
    for container in (*CSR_CONTAINERS, *CSC_CONTAINERS):
        assert not _graph_is_connected(container(disconnected))

    # Adding the 0-1 edge makes the graph a connected chain.
    connected = np.array(
        [
            [1, 1, 0, 0, 0],
            [1, 1, 1, 0, 0],
            [0, 1, 1, 1, 0],
            [0, 0, 1, 1, 1],
            [0, 0, 0, 1, 1],
        ]
    )
    assert _graph_is_connected(connected)
    for container in (*CSR_CONTAINERS, *CSC_CONTAINERS):
        assert _graph_is_connected(container(connected))
def test_spectral_embedding_deterministic():
    # Test that Spectral Embedding is deterministic across repeated calls
    # with identical inputs.
    random_state = np.random.RandomState(36)
    data = random_state.randn(10, 30)
    sims = rbf_kernel(data)
    embedding_1 = spectral_embedding(sims)
    embedding_2 = spectral_embedding(sims)
    assert_array_almost_equal(embedding_1, embedding_2)
def test_spectral_embedding_unnormalized():
    # Test that spectral_embedding is also processing unnormalized laplacian
    # correctly, by comparing against a dense `eigh` reference computation.
    random_state = np.random.RandomState(36)
    data = random_state.randn(10, 30)
    sims = rbf_kernel(data)
    n_components = 8
    embedding_1 = spectral_embedding(
        sims, norm_laplacian=False, n_components=n_components, drop_first=False
    )

    # Verify using manual computation with dense eigh
    laplacian, dd = csgraph_laplacian(sims, normed=False, return_diag=True)
    _, diffusion_map = eigh(laplacian)
    embedding_2 = diffusion_map.T[:n_components]
    # fix the arbitrary eigenvector signs before comparison
    embedding_2 = _deterministic_vector_sign_flip(embedding_2).T

    assert_array_almost_equal(embedding_1, embedding_2)
def test_spectral_embedding_first_eigen_vector():
    # Test that the first eigenvector of spectral_embedding
    # is constant and that the second is not (for a connected graph)
    random_state = np.random.RandomState(36)
    data = random_state.randn(10, 30)
    sims = rbf_kernel(data)
    n_components = 2

    # the property must hold regardless of the solver's random init
    for seed in range(10):
        embedding = spectral_embedding(
            sims,
            norm_laplacian=False,
            n_components=n_components,
            drop_first=False,
            random_state=seed,
        )

        assert np.std(embedding[:, 0]) == pytest.approx(0)
        assert np.std(embedding[:, 1]) > 1e-3
@pytest.mark.parametrize(
    "eigen_solver",
    [
        "arpack",
        "lobpcg",
        pytest.param("amg", marks=skip_if_no_pyamg),
    ],
)
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_spectral_embedding_preserves_dtype(eigen_solver, dtype):
    """Check that `SpectralEmbedding` preserves the dtype of the fitted
    attributes and transformed data.

    Ideally, this test should be covered by the common test
    `check_transformer_preserve_dtypes`. However, this test only runs
    with transformers implementing `transform` while `SpectralEmbedding`
    implements only `fit_transform`.
    """
    X = S.astype(dtype)
    se = SpectralEmbedding(
        n_components=2, affinity="rbf", eigen_solver=eigen_solver, random_state=0
    )
    X_trans = se.fit_transform(X)

    assert X_trans.dtype == dtype
    assert se.embedding_.dtype == dtype
    assert se.affinity_matrix_.dtype == dtype
@pytest.mark.skipif(
    pyamg_available,
    reason="PyAMG is installed and we should not test for an error.",
)
def test_error_pyamg_not_available():
    # Requesting the "amg" solver without pyamg installed must raise a
    # ValueError with an informative message.
    estimator = SpectralEmbedding(
        n_components=2,
        affinity="rbf",
        eigen_solver="amg",
    )
    err_msg = "The eigen_solver was set to 'amg', but pyamg is not available."
    with pytest.raises(ValueError, match=err_msg):
        estimator.fit_transform(S)
# TODO: Remove when pyamg removes the use of np.find_common_type
@pytest.mark.filterwarnings(
    "ignore:np.find_common_type is deprecated:DeprecationWarning:pyamg.*"
)
@pytest.mark.parametrize("solver", ["arpack", "amg", "lobpcg"])
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_spectral_eigen_tol_auto(monkeypatch, solver, csr_container):
    """Test that `eigen_tol="auto"` is resolved correctly"""
    if solver == "amg" and not pyamg_available:
        pytest.skip("PyAMG is not available.")

    X, _ = make_blobs(
        n_samples=200, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
    )
    distances = pairwise_distances(X)  # Distance matrix
    similarities = np.max(distances) - distances  # Similarity matrix

    # "auto" resolves to tol=0 for ARPACK, and tol=None for LOBPCG/AMG.
    if solver == "arpack":
        underlying_solver, expected_tol = eigsh, 0
    else:
        underlying_solver, expected_tol = lobpcg, None
    if solver == "amg":
        similarities = csr_container(similarities)

    # Wrap the underlying eigensolver so we can inspect the tol it receives.
    spy = Mock(side_effect=underlying_solver)
    monkeypatch.setattr(_spectral_embedding, underlying_solver.__qualname__, spy)

    spectral_embedding(
        similarities, random_state=42, eigen_solver=solver, eigen_tol="auto"
    )

    spy.assert_called()
    _, kwargs = spy.call_args
    assert kwargs["tol"] == expected_tol
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user