library packages
This commit is contained in:
@@ -0,0 +1,51 @@
|
||||
"""Module to give helpful messages to the user that did not
|
||||
compile scikit-learn properly.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
INPLACE_MSG = """
It appears that you are importing a local scikit-learn source tree. For
this, you need to have an inplace install. Maybe you are in the source
directory and you need to try from another location."""

STANDARD_MSG = """
If you have used an installer, please check that it is suited for your
Python version, your operating system and your platform."""


def raise_build_error(e):
    """Raise a comprehensible ImportError for a broken build.

    The error message embeds the original exception *e*, lists the contents
    of the package directory (to help debugging on the mailing list), and
    appends a hint appropriate to either an in-place source checkout or an
    installed package.
    """
    local_dir = os.path.split(__file__)[0]
    # A source-tree import is recognised purely by the relative path name;
    # anything else gets the generic "check your installer" hint.
    if local_dir == "sklearn/__check_build":
        msg = INPLACE_MSG
    else:
        msg = STANDARD_MSG
    # Lay the directory listing out three entries per line.
    dir_content = []
    for position, entry in enumerate(os.listdir(local_dir), start=1):
        if position % 3 == 0:
            dir_content.append(entry + "\n")
        else:
            dir_content.append(entry.ljust(26))
    raise ImportError(
        """%s
___________________________________________________________________________
Contents of %s:
%s
___________________________________________________________________________
It seems that scikit-learn has not been built correctly.

If you have installed scikit-learn from source, please do not forget
to build the package before using it: run `python setup.py install` or
`make` in the source directory.
%s"""
        % (e, local_dir, "".join(dir_content).strip(), msg)
    )
|
||||
|
||||
|
||||
try:
|
||||
from ._check_build import check_build # noqa
|
||||
except ImportError as e:
|
||||
raise_build_error(e)
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,2 @@
|
||||
def check_build():
    """No-op stand-in for the compiled ``_check_build`` extension."""
    return
|
||||
@@ -0,0 +1,7 @@
|
||||
py.extension_module(
|
||||
'_check_build',
|
||||
'_check_build.pyx',
|
||||
cython_args: cython_args,
|
||||
install: true,
|
||||
subdir: 'sklearn/__check_build',
|
||||
)
|
||||
154
.venv/lib/python3.12/site-packages/sklearn/__init__.py
Normal file
154
.venv/lib/python3.12/site-packages/sklearn/__init__.py
Normal file
@@ -0,0 +1,154 @@
|
||||
"""Configure global settings and get information about the working environment."""
|
||||
|
||||
# Machine learning module for Python
|
||||
# ==================================
|
||||
#
|
||||
# sklearn is a Python module integrating classical machine
|
||||
# learning algorithms in the tightly-knit world of scientific Python
|
||||
# packages (numpy, scipy, matplotlib).
|
||||
#
|
||||
# It aims to provide simple and efficient solutions to learning problems
|
||||
# that are accessible to everybody and reusable in various contexts:
|
||||
# machine-learning as a versatile tool for science and engineering.
|
||||
#
|
||||
# See https://scikit-learn.org for complete documentation.
|
||||
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
|
||||
from ._config import config_context, get_config, set_config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# PEP0440 compatible formatted version, see:
|
||||
# https://www.python.org/dev/peps/pep-0440/
|
||||
#
|
||||
# Generic release markers:
|
||||
# X.Y.0 # For first release after an increment in Y
|
||||
# X.Y.Z # For bugfix releases
|
||||
#
|
||||
# Admissible pre-release markers:
|
||||
# X.Y.ZaN # Alpha release
|
||||
# X.Y.ZbN # Beta release
|
||||
# X.Y.ZrcN # Release Candidate
|
||||
# X.Y.Z # Final release
|
||||
#
|
||||
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
|
||||
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
|
||||
#
|
||||
__version__ = "1.5.2"


# On OSX, we can get a runtime error due to multiple OpenMP libraries loaded
# simultaneously. This can happen for instance when calling BLAS inside a
# prange. Setting the following environment variable allows multiple OpenMP
# libraries to be loaded. It should not degrade performances since we manually
# take care of potential over-subcription performance issues, in sections of
# the code where nested OpenMP loops can happen, by dynamically reconfiguring
# the inner OpenMP runtime to temporarily disable it while under the scope of
# the outer OpenMP parallel section.
os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "True")

# Workaround issue discovered in intel-openmp 2019.5:
# https://github.com/ContinuumIO/anaconda-issues/issues/11294
os.environ.setdefault("KMP_INIT_AT_FORK", "FALSE")

try:
    # This variable is injected in the __builtins__ by the build
    # process. It is used to enable importing subpackages of sklearn when
    # the binaries are not built
    # mypy error: Cannot determine type of '__SKLEARN_SETUP__'
    __SKLEARN_SETUP__  # type: ignore
except NameError:
    __SKLEARN_SETUP__ = False

if __SKLEARN_SETUP__:
    sys.stderr.write("Partial import of sklearn during the build process.\n")
    # We are not importing the rest of scikit-learn during the build
    # process, as it may not be compiled yet
else:
    # `_distributor_init` allows distributors to run custom init code.
    # For instance, for the Windows wheel, this is used to pre-load the
    # vcomp shared library runtime for OpenMP embedded in the sklearn/.libs
    # sub-folder.
    # It is necessary to do this prior to importing show_versions as the
    # later is linked to the OpenMP runtime to make it possible to introspect
    # it and importing it first would fail if the OpenMP dll cannot be found.
    from . import (
        __check_build,  # noqa: F401
        _distributor_init,  # noqa: F401
    )
    from .base import clone
    from .utils._show_versions import show_versions

    # Public API: the sklearn subpackages plus a few top-level callables.
    __all__ = [
        "calibration",
        "cluster",
        "covariance",
        "cross_decomposition",
        "datasets",
        "decomposition",
        "dummy",
        "ensemble",
        "exceptions",
        "experimental",
        "externals",
        "feature_extraction",
        "feature_selection",
        "gaussian_process",
        "inspection",
        "isotonic",
        "kernel_approximation",
        "kernel_ridge",
        "linear_model",
        "manifold",
        "metrics",
        "mixture",
        "model_selection",
        "multiclass",
        "multioutput",
        "naive_bayes",
        "neighbors",
        "neural_network",
        "pipeline",
        "preprocessing",
        "random_projection",
        "semi_supervised",
        "svm",
        "tree",
        "discriminant_analysis",
        "impute",
        "compose",
        # Non-modules:
        "clone",
        "get_config",
        "set_config",
        "config_context",
        "show_versions",
    ]
|
||||
|
||||
_BUILT_WITH_MESON = False
|
||||
try:
|
||||
import sklearn._built_with_meson # noqa: F401
|
||||
|
||||
_BUILT_WITH_MESON = True
|
||||
except ModuleNotFoundError:
|
||||
pass
|
||||
|
||||
|
||||
def setup_module(module):
    """Fixture for the tests to assure globally controllable seeding of RNGs"""

    import numpy as np

    # Prefer an explicit seed from the environment so test runs can be
    # reproduced; otherwise draw a fresh 31-bit seed.
    seed_value = os.environ.get("SKLEARN_SEED", None)
    if seed_value is None:
        seed_value = np.random.uniform() * np.iinfo(np.int32).max
    seed_value = int(seed_value)
    print("I: Seeding RNGs with %r" % seed_value)
    # Seed both NumPy's and the stdlib's global generators.
    np.random.seed(seed_value)
    random.seed(seed_value)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,116 @@
|
||||
"""
|
||||
Utilities useful during the build.
|
||||
"""
|
||||
|
||||
# author: Andy Mueller, Gael Varoquaux
|
||||
# license: BSD
|
||||
|
||||
|
||||
import contextlib
|
||||
import os
|
||||
|
||||
import sklearn
|
||||
|
||||
from .._min_dependencies import CYTHON_MIN_VERSION
|
||||
from ..externals._packaging.version import parse
|
||||
from .openmp_helpers import check_openmp_support
|
||||
from .pre_build_helpers import basic_check_build
|
||||
|
||||
DEFAULT_ROOT = "sklearn"
|
||||
|
||||
|
||||
def _check_cython_version():
    """Verify that Cython is installed and recent enough to build sklearn.

    Raises ``ModuleNotFoundError`` when Cython is absent and ``ValueError``
    when the installed version is older than ``CYTHON_MIN_VERSION``.
    """
    required_msg = (
        "Please install Cython with a version >= {0} in order "
        "to build a scikit-learn from source."
    ).format(CYTHON_MIN_VERSION)
    try:
        import Cython
    except ModuleNotFoundError as e:
        # Re-raise with more informative error message instead:
        raise ModuleNotFoundError(required_msg) from e

    if parse(Cython.__version__) < parse(CYTHON_MIN_VERSION):
        required_msg += " The current version of Cython is {} installed in {}.".format(
            Cython.__version__, Cython.__path__
        )
        raise ValueError(required_msg)
|
||||
|
||||
|
||||
def cythonize_extensions(extension):
    """Check that a recent Cython is available and cythonize extensions.

    Parameters
    ----------
    extension : list
        Extension modules, passed through to ``Cython.Build.cythonize``.

    Returns
    -------
    list
        The cythonized extension modules.
    """
    _check_cython_version()
    from Cython.Build import cythonize

    # Fast fail before cythonization if compiler fails compiling basic test
    # code even without OpenMP
    basic_check_build()

    # check simple compilation with OpenMP. If it fails scikit-learn will be
    # built without OpenMP and the test test_openmp_supported in the test suite
    # will fail.
    # `check_openmp_support` compiles a small test program to see if the
    # compilers are properly configured to build with OpenMP. This is expensive
    # and we only want to call this function once.
    # The result of this check is cached as a private attribute on the sklearn
    # module (only at build-time) to be used in the build_ext subclass defined
    # in the top-level setup.py file to actually build the compiled extensions
    # with OpenMP flags if needed.
    sklearn._OPENMP_SUPPORTED = check_openmp_support()

    # Parallelize cythonization over all cores when joblib is available.
    n_jobs = 1
    with contextlib.suppress(ImportError):
        import joblib

        n_jobs = joblib.cpu_count()

    # Additional checks for Cython
    cython_enable_debug_directives = (
        os.environ.get("SKLEARN_ENABLE_DEBUG_CYTHON_DIRECTIVES", "0") != "0"
    )

    compiler_directives = {
        "language_level": 3,
        # boundscheck is only enabled for debug builds (env-var controlled).
        "boundscheck": cython_enable_debug_directives,
        "wraparound": False,
        "initializedcheck": False,
        "nonecheck": False,
        "cdivision": True,
        "profile": False,
    }

    return cythonize(
        extension,
        nthreads=n_jobs,
        compiler_directives=compiler_directives,
        annotate=False,
    )
|
||||
|
||||
|
||||
def gen_from_templates(templates):
    """Generate cython files from a list of templates"""
    # Lazy import because cython is not a runtime dependency.
    from Cython import Tempita

    for template in templates:
        outfile = template.replace(".tp", "")

        # Regenerate only when the output is missing or older than its
        # template.
        is_stale = not (
            os.path.exists(outfile)
            and os.stat(template).st_mtime < os.stat(outfile).st_mtime
        )
        if is_stale:
            with open(template, "r") as f:
                raw_template = f.read()

            rendered = Tempita.sub(raw_template)

            warn_msg = (
                "# WARNING: Do not edit this file directly.\n"
                f"# It is automatically generated from {template!r}.\n"
                "# Changes must be made there.\n\n"
            )

            with open(outfile, "w") as f:
                f.write(warn_msg)
                f.write(rendered)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,127 @@
|
||||
"""Helpers for OpenMP support during the build."""
|
||||
|
||||
# This code is adapted for a large part from the astropy openmp helpers, which
|
||||
# can be found at: https://github.com/astropy/extension-helpers/blob/master/extension_helpers/_openmp_helpers.py # noqa
|
||||
|
||||
|
||||
import os
|
||||
import sys
|
||||
import textwrap
|
||||
import warnings
|
||||
|
||||
from .pre_build_helpers import compile_test_program
|
||||
|
||||
|
||||
def get_openmp_flag():
    """Return the compiler argument list that enables OpenMP on this platform."""
    if sys.platform == "win32":
        # MSVC spelling of the OpenMP switch.
        return ["/openmp"]
    if sys.platform == "darwin" and "openmp" in os.getenv("CPPFLAGS", ""):
        # -fopenmp can't be passed as compile flag when using Apple-clang.
        # OpenMP support has to be enabled during preprocessing.
        #
        # For example, our macOS wheel build jobs use the following environment
        # variables to build with Apple-clang and the brew installed "libomp":
        #
        # export CPPFLAGS="$CPPFLAGS -Xpreprocessor -fopenmp"
        # export CFLAGS="$CFLAGS -I/usr/local/opt/libomp/include"
        # export CXXFLAGS="$CXXFLAGS -I/usr/local/opt/libomp/include"
        # export LDFLAGS="$LDFLAGS -Wl,-rpath,/usr/local/opt/libomp/lib
        #                          -L/usr/local/opt/libomp/lib -lomp"
        return []
    # Default flag for GCC and clang:
    return ["-fopenmp"]
|
||||
|
||||
|
||||
def check_openmp_support():
    """Check whether OpenMP test code can be compiled and run.

    Returns True when a small OpenMP C program compiles, runs, and reports a
    consistent thread count; False otherwise (with a warning), unless the
    SKLEARN_FAIL_NO_OPENMP environment variable is set, in which case an
    exception is raised instead.
    """
    if "PYODIDE" in os.environ:
        # Pyodide doesn't support OpenMP
        return False

    # One "nthreads=..." line is printed per OpenMP thread; support is
    # confirmed below by matching line count against the reported count.
    code = textwrap.dedent(
        """\
        #include <omp.h>
        #include <stdio.h>
        int main(void) {
        #pragma omp parallel
        printf("nthreads=%d\\n", omp_get_num_threads());
        return 0;
        }
        """
    )

    extra_preargs = os.getenv("LDFLAGS", None)
    if extra_preargs is not None:
        extra_preargs = extra_preargs.strip().split(" ")
        # FIXME: temporary fix to link against system libraries on linux
        # "-Wl,--sysroot=/" should be removed
        extra_preargs = [
            flag
            for flag in extra_preargs
            if flag.startswith(("-L", "-Wl,-rpath", "-l", "-Wl,--sysroot=/"))
        ]

    extra_postargs = get_openmp_flag()

    openmp_exception = None
    try:
        output = compile_test_program(
            code, extra_preargs=extra_preargs, extra_postargs=extra_postargs
        )

        if output and "nthreads=" in output[0]:
            nthreads = int(output[0].strip().split("=")[1])
            # Every thread printed one line, so the counts must agree.
            openmp_supported = len(output) == nthreads
        elif "PYTHON_CROSSENV" in os.environ:
            # Since we can't run the test program when cross-compiling
            # assume that openmp is supported if the program can be
            # compiled.
            openmp_supported = True
        else:
            openmp_supported = False

    except Exception as exception:
        # We could be more specific and only catch: CompileError, LinkError,
        # and subprocess.CalledProcessError.
        # setuptools introduced CompileError and LinkError, but that requires
        # version 61.1. Even the latest version of Ubuntu (22.04LTS) only
        # ships with 59.6. So for now we catch all exceptions and reraise a
        # generic exception with the original error message instead:
        openmp_supported = False
        openmp_exception = exception

    if not openmp_supported:
        if os.getenv("SKLEARN_FAIL_NO_OPENMP"):
            raise Exception(
                "Failed to build scikit-learn with OpenMP support"
            ) from openmp_exception
        else:
            message = textwrap.dedent(
                """

                ***********
                * WARNING *
                ***********

                It seems that scikit-learn cannot be built with OpenMP.

                - Make sure you have followed the installation instructions:

                https://scikit-learn.org/dev/developers/advanced_installation.html

                - If your compiler supports OpenMP but you still see this
                message, please submit a bug report at:

                https://github.com/scikit-learn/scikit-learn/issues

                - The build will continue with OpenMP-based parallelism
                disabled. Note however that some estimators will run in
                sequential mode instead of leveraging thread-based
                parallelism.

                ***
                """
            )
            warnings.warn(message)

    return openmp_supported
|
||||
@@ -0,0 +1,75 @@
|
||||
"""Helpers to check build environment before actual build of scikit-learn"""
|
||||
|
||||
import glob
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import textwrap
|
||||
|
||||
from setuptools.command.build_ext import customize_compiler, new_compiler
|
||||
|
||||
|
||||
def compile_test_program(code, extra_preargs=None, extra_postargs=None):
    """Check that some C code can be compiled and run.

    Parameters
    ----------
    code : str
        Source of a standalone C test program.
    extra_preargs : list of str, default=None
        Extra arguments passed at the front of the link command line.
    extra_postargs : list of str, default=None
        Extra arguments appended to both the compile and link command lines.

    Returns
    -------
    output : list of str
        Lines printed by the test program, or an empty list when
        cross-compiling (the program cannot be executed in that case).

    Raises
    ------
    subprocess.CalledProcessError
        If the compiled program exits with a non-zero return code.
    """
    ccompiler = new_compiler()
    customize_compiler(ccompiler)

    start_dir = os.path.abspath(".")

    with tempfile.TemporaryDirectory() as tmp_dir:
        # NOTE: the original wrapped this in `except Exception: raise`, a
        # no-op re-raise; a plain try/finally expresses the same intent
        # (always restore the working directory) without the dead clause.
        try:
            os.chdir(tmp_dir)

            # Write test program
            with open("test_program.c", "w") as f:
                f.write(code)

            os.mkdir("objects")

            # Compile, test program
            ccompiler.compile(
                ["test_program.c"], output_dir="objects", extra_postargs=extra_postargs
            )

            # Link test program
            objects = glob.glob(os.path.join("objects", "*" + ccompiler.obj_extension))
            ccompiler.link_executable(
                objects,
                "test_program",
                extra_preargs=extra_preargs,
                extra_postargs=extra_postargs,
            )

            if "PYTHON_CROSSENV" not in os.environ:
                # Run test program if not cross compiling
                # will raise a CalledProcessError if return code was non-zero
                output = subprocess.check_output("./test_program")
                output = output.decode(sys.stdout.encoding or "utf-8").splitlines()
            else:
                # Return an empty output if we are cross compiling
                # as we cannot run the test_program
                output = []
        finally:
            os.chdir(start_dir)

    return output
|
||||
|
||||
|
||||
def basic_check_build():
    """Check basic compilation and linking of C code"""
    if "PYODIDE" in os.environ:
        # The following check won't work in pyodide
        return

    # The smallest possible C program: if even this fails to build, the
    # toolchain is misconfigured.
    trivial_program = textwrap.dedent(
        """\
        #include <stdio.h>
        int main(void) {
        return 0;
        }
        """
    )
    compile_test_program(trivial_program)
|
||||
@@ -0,0 +1,57 @@
|
||||
import argparse
|
||||
import os
|
||||
|
||||
from Cython import Tempita as tempita
|
||||
|
||||
# XXX: If this import ever fails (does it really?), vendor either
|
||||
# cython.tempita or numpy/npy_tempita.
|
||||
|
||||
|
||||
def process_tempita(fromfile, outfile=None):
    """Process tempita templated file and write out the result.

    The template file is expected to end in `.c.tp` or `.pyx.tp`:
    E.g. processing `template.c.tp` generates `template.c`.

    Parameters
    ----------
    fromfile : str
        Path to the `.tp` template file.
    outfile : str
        Path of the rendered output file. Despite the `None` default this
        argument is effectively required: passing `None` makes `open` raise
        a `TypeError`.
    """
    # (Fixed docstring: the example previously said `template.c.in`, which
    # contradicts the `.tp` convention used here and enforced by `main`.)
    with open(fromfile, "r", encoding="utf-8") as f:
        template_content = f.read()

    template = tempita.Template(template_content)
    content = template.substitute()

    with open(outfile, "w", encoding="utf-8") as f:
        f.write(content)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: render one `.tp` template into `--outdir`."""
    parser = argparse.ArgumentParser()
    parser.add_argument("infile", type=str, help="Path to the input file")
    parser.add_argument("-o", "--outdir", type=str, help="Path to the output directory")
    parser.add_argument(
        "-i",
        "--ignore",
        type=str,
        help=(
            "An ignored input - may be useful to add a "
            "dependency between custom targets"
        ),
    )
    args = parser.parse_args()

    # Validate inputs before touching the filesystem.
    if not args.infile.endswith(".tp"):
        raise ValueError(f"Unexpected extension: {args.infile}")
    if not args.outdir:
        raise ValueError("Missing `--outdir` argument to tempita.py")

    # Output name is the template basename with the trailing `.tp` stripped,
    # placed inside the (cwd-relative) output directory.
    outdir_abs = os.path.join(os.getcwd(), args.outdir)
    template_name = os.path.split(args.infile)[1]
    outfile = os.path.join(outdir_abs, os.path.splitext(template_name)[0])

    process_tempita(args.infile, outfile)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,13 @@
|
||||
#!/usr/bin/env python3
"""Extract version number from __init__.py"""

import os

sklearn_init = os.path.join(os.path.dirname(__file__), "../__init__.py")

# Use a context manager so the file handle is closed deterministically
# (the original `open(...).readlines()` leaked the handle until interpreter
# exit). Iterating the file lazily also avoids materializing every line.
with open(sklearn_init) as init_file:
    version_line = next(line for line in init_file if line.startswith("__version__"))

# `__version__ = "1.5.2"` -> `1.5.2` (strip either quoting style).
version = version_line.strip().split(" = ")[1].replace('"', "").replace("'", "")

print(version)
|
||||
373
.venv/lib/python3.12/site-packages/sklearn/_config.py
Normal file
373
.venv/lib/python3.12/site-packages/sklearn/_config.py
Normal file
@@ -0,0 +1,373 @@
|
||||
"""Global configuration state and functions for management"""
|
||||
|
||||
import os
|
||||
import threading
|
||||
from contextlib import contextmanager as contextmanager
|
||||
|
||||
_global_config = {
|
||||
"assume_finite": bool(os.environ.get("SKLEARN_ASSUME_FINITE", False)),
|
||||
"working_memory": int(os.environ.get("SKLEARN_WORKING_MEMORY", 1024)),
|
||||
"print_changed_only": True,
|
||||
"display": "diagram",
|
||||
"pairwise_dist_chunk_size": int(
|
||||
os.environ.get("SKLEARN_PAIRWISE_DIST_CHUNK_SIZE", 256)
|
||||
),
|
||||
"enable_cython_pairwise_dist": True,
|
||||
"array_api_dispatch": False,
|
||||
"transform_output": "default",
|
||||
"enable_metadata_routing": False,
|
||||
"skip_parameter_validation": False,
|
||||
}
|
||||
_threadlocal = threading.local()
|
||||
|
||||
|
||||
def _get_threadlocal_config():
    """Return this thread's **mutable** configuration dict.

    On first access from a given thread, the dict is seeded with a copy of
    the module-level defaults.
    """
    config = getattr(_threadlocal, "global_config", None)
    if config is None:
        config = _global_config.copy()
        _threadlocal.global_config = config
    return config
|
||||
|
||||
|
||||
def get_config():
    """Retrieve current values for configuration set by :func:`set_config`.

    Returns
    -------
    config : dict
        Keys are parameter names that can be passed to :func:`set_config`.

    See Also
    --------
    config_context : Context manager for global scikit-learn configuration.
    set_config : Set global scikit-learn configuration.

    Examples
    --------
    >>> import sklearn
    >>> config = sklearn.get_config()
    >>> config.keys()
    dict_keys([...])
    """
    # Hand back a shallow copy so callers cannot mutate the live
    # thread-local configuration through the returned mapping.
    return dict(_get_threadlocal_config())
|
||||
|
||||
|
||||
def set_config(
    assume_finite=None,
    working_memory=None,
    print_changed_only=None,
    display=None,
    pairwise_dist_chunk_size=None,
    enable_cython_pairwise_dist=None,
    array_api_dispatch=None,
    transform_output=None,
    enable_metadata_routing=None,
    skip_parameter_validation=None,
):
    """Set global scikit-learn configuration.

    For every parameter, ``None`` (the default) means "leave the current
    value unchanged".

    .. versionadded:: 0.19

    Parameters
    ----------
    assume_finite : bool, default=None
        If True, validation for finiteness will be skipped, saving time but
        leading to potential crashes. If False, validation for finiteness
        will be performed, avoiding error. Global default: False.

        .. versionadded:: 0.19

    working_memory : int, default=None
        If set, scikit-learn will attempt to limit the size of temporary
        arrays to this number of MiB (per job when parallelised), often
        saving both computation time and memory on expensive operations
        that can be performed in chunks. Global default: 1024.

        .. versionadded:: 0.20

    print_changed_only : bool, default=None
        If True, only the parameters that were set to non-default values
        will be printed when printing an estimator. For example,
        ``print(SVC())`` while True will only print 'SVC()' while the
        default behaviour would be to print
        'SVC(C=1.0, cache_size=200, ...)' with all the non-changed
        parameters.

        .. versionadded:: 0.21

    display : {'text', 'diagram'}, default=None
        If 'diagram', estimators will be displayed as a diagram in a
        Jupyter lab or notebook context. If 'text', estimators will be
        displayed as text. Default is 'diagram'.

        .. versionadded:: 0.23

    pairwise_dist_chunk_size : int, default=None
        The number of row vectors per chunk for the accelerated pairwise-
        distances reduction backend. Default is 256. Intended for easier
        benchmarking and testing of scikit-learn internals.

        .. versionadded:: 1.1

    enable_cython_pairwise_dist : bool, default=None
        Use the accelerated pairwise-distances reduction backend when
        possible. Global default: True. Intended for easier benchmarking
        and testing of scikit-learn internals.

        .. versionadded:: 1.1

    array_api_dispatch : bool, default=None
        Use Array API dispatching when inputs follow the Array API
        standard. Default is False. See the
        :ref:`User Guide <array_api>` for more details.

        .. versionadded:: 1.2

    transform_output : str, default=None
        Configure output of `transform` and `fit_transform`:
        `"default"`, `"pandas"`, `"polars"`, or `None` (unchanged).

        .. versionadded:: 1.2
        .. versionadded:: 1.4
            `"polars"` option was added.

    enable_metadata_routing : bool, default=None
        Enable metadata routing. By default this feature is disabled.
        Refer to :ref:`metadata routing user guide <metadata_routing>`.

        .. versionadded:: 1.3

    skip_parameter_validation : bool, default=None
        If `True`, disable the validation of the hyper-parameters' types
        and values in the fit method of estimators and for arguments
        passed to public helper functions. Note that for data parameters,
        such as `X` and `y`, only type validation is skipped but
        validation with `check_array` will continue to run.

        .. versionadded:: 1.3

    See Also
    --------
    config_context : Context manager for global scikit-learn configuration.
    get_config : Retrieve current values of the global configuration.

    Examples
    --------
    >>> from sklearn import set_config
    >>> set_config(display='diagram')  # doctest: +SKIP
    """
    local_config = _get_threadlocal_config()

    # Apply settings in signature order (matching the original if-chain) so
    # that a validation failure for `array_api_dispatch` leaves the earlier
    # settings already applied, exactly as before.
    for name, value in (
        ("assume_finite", assume_finite),
        ("working_memory", working_memory),
        ("print_changed_only", print_changed_only),
        ("display", display),
        ("pairwise_dist_chunk_size", pairwise_dist_chunk_size),
        ("enable_cython_pairwise_dist", enable_cython_pairwise_dist),
        ("array_api_dispatch", array_api_dispatch),
        ("transform_output", transform_output),
        ("enable_metadata_routing", enable_metadata_routing),
        ("skip_parameter_validation", skip_parameter_validation),
    ):
        if value is None:
            continue
        if name == "array_api_dispatch":
            # Validate the environment before enabling Array API dispatch.
            from .utils._array_api import _check_array_api_dispatch

            _check_array_api_dispatch(value)
        local_config[name] = value
|
||||
|
||||
|
||||
@contextmanager
def config_context(
    *,
    assume_finite=None,
    working_memory=None,
    print_changed_only=None,
    display=None,
    pairwise_dist_chunk_size=None,
    enable_cython_pairwise_dist=None,
    array_api_dispatch=None,
    transform_output=None,
    enable_metadata_routing=None,
    skip_parameter_validation=None,
):
    """Context manager for global scikit-learn configuration.

    Any parameter left as `None` keeps its current global value. On exit,
    *all* settings (not only those modified here) are restored to the
    values they had on entry.

    Parameters
    ----------
    assume_finite : bool, default=None
        If True, validation for finiteness will be skipped, saving time but
        leading to potential crashes. If False, validation for finiteness
        will be performed, avoiding error. If None, the existing value
        won't change. The default value is False.

    working_memory : int, default=None
        If set, scikit-learn will attempt to limit the size of temporary
        arrays to this number of MiB (per job when parallelised), often
        saving both computation time and memory on expensive operations
        that can be performed in chunks. If None, the existing value won't
        change. The default value is 1024.

    print_changed_only : bool, default=None
        If True, only the parameters that were set to non-default values
        will be printed when printing an estimator. For example,
        ``print(SVC())`` while True will only print 'SVC()', but would
        print 'SVC(C=1.0, cache_size=200, ...)' with all the non-changed
        parameters when False. If None, the existing value won't change.
        The default value is True.

        .. versionchanged:: 0.23
           Default changed from False to True.

    display : {'text', 'diagram'}, default=None
        If 'diagram', estimators will be displayed as a diagram in a
        Jupyter lab or notebook context. If 'text', estimators will be
        displayed as text. If None, the existing value won't change.
        The default value is 'diagram'.

        .. versionadded:: 0.23

    pairwise_dist_chunk_size : int, default=None
        The number of row vectors per chunk for the accelerated pairwise-
        distances reduction backend. Default is 256 (suitable for most of
        modern laptops' caches and architectures).

        Intended for easier benchmarking and testing of scikit-learn
        internals. End users are not expected to benefit from customizing
        this configuration setting.

        .. versionadded:: 1.1

    enable_cython_pairwise_dist : bool, default=None
        Use the accelerated pairwise-distances reduction backend when
        possible. Global default: True.

        Intended for easier benchmarking and testing of scikit-learn
        internals. End users are not expected to benefit from customizing
        this configuration setting.

        .. versionadded:: 1.1

    array_api_dispatch : bool, default=None
        Use Array API dispatching when inputs follow the Array API
        standard. Default is False.

        See the :ref:`User Guide <array_api>` for more details.

        .. versionadded:: 1.2

    transform_output : str, default=None
        Configure output of `transform` and `fit_transform`.

        See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py`
        for an example on how to use the API.

        - `"default"`: Default output format of a transformer
        - `"pandas"`: DataFrame output
        - `"polars"`: Polars output
        - `None`: Transform configuration is unchanged

        .. versionadded:: 1.2
        .. versionadded:: 1.4
            `"polars"` option was added.

    enable_metadata_routing : bool, default=None
        Enable metadata routing. By default this feature is disabled.

        Refer to :ref:`metadata routing user guide <metadata_routing>` for
        more details.

        - `True`: Metadata routing is enabled
        - `False`: Metadata routing is disabled, use the old syntax.
        - `None`: Configuration is unchanged

        .. versionadded:: 1.3

    skip_parameter_validation : bool, default=None
        If `True`, disable the validation of the hyper-parameters' types
        and values in the fit method of estimators and for arguments
        passed to public helper functions. It can save time in some
        situations but can lead to low level crashes and exceptions with
        confusing error messages.

        Note that for data parameters, such as `X` and `y`, only type
        validation is skipped but validation with `check_array` will
        continue to run.

        .. versionadded:: 1.3

    Yields
    ------
    None.

    See Also
    --------
    set_config : Set global scikit-learn configuration.
    get_config : Retrieve current values of the global configuration.

    Notes
    -----
    All settings, not just those presently modified, will be returned to
    their previous values when the context manager is exited.

    Examples
    --------
    >>> import sklearn
    >>> from sklearn.utils.validation import assert_all_finite
    >>> with sklearn.config_context(assume_finite=True):
    ...     assert_all_finite([float('nan')])
    >>> with sklearn.config_context(assume_finite=True):
    ...     with sklearn.config_context(assume_finite=False):
    ...         assert_all_finite([float('nan')])
    Traceback (most recent call last):
    ...
    ValueError: Input contains NaN...
    """
    # Snapshot the full configuration before applying any change so the
    # `finally` clause can restore everything, including settings this
    # context did not touch.
    previous_config = get_config()
    requested = dict(
        assume_finite=assume_finite,
        working_memory=working_memory,
        print_changed_only=print_changed_only,
        display=display,
        pairwise_dist_chunk_size=pairwise_dist_chunk_size,
        enable_cython_pairwise_dist=enable_cython_pairwise_dist,
        array_api_dispatch=array_api_dispatch,
        transform_output=transform_output,
        enable_metadata_routing=enable_metadata_routing,
        skip_parameter_validation=skip_parameter_validation,
    )
    # set_config treats None values as "leave unchanged".
    set_config(**requested)

    try:
        yield
    finally:
        # Restore every setting captured on entry.
        set_config(**previous_config)
|
||||
@@ -0,0 +1,10 @@
|
||||
"""Distributor init file
|
||||
|
||||
Distributors: you can add custom code here to support particular distributions
|
||||
of scikit-learn.
|
||||
|
||||
For example, this is a good place to put any checks for hardware requirements.
|
||||
|
||||
The scikit-learn standard source distribution will not put code in this file,
|
||||
so you can safely replace this file with your own version.
|
||||
"""
|
||||
Binary file not shown.
115
.venv/lib/python3.12/site-packages/sklearn/_isotonic.pyx
Normal file
115
.venv/lib/python3.12/site-packages/sklearn/_isotonic.pyx
Normal file
@@ -0,0 +1,115 @@
|
||||
# Author: Nelle Varoquaux, Andrew Tulloch, Antony Lee
|
||||
|
||||
# Uses the pool adjacent violators algorithm (PAVA), with the
|
||||
# enhancement of searching for the longest decreasing subsequence to
|
||||
# pool at each step.
|
||||
|
||||
import numpy as np
|
||||
from cython cimport floating
|
||||
|
||||
|
||||
def _inplace_contiguous_isotonic_regression(floating[::1] y, floating[::1] w):
    # Weighted isotonic regression performed in place on the contiguous
    # arrays y (targets) and w (weights), using the pool adjacent violators
    # algorithm (PAVA) with the enhancement of pooling the longest
    # decreasing subsequence at each step.
    cdef:
        Py_ssize_t n = y.shape[0], i, k
        floating prev_y, sum_wy, sum_w
        Py_ssize_t[::1] target = np.arange(n, dtype=np.intp)

    # target describes a list of blocks. At any time, if [i..j] (inclusive) is
    # an active block, then target[i] := j and target[j] := i.

    # For "active" indices (block starts):
    # w[i] := sum{w_orig[j], j=[i..target[i]]}
    # y[i] := sum{y_orig[j]*w_orig[j], j=[i..target[i]]} / w[i]

    with nogil:
        i = 0
        while i < n:
            k = target[i] + 1
            if k == n:
                break
            if y[i] < y[k]:
                # Adjacent blocks are already increasing: advance.
                i = k
                continue
            sum_wy = w[i] * y[i]
            sum_w = w[i]
            while True:
                # We are within a decreasing subsequence.
                prev_y = y[k]
                sum_wy += w[k] * y[k]
                sum_w += w[k]
                k = target[k] + 1
                if k == n or prev_y < y[k]:
                    # Non-singleton decreasing subsequence is finished,
                    # update first entry.
                    y[i] = sum_wy / sum_w
                    w[i] = sum_w
                    target[i] = k - 1
                    target[k - 1] = i
                    if i > 0:
                        # Backtrack if we can. This makes the algorithm
                        # single-pass and ensures O(n) complexity.
                        i = target[i - 1]
                    # Otherwise, restart from the same point.
                    break
        # Reconstruct the solution: broadcast each block's pooled value
        # over the block's full index range.
        i = 0
        while i < n:
            k = target[i] + 1
            y[i + 1 : k] = y[i]
            i = k
|
||||
|
||||
|
||||
def _make_unique(const floating[::1] X,
                 const floating[::1] y,
                 const floating[::1] sample_weights):
    """Average targets for duplicate X, drop duplicates.

    Aggregates duplicate X values into a single X value where
    the target y is a (sample_weighted) average of the individual
    targets.

    Assumes that X is ordered, so that all duplicates follow each other.
    """
    unique_values = len(np.unique(X))

    # Match the output dtype to the fused-type specialization in use.
    if floating is float:
        dtype = np.float32
    else:
        dtype = np.float64

    cdef floating[::1] y_out = np.empty(unique_values, dtype=dtype)
    cdef floating[::1] x_out = np.empty_like(y_out)
    cdef floating[::1] weights_out = np.empty_like(y_out)

    # Running accumulators for the current group of (near-)duplicate X.
    cdef floating current_x = X[0]
    cdef floating current_y = 0
    cdef floating current_weight = 0
    cdef int i = 0
    cdef int j
    cdef floating x
    cdef int n_samples = len(X)
    # X values closer than the dtype's resolution are treated as duplicates.
    cdef floating eps = np.finfo(dtype).resolution

    for j in range(n_samples):
        x = X[j]
        if x - current_x >= eps:
            # next unique value: flush the finished group, start a new one.
            x_out[i] = current_x
            weights_out[i] = current_weight
            y_out[i] = current_y / current_weight
            i += 1
            current_x = x
            current_weight = sample_weights[j]
            current_y = y[j] * sample_weights[j]
        else:
            # Same group: accumulate weight and weighted target.
            current_weight += sample_weights[j]
            current_y += y[j] * sample_weights[j]

    # Flush the last accumulated group.
    x_out[i] = current_x
    weights_out[i] = current_weight
    y_out[i] = current_y / current_weight
    return(
        np.asarray(x_out[:i+1]),
        np.asarray(y_out[:i+1]),
        np.asarray(weights_out[:i+1]),
    )
|
||||
30
.venv/lib/python3.12/site-packages/sklearn/_loss/__init__.py
Normal file
30
.venv/lib/python3.12/site-packages/sklearn/_loss/__init__.py
Normal file
@@ -0,0 +1,30 @@
|
||||
"""
|
||||
The :mod:`sklearn._loss` module includes loss function classes suitable for
|
||||
fitting classification and regression tasks.
|
||||
"""
|
||||
|
||||
from .loss import (
|
||||
AbsoluteError,
|
||||
HalfBinomialLoss,
|
||||
HalfGammaLoss,
|
||||
HalfMultinomialLoss,
|
||||
HalfPoissonLoss,
|
||||
HalfSquaredError,
|
||||
HalfTweedieLoss,
|
||||
HalfTweedieLossIdentity,
|
||||
HuberLoss,
|
||||
PinballLoss,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"HalfSquaredError",
|
||||
"AbsoluteError",
|
||||
"PinballLoss",
|
||||
"HuberLoss",
|
||||
"HalfPoissonLoss",
|
||||
"HalfGammaLoss",
|
||||
"HalfTweedieLoss",
|
||||
"HalfTweedieLossIdentity",
|
||||
"HalfBinomialLoss",
|
||||
"HalfMultinomialLoss",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
91
.venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pxd
Normal file
91
.venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pxd
Normal file
@@ -0,0 +1,91 @@
|
||||
# Fused types for input like y_true, raw_prediction, sample_weights.
ctypedef fused floating_in:
    double
    float


# Fused types for output like gradient and hessian
# We use a different fused types for input (floating_in) and output (floating_out), such
# that input and output can have different dtypes in the same function call. A single
# fused type can only take on one single value (type) for all arguments in one function
# call.
ctypedef fused floating_out:
    double
    float


# Struct to return 2 doubles
ctypedef struct double_pair:
    double val1
    double val2


# C base class for loss functions
cdef class CyLossFunction:
    # Pointwise loss, gradient, and (gradient, hessian) pair for one sample.
    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil


cdef class CyHalfSquaredError(CyLossFunction):
    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil


cdef class CyAbsoluteError(CyLossFunction):
    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil


cdef class CyPinballLoss(CyLossFunction):
    cdef readonly double quantile  # readonly makes it accessible from Python
    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil


cdef class CyHuberLoss(CyLossFunction):
    cdef public double delta  # public makes it accessible from Python
    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil


cdef class CyHalfPoissonLoss(CyLossFunction):
    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil


cdef class CyHalfGammaLoss(CyLossFunction):
    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil


cdef class CyHalfTweedieLoss(CyLossFunction):
    cdef readonly double power  # readonly makes it accessible from Python
    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil


cdef class CyHalfTweedieLossIdentity(CyLossFunction):
    cdef readonly double power  # readonly makes it accessible from Python
    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil


cdef class CyHalfBinomialLoss(CyLossFunction):
    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil


cdef class CyExponentialLoss(CyLossFunction):
    cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
    cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
    cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
|
||||
1418
.venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pyx.tp
Normal file
1418
.venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pyx.tp
Normal file
File diff suppressed because it is too large
Load Diff
281
.venv/lib/python3.12/site-packages/sklearn/_loss/link.py
Normal file
281
.venv/lib/python3.12/site-packages/sklearn/_loss/link.py
Normal file
@@ -0,0 +1,281 @@
|
||||
"""
|
||||
Module contains classes for invertible (and differentiable) link functions.
|
||||
"""
|
||||
|
||||
# Author: Christian Lorentzen <lorentzen.ch@gmail.com>
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
|
||||
import numpy as np
|
||||
from scipy.special import expit, logit
|
||||
from scipy.stats import gmean
|
||||
|
||||
from ..utils.extmath import softmax
|
||||
|
||||
|
||||
@dataclass
class Interval:
    """A numeric interval with individually open or closed bounds."""

    low: float
    high: float
    low_inclusive: bool
    high_inclusive: bool

    def __post_init__(self):
        """Check that low <= high"""
        if self.low > self.high:
            raise ValueError(
                f"One must have low <= high; got low={self.low}, high={self.high}."
            )

    def includes(self, x):
        """Test whether all values of x are in interval range.

        Parameters
        ----------
        x : ndarray
            Array whose elements are tested to be in interval range.

        Returns
        -------
        result : bool
        """
        # Pick the strict or non-strict comparison per the inclusivity flags.
        lower_ok = (
            np.greater_equal(x, self.low)
            if self.low_inclusive
            else np.greater(x, self.low)
        )
        if not np.all(lower_ok):
            return False

        upper_ok = (
            np.less_equal(x, self.high)
            if self.high_inclusive
            else np.less(x, self.high)
        )
        # np.all returns numpy.bool_; coerce to a plain Python bool.
        return bool(np.all(upper_ok))
|
||||
|
||||
|
||||
def _inclusive_low_high(interval, dtype=np.float64):
|
||||
"""Generate values low and high to be within the interval range.
|
||||
|
||||
This is used in tests only.
|
||||
|
||||
Returns
|
||||
-------
|
||||
low, high : tuple
|
||||
The returned values low and high lie within the interval.
|
||||
"""
|
||||
eps = 10 * np.finfo(dtype).eps
|
||||
if interval.low == -np.inf:
|
||||
low = -1e10
|
||||
elif interval.low < 0:
|
||||
low = interval.low * (1 - eps) + eps
|
||||
else:
|
||||
low = interval.low * (1 + eps) + eps
|
||||
|
||||
if interval.high == np.inf:
|
||||
high = 1e10
|
||||
elif interval.high < 0:
|
||||
high = interval.high * (1 + eps) - eps
|
||||
else:
|
||||
high = interval.high * (1 - eps) - eps
|
||||
|
||||
return low, high
|
||||
|
||||
|
||||
class BaseLink(ABC):
    """Abstract base class for differentiable, invertible link functions.

    Convention:
        - link function g: raw_prediction = g(y_pred)
        - inverse link h: y_pred = h(raw_prediction)

    For (generalized) linear models, `raw_prediction = X @ coef` is the so
    called linear predictor, and `y_pred = h(raw_prediction)` is the predicted
    conditional (on X) expected value of the target `y_true`.

    The methods are not implemented as staticmethods in case a link function needs
    parameters.
    """

    is_multiclass = False  # used for testing only

    # Usually, raw_prediction may be any real number and y_pred is an open
    # interval.
    # interval_raw_prediction = Interval(-np.inf, np.inf, False, False)
    interval_y_pred = Interval(-np.inf, np.inf, False, False)

    @abstractmethod
    def link(self, y_pred, out=None):
        """Compute the link function g(y_pred).

        The link function maps (predicted) target values to raw predictions,
        i.e. `g(y_pred) = raw_prediction`.

        Parameters
        ----------
        y_pred : array
            Predicted target values.
        out : array
            A location into which the result is stored. If provided, it must
            have a shape that the inputs broadcast to. If not provided or None,
            a freshly-allocated array is returned.

        Returns
        -------
        out : array
            Output array, element-wise link function.
        """

    @abstractmethod
    def inverse(self, raw_prediction, out=None):
        """Compute the inverse link function h(raw_prediction).

        The inverse link function maps raw predictions to predicted target
        values, i.e. `h(raw_prediction) = y_pred`.

        Parameters
        ----------
        raw_prediction : array
            Raw prediction values (in link space).
        out : array
            A location into which the result is stored. If provided, it must
            have a shape that the inputs broadcast to. If not provided or None,
            a freshly-allocated array is returned.

        Returns
        -------
        out : array
            Output array, element-wise inverse link function.
        """
|
||||
|
||||
|
||||
class IdentityLink(BaseLink):
    """The identity link function g(x)=x."""

    def link(self, y_pred, out=None):
        # With no output buffer, hand back the input itself.
        if out is None:
            return y_pred
        # Otherwise honor the `out` contract by copying into the buffer.
        np.copyto(out, y_pred)
        return out

    # The identity is its own inverse.
    inverse = link
|
||||
|
||||
|
||||
class LogLink(BaseLink):
    """The log link function g(x)=log(x)."""

    # The log is only defined for strictly positive predictions.
    interval_y_pred = Interval(0, np.inf, False, False)

    def link(self, y_pred, out=None):
        """Map predicted targets to raw predictions via the natural log."""
        return np.log(y_pred, out=out)

    def inverse(self, raw_prediction, out=None):
        """Map raw predictions back to targets via the exponential."""
        return np.exp(raw_prediction, out=out)
|
||||
|
||||
|
||||
class LogitLink(BaseLink):
    """The logit link function g(x)=logit(x)."""

    # Predictions are probabilities strictly between 0 and 1.
    interval_y_pred = Interval(0, 1, False, False)

    def link(self, y_pred, out=None):
        """Map probabilities to raw predictions via the logit."""
        return logit(y_pred, out=out)

    def inverse(self, raw_prediction, out=None):
        """Map raw predictions back to probabilities via the sigmoid."""
        return expit(raw_prediction, out=out)
|
||||
|
||||
|
||||
class HalfLogitLink(BaseLink):
    """Half the logit link function g(x)=1/2 * logit(x).

    Used for the exponential loss.
    """

    # Predictions are probabilities strictly between 0 and 1.
    interval_y_pred = Interval(0, 1, False, False)

    def link(self, y_pred, out=None):
        """Compute 0.5 * logit(y_pred), reusing `out` when provided."""
        result = logit(y_pred, out=out)
        result *= 0.5
        return result

    def inverse(self, raw_prediction, out=None):
        """Compute expit(2 * raw_prediction)."""
        return expit(2 * raw_prediction, out)
|
||||
|
||||
|
||||
class MultinomialLogit(BaseLink):
    """The symmetric multinomial logit function.

    Convention:
        - y_pred.shape = raw_prediction.shape = (n_samples, n_classes)

    Notes:
        - The inverse link h is the softmax function.
        - The sum is over the second axis, i.e. axis=1 (n_classes).

    We have to choose additional constraints in order to make

        y_pred[k] = exp(raw_pred[k]) / sum(exp(raw_pred[k]), k=0..n_classes-1)

    for n_classes classes identifiable and invertible.
    We choose the symmetric side constraint where the geometric mean response
    is set as reference category, see [2]:

    The symmetric multinomial logit link function for a single data point is
    then defined as

        raw_prediction[k] = g(y_pred[k]) = log(y_pred[k]/gmean(y_pred))
        = log(y_pred[k]) - mean(log(y_pred)).

    Note that this is equivalent to the definition in [1] and implies mean
    centered raw predictions:

        sum(raw_prediction[k], k=0..n_classes-1) = 0.

    For linear models with raw_prediction = X @ coef, this corresponds to
    sum(coef[k], k=0..n_classes-1) = 0, i.e. the sum over classes for every
    feature is zero.

    Reference
    ---------
    .. [1] Friedman, Jerome; Hastie, Trevor; Tibshirani, Robert. "Additive
        logistic regression: a statistical view of boosting" Ann. Statist.
        28 (2000), no. 2, 337--407. doi:10.1214/aos/1016218223.
        https://projecteuclid.org/euclid.aos/1016218223

    .. [2] Zahid, Faisal Maqbool and Gerhard Tutz. "Ridge estimation for
        multinomial logit models with symmetric side constraints."
        Computational Statistics 28 (2013): 1017-1034.
        http://epub.ub.uni-muenchen.de/11001/1/tr067.pdf
    """

    is_multiclass = True
    interval_y_pred = Interval(0, 1, False, False)

    def symmetrize_raw_prediction(self, raw_prediction):
        # Enforce the zero-sum side constraint by subtracting the per-row
        # (per-sample) mean over classes.
        return raw_prediction - np.mean(raw_prediction, axis=1)[:, np.newaxis]

    def link(self, y_pred, out=None):
        # geometric mean as reference category
        gm = gmean(y_pred, axis=1)
        return np.log(y_pred / gm[:, np.newaxis], out=out)

    def inverse(self, raw_prediction, out=None):
        if out is None:
            return softmax(raw_prediction, copy=True)
        else:
            # Copy into the caller's buffer, then apply softmax in place.
            np.copyto(out, raw_prediction)
            softmax(out, copy=False)
            return out
|
||||
|
||||
|
||||
# Mapping from configuration-string name to the corresponding link class.
_LINKS = {
    "identity": IdentityLink,
    "log": LogLink,
    "logit": LogitLink,
    "half_logit": HalfLogitLink,
    "multinomial_logit": MultinomialLogit,
}
|
||||
1178
.venv/lib/python3.12/site-packages/sklearn/_loss/loss.py
Normal file
1178
.venv/lib/python3.12/site-packages/sklearn/_loss/loss.py
Normal file
File diff suppressed because it is too large
Load Diff
24
.venv/lib/python3.12/site-packages/sklearn/_loss/meson.build
Normal file
24
.venv/lib/python3.12/site-packages/sklearn/_loss/meson.build
Normal file
@@ -0,0 +1,24 @@
|
||||
# .pyx is generated, so this is needed to make Cython compilation work
_loss_cython_tree = [
  fs.copyfile('_loss.pxd')
]

# Render _loss.pyx from the Tempita template at build time.
_loss_pyx = custom_target(
  '_loss_pyx',
  output: '_loss.pyx',
  input: '_loss.pyx.tp',
  command: [py, tempita, '@INPUT@', '-o', '@OUTDIR@'],
  # TODO in principle this should go in py.exension_module below. This is
  # temporary work-around for dependency issue with .pyx.tp files. For more
  # details, see https://github.com/mesonbuild/meson/issues/13212
  depends: _loss_cython_tree,
)

# Compile the generated Cython source into the sklearn._loss._loss module.
py.extension_module(
  '_loss',
  _loss_pyx,
  dependencies: [openmp_dep],
  cython_args: cython_args,
  install: true,
  subdir: 'sklearn/_loss',
)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,111 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_allclose, assert_array_equal
|
||||
|
||||
from sklearn._loss.link import (
|
||||
_LINKS,
|
||||
HalfLogitLink,
|
||||
Interval,
|
||||
MultinomialLogit,
|
||||
_inclusive_low_high,
|
||||
)
|
||||
|
||||
LINK_FUNCTIONS = list(_LINKS.values())
|
||||
|
||||
|
||||
def test_interval_raises():
    """Test that interval with low > high raises ValueError."""
    # The match string pins the exact message produced by Interval.__post_init__.
    with pytest.raises(
        ValueError, match="One must have low <= high; got low=1, high=0."
    ):
        Interval(1, 0, False, False)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "interval",
    [
        Interval(0, 1, False, False),
        Interval(0, 1, False, True),
        Interval(0, 1, True, False),
        Interval(0, 1, True, True),
        Interval(-np.inf, np.inf, False, False),
        Interval(-np.inf, np.inf, False, True),
        Interval(-np.inf, np.inf, True, False),
        Interval(-np.inf, np.inf, True, True),
        Interval(-10, -1, False, False),
        Interval(-10, -1, False, True),
        Interval(-10, -1, True, False),
        Interval(-10, -1, True, True),
    ],
)
def test_is_in_range(interval):
    """Check Interval.includes on interior points and on each bound."""
    # make sure low and high are always within the interval, used for linspace
    low, high = _inclusive_low_high(interval)

    x = np.linspace(low, high, num=10)
    assert interval.includes(x)

    # x contains lower bound
    assert interval.includes(np.r_[x, interval.low]) == interval.low_inclusive

    # x contains upper bound
    assert interval.includes(np.r_[x, interval.high]) == interval.high_inclusive

    # x contains upper and lower bound
    assert interval.includes(np.r_[x, interval.low, interval.high]) == (
        interval.low_inclusive and interval.high_inclusive
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("link", LINK_FUNCTIONS)
def test_link_inverse_identity(link, global_random_seed):
    """Check that link and inverse are mutual inverses on random inputs."""
    # Test that link of inverse gives identity.
    rng = np.random.RandomState(global_random_seed)
    link = link()
    n_samples, n_classes = 100, None
    # The values for `raw_prediction` are limited from -20 to 20 because in the
    # class `LogitLink` the term `expit(x)` comes very close to 1 for large
    # positive x and therefore loses precision.
    if link.is_multiclass:
        n_classes = 10
        raw_prediction = rng.uniform(low=-20, high=20, size=(n_samples, n_classes))
        if isinstance(link, MultinomialLogit):
            # Only symmetrized (zero row-sum) raw predictions are invertible.
            raw_prediction = link.symmetrize_raw_prediction(raw_prediction)
    elif isinstance(link, HalfLogitLink):
        raw_prediction = rng.uniform(low=-10, high=10, size=(n_samples))
    else:
        raw_prediction = rng.uniform(low=-20, high=20, size=(n_samples))

    assert_allclose(link.link(link.inverse(raw_prediction)), raw_prediction)
    y_pred = link.inverse(raw_prediction)
    assert_allclose(link.inverse(link.link(y_pred)), y_pred)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("link", LINK_FUNCTIONS)
def test_link_out_argument(link):
    """Check the `out` buffer contract of link and inverse."""
    # Test that out argument gets assigned the result.
    rng = np.random.RandomState(42)
    link = link()
    n_samples, n_classes = 100, None
    if link.is_multiclass:
        n_classes = 10
        raw_prediction = rng.normal(loc=0, scale=10, size=(n_samples, n_classes))
        if isinstance(link, MultinomialLogit):
            raw_prediction = link.symmetrize_raw_prediction(raw_prediction)
    else:
        # So far, the valid interval of raw_prediction is (-inf, inf) and
        # we do not need to distinguish.
        raw_prediction = rng.uniform(low=-10, high=10, size=(n_samples))

    y_pred = link.inverse(raw_prediction, out=None)
    out = np.empty_like(raw_prediction)
    y_pred_2 = link.inverse(raw_prediction, out=out)
    # The returned array must be the very buffer that was passed in.
    assert_allclose(y_pred, out)
    assert_array_equal(out, y_pred_2)
    assert np.shares_memory(out, y_pred_2)

    out = np.empty_like(y_pred)
    raw_prediction_2 = link.link(y_pred, out=out)
    assert_allclose(raw_prediction, out)
    assert_array_equal(out, raw_prediction_2)
    assert np.shares_memory(out, raw_prediction_2)
|
||||
1322
.venv/lib/python3.12/site-packages/sklearn/_loss/tests/test_loss.py
Normal file
1322
.venv/lib/python3.12/site-packages/sklearn/_loss/tests/test_loss.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,71 @@
|
||||
"""All minimum dependencies for scikit-learn."""
|
||||
|
||||
import argparse
|
||||
from collections import defaultdict
|
||||
|
||||
# scipy and cython should by in sync with pyproject.toml
|
||||
NUMPY_MIN_VERSION = "1.19.5"
|
||||
SCIPY_MIN_VERSION = "1.6.0"
|
||||
JOBLIB_MIN_VERSION = "1.2.0"
|
||||
THREADPOOLCTL_MIN_VERSION = "3.1.0"
|
||||
PYTEST_MIN_VERSION = "7.1.2"
|
||||
CYTHON_MIN_VERSION = "3.0.10"
|
||||
|
||||
|
||||
# 'build' and 'install' is included to have structured metadata for CI.
|
||||
# It will NOT be included in setup's extras_require
|
||||
# The values are (version_spec, comma separated tags)
|
||||
dependent_packages = {
|
||||
"numpy": (NUMPY_MIN_VERSION, "build, install"),
|
||||
"scipy": (SCIPY_MIN_VERSION, "build, install"),
|
||||
"joblib": (JOBLIB_MIN_VERSION, "install"),
|
||||
"threadpoolctl": (THREADPOOLCTL_MIN_VERSION, "install"),
|
||||
"cython": (CYTHON_MIN_VERSION, "build"),
|
||||
"meson-python": ("0.16.0", "build"),
|
||||
"matplotlib": ("3.3.4", "benchmark, docs, examples, tests"),
|
||||
"scikit-image": ("0.17.2", "docs, examples, tests"),
|
||||
"pandas": ("1.1.5", "benchmark, docs, examples, tests"),
|
||||
"seaborn": ("0.9.0", "docs, examples"),
|
||||
"memory_profiler": ("0.57.0", "benchmark, docs"),
|
||||
"pytest": (PYTEST_MIN_VERSION, "tests"),
|
||||
"pytest-cov": ("2.9.0", "tests"),
|
||||
"ruff": ("0.2.1", "tests"),
|
||||
"black": ("24.3.0", "tests"),
|
||||
"mypy": ("1.9", "tests"),
|
||||
"pyamg": ("4.0.0", "tests"),
|
||||
"polars": ("0.20.30", "docs, tests"),
|
||||
"pyarrow": ("12.0.0", "tests"),
|
||||
"sphinx": ("7.3.7", "docs"),
|
||||
"sphinx-copybutton": ("0.5.2", "docs"),
|
||||
"sphinx-gallery": ("0.16.0", "docs"),
|
||||
"numpydoc": ("1.2.0", "docs, tests"),
|
||||
"Pillow": ("7.1.2", "docs"),
|
||||
"pooch": ("1.6.0", "docs, examples, tests"),
|
||||
"sphinx-prompt": ("1.4.0", "docs"),
|
||||
"sphinxext-opengraph": ("0.9.1", "docs"),
|
||||
"plotly": ("5.14.0", "docs, examples"),
|
||||
"sphinxcontrib-sass": ("0.3.4", "docs"),
|
||||
"sphinx-remove-toctrees": ("1.0.0.post1", "docs"),
|
||||
"sphinx-design": ("0.6.0", "docs"),
|
||||
"pydata-sphinx-theme": ("0.15.3", "docs"),
|
||||
# XXX: Pin conda-lock to the latest released version (needs manual update
|
||||
# from time to time)
|
||||
"conda-lock": ("2.5.6", "maintenance"),
|
||||
}
|
||||
|
||||
|
||||
# Invert `dependent_packages`: map each extras tag to the list of
# "package>=min_version" requirement specifiers carrying that tag.
tag_to_packages: dict = defaultdict(list)
for pkg_name, (pkg_min_version, tag_csv) in dependent_packages.items():
    requirement = f"{pkg_name}>={pkg_min_version}"
    for tag in tag_csv.split(", "):
        tag_to_packages[tag].append(requirement)
|
||||
|
||||
|
||||
# Used by CI to get the min dependencies
if __name__ == "__main__":
    # Tiny CLI: print the minimum supported version of a single package.
    cli = argparse.ArgumentParser(description="Get min dependencies for a package")
    cli.add_argument("package", choices=dependent_packages)
    parsed = cli.parse_args()
    print(dependent_packages[parsed.package][0])
|
||||
1477
.venv/lib/python3.12/site-packages/sklearn/base.py
Normal file
1477
.venv/lib/python3.12/site-packages/sklearn/base.py
Normal file
File diff suppressed because it is too large
Load Diff
1423
.venv/lib/python3.12/site-packages/sklearn/calibration.py
Normal file
1423
.venv/lib/python3.12/site-packages/sklearn/calibration.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,53 @@
|
||||
"""Popular unsupervised clustering algorithms."""
|
||||
|
||||
from ._affinity_propagation import AffinityPropagation, affinity_propagation
|
||||
from ._agglomerative import (
|
||||
AgglomerativeClustering,
|
||||
FeatureAgglomeration,
|
||||
linkage_tree,
|
||||
ward_tree,
|
||||
)
|
||||
from ._bicluster import SpectralBiclustering, SpectralCoclustering
|
||||
from ._birch import Birch
|
||||
from ._bisect_k_means import BisectingKMeans
|
||||
from ._dbscan import DBSCAN, dbscan
|
||||
from ._hdbscan.hdbscan import HDBSCAN
|
||||
from ._kmeans import KMeans, MiniBatchKMeans, k_means, kmeans_plusplus
|
||||
from ._mean_shift import MeanShift, estimate_bandwidth, get_bin_seeds, mean_shift
|
||||
from ._optics import (
|
||||
OPTICS,
|
||||
cluster_optics_dbscan,
|
||||
cluster_optics_xi,
|
||||
compute_optics_graph,
|
||||
)
|
||||
from ._spectral import SpectralClustering, spectral_clustering
|
||||
|
||||
__all__ = [
|
||||
"AffinityPropagation",
|
||||
"AgglomerativeClustering",
|
||||
"Birch",
|
||||
"DBSCAN",
|
||||
"OPTICS",
|
||||
"cluster_optics_dbscan",
|
||||
"cluster_optics_xi",
|
||||
"compute_optics_graph",
|
||||
"KMeans",
|
||||
"BisectingKMeans",
|
||||
"FeatureAgglomeration",
|
||||
"MeanShift",
|
||||
"MiniBatchKMeans",
|
||||
"SpectralClustering",
|
||||
"affinity_propagation",
|
||||
"dbscan",
|
||||
"estimate_bandwidth",
|
||||
"get_bin_seeds",
|
||||
"k_means",
|
||||
"kmeans_plusplus",
|
||||
"linkage_tree",
|
||||
"mean_shift",
|
||||
"spectral_clustering",
|
||||
"ward_tree",
|
||||
"SpectralBiclustering",
|
||||
"SpectralCoclustering",
|
||||
"HDBSCAN",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,601 @@
|
||||
"""Affinity Propagation clustering algorithm."""
|
||||
|
||||
# Author: Alexandre Gramfort alexandre.gramfort@inria.fr
|
||||
# Gael Varoquaux gael.varoquaux@normalesup.org
|
||||
|
||||
# License: BSD 3 clause
|
||||
|
||||
import warnings
|
||||
from numbers import Integral, Real
|
||||
|
||||
import numpy as np
|
||||
|
||||
from .._config import config_context
|
||||
from ..base import BaseEstimator, ClusterMixin, _fit_context
|
||||
from ..exceptions import ConvergenceWarning
|
||||
from ..metrics import euclidean_distances, pairwise_distances_argmin
|
||||
from ..utils import check_random_state
|
||||
from ..utils._param_validation import Interval, StrOptions, validate_params
|
||||
from ..utils.validation import check_is_fitted
|
||||
|
||||
|
||||
def _equal_similarities_and_preferences(S, preference):
|
||||
def all_equal_preferences():
|
||||
return np.all(preference == preference.flat[0])
|
||||
|
||||
def all_equal_similarities():
|
||||
# Create mask to ignore diagonal of S
|
||||
mask = np.ones(S.shape, dtype=bool)
|
||||
np.fill_diagonal(mask, 0)
|
||||
|
||||
return np.all(S[mask].flat == S[mask].flat[0])
|
||||
|
||||
return all_equal_preferences() and all_equal_similarities()
|
||||
|
||||
|
||||
def _affinity_propagation(
    S,
    *,
    preference,
    convergence_iter,
    max_iter,
    damping,
    verbose,
    return_n_iter,
    random_state,
):
    """Main affinity propagation algorithm.

    Parameters
    ----------
    S : ndarray of shape (n_samples, n_samples)
        Similarity matrix. Modified in place: the preferences are written on
        its diagonal and small random noise is added, so callers must pass a
        copy if the original matrix has to be preserved.
    preference : ndarray
        Preference value(s) placed on the diagonal of ``S``.
    convergence_iter : int
        Number of iterations with no change in the estimated exemplars that
        declares convergence.
    max_iter : int
        Maximum number of message-passing iterations.
    damping : float
        Damping factor applied to the responsibility/availability updates.
    verbose : bool
        Whether to print convergence information.
    return_n_iter : bool
        Whether to also return the number of iterations run.
    random_state : RandomState
        Generator used to draw the degeneracy-breaking noise added to ``S``.

    Returns
    -------
    cluster_centers_indices : ndarray or list
        Indices of the exemplars (empty list when none was found).
    labels : ndarray of shape (n_samples,)
        Cluster label of each sample; all ``-1`` when no exemplar was found.
    n_iter : int
        Number of iterations run; only returned when ``return_n_iter``.
    """
    n_samples = S.shape[0]
    if n_samples == 1 or _equal_similarities_and_preferences(S, preference):
        # It makes no sense to run the algorithm in this case, so return 1 or
        # n_samples clusters, depending on preferences
        warnings.warn(
            "All samples have mutually equal similarities. "
            "Returning arbitrary cluster center(s)."
        )
        if preference.flat[0] > S.flat[n_samples - 1]:
            # Preference dominates similarity: every sample is its own center.
            return (
                (np.arange(n_samples), np.arange(n_samples), 0)
                if return_n_iter
                else (np.arange(n_samples), np.arange(n_samples))
            )
        else:
            # Similarity dominates: a single arbitrary center (sample 0).
            return (
                (np.array([0]), np.array([0] * n_samples), 0)
                if return_n_iter
                else (np.array([0]), np.array([0] * n_samples))
            )

    # Place preference on the diagonal of S
    S.flat[:: (n_samples + 1)] = preference

    A = np.zeros((n_samples, n_samples))
    R = np.zeros((n_samples, n_samples))  # Initialize messages
    # Intermediate results
    tmp = np.zeros((n_samples, n_samples))

    # Remove degeneracies
    S += (
        np.finfo(S.dtype).eps * S + np.finfo(S.dtype).tiny * 100
    ) * random_state.standard_normal(size=(n_samples, n_samples))

    # Execute parallel affinity propagation updates
    e = np.zeros((n_samples, convergence_iter))

    ind = np.arange(n_samples)

    for it in range(max_iter):
        # tmp = A + S; compute responsibilities
        np.add(A, S, tmp)
        I = np.argmax(tmp, axis=1)
        Y = tmp[ind, I]  # np.max(A + S, axis=1)
        tmp[ind, I] = -np.inf
        Y2 = np.max(tmp, axis=1)

        # tmp = Rnew
        np.subtract(S, Y[:, None], tmp)
        tmp[ind, I] = S[ind, I] - Y2

        # Damping
        tmp *= 1 - damping
        R *= damping
        R += tmp

        # tmp = Rp; compute availabilities
        np.maximum(R, 0, tmp)
        tmp.flat[:: n_samples + 1] = R.flat[:: n_samples + 1]

        # tmp = -Anew
        tmp -= np.sum(tmp, axis=0)
        dA = np.diag(tmp).copy()
        tmp.clip(0, np.inf, tmp)
        tmp.flat[:: n_samples + 1] = dA

        # Damping
        tmp *= 1 - damping
        A *= damping
        A -= tmp

        # Check for convergence
        E = (np.diag(A) + np.diag(R)) > 0
        e[:, it % convergence_iter] = E
        K = np.sum(E, axis=0)

        if it >= convergence_iter:
            se = np.sum(e, axis=1)
            unconverged = np.sum((se == convergence_iter) + (se == 0)) != n_samples
            # NOTE(review): ``it == max_iter`` can never be true inside
            # ``range(max_iter)``; kept as-is for parity with upstream.
            if (not unconverged and (K > 0)) or (it == max_iter):
                never_converged = False
                if verbose:
                    print("Converged after %d iterations." % it)
                break
    else:
        # for/else: only reached when the loop ran all max_iter iterations
        # without hitting the ``break`` above.
        never_converged = True
        if verbose:
            print("Did not converge")

    I = np.flatnonzero(E)
    K = I.size  # Identify exemplars

    if K > 0:
        if never_converged:
            warnings.warn(
                (
                    "Affinity propagation did not converge, this model "
                    "may return degenerate cluster centers and labels."
                ),
                ConvergenceWarning,
            )
        c = np.argmax(S[:, I], axis=1)
        c[I] = np.arange(K)  # Identify clusters
        # Refine the final set of exemplars and clusters and return results
        for k in range(K):
            ii = np.where(c == k)[0]
            j = np.argmax(np.sum(S[ii[:, np.newaxis], ii], axis=0))
            I[k] = ii[j]

        c = np.argmax(S[:, I], axis=1)
        c[I] = np.arange(K)
        labels = I[c]
        # Reduce labels to a sorted, gapless, list
        cluster_centers_indices = np.unique(labels)
        labels = np.searchsorted(cluster_centers_indices, labels)
    else:
        warnings.warn(
            (
                "Affinity propagation did not converge and this model "
                "will not have any cluster centers."
            ),
            ConvergenceWarning,
        )
        labels = np.array([-1] * n_samples)
        cluster_centers_indices = []

    if return_n_iter:
        return cluster_centers_indices, labels, it + 1
    else:
        return cluster_centers_indices, labels
|
||||
|
||||
|
||||
###############################################################################
|
||||
# Public API
|
||||
|
||||
|
||||
@validate_params(
    {
        "S": ["array-like"],
        "return_n_iter": ["boolean"],
    },
    prefer_skip_nested_validation=False,
)
def affinity_propagation(
    S,
    *,
    preference=None,
    convergence_iter=15,
    max_iter=200,
    damping=0.5,
    copy=True,
    verbose=False,
    return_n_iter=False,
    random_state=None,
):
    """Perform Affinity Propagation Clustering of data.

    Read more in the :ref:`User Guide <affinity_propagation>`.

    Parameters
    ----------
    S : array-like of shape (n_samples, n_samples)
        Matrix of similarities between points.

    preference : array-like of shape (n_samples,) or float, default=None
        Preferences for each point - points with larger values of
        preferences are more likely to be chosen as exemplars. The number of
        exemplars, i.e. of clusters, is influenced by the input preferences
        value. If the preferences are not passed as arguments, they will be
        set to the median of the input similarities (resulting in a moderate
        number of clusters). For a smaller amount of clusters, this can be set
        to the minimum value of the similarities.

    convergence_iter : int, default=15
        Number of iterations with no change in the number
        of estimated clusters that stops the convergence.

    max_iter : int, default=200
        Maximum number of iterations.

    damping : float, default=0.5
        Damping factor between 0.5 and 1.

    copy : bool, default=True
        If copy is False, the affinity matrix is modified inplace by the
        algorithm, for memory efficiency.

    verbose : bool, default=False
        The verbosity level.

    return_n_iter : bool, default=False
        Whether or not to return the number of iterations.

    random_state : int, RandomState instance or None, default=None
        Pseudo-random number generator to control the starting state.
        Use an int for reproducible results across function calls.
        See the :term:`Glossary <random_state>`.

        .. versionadded:: 0.23
            this parameter was previously hardcoded as 0.

    Returns
    -------
    cluster_centers_indices : ndarray of shape (n_clusters,)
        Index of clusters centers.

    labels : ndarray of shape (n_samples,)
        Cluster labels for each point.

    n_iter : int
        Number of iterations run. Returned only if `return_n_iter` is
        set to True.

    Notes
    -----
    For an example, see :ref:`examples/cluster/plot_affinity_propagation.py
    <sphx_glr_auto_examples_cluster_plot_affinity_propagation.py>`.

    When the algorithm does not converge, it will still return arrays of
    ``cluster_center_indices`` and labels if there are any exemplars/clusters,
    however they may be degenerate and should be used with caution.

    When all training samples have equal similarities and equal preferences,
    the assignment of cluster centers and labels depends on the preference.
    If the preference is smaller than the similarities, a single cluster center
    and label ``0`` for every sample will be returned. Otherwise, every
    training sample becomes its own cluster center and is assigned a unique
    label.

    References
    ----------
    Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
    Between Data Points", Science Feb. 2007

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.cluster import affinity_propagation
    >>> from sklearn.metrics.pairwise import euclidean_distances
    >>> X = np.array([[1, 2], [1, 4], [1, 0],
    ...               [4, 2], [4, 4], [4, 0]])
    >>> S = -euclidean_distances(X, squared=True)
    >>> cluster_centers_indices, labels = affinity_propagation(S, random_state=0)
    >>> cluster_centers_indices
    array([0, 3])
    >>> labels
    array([0, 0, 0, 1, 1, 1])
    """
    # Delegate to the estimator with a precomputed affinity so the functional
    # and object-oriented APIs share one implementation.
    model = AffinityPropagation(
        damping=damping,
        max_iter=max_iter,
        convergence_iter=convergence_iter,
        copy=copy,
        preference=preference,
        affinity="precomputed",
        verbose=verbose,
        random_state=random_state,
    )
    model.fit(S)

    results = (model.cluster_centers_indices_, model.labels_)
    if return_n_iter:
        results = results + (model.n_iter_,)
    return results
|
||||
|
||||
|
||||
class AffinityPropagation(ClusterMixin, BaseEstimator):
    """Perform Affinity Propagation Clustering of data.

    Read more in the :ref:`User Guide <affinity_propagation>`.

    Parameters
    ----------
    damping : float, default=0.5
        Damping factor in the range `[0.5, 1.0)` is the extent to
        which the current value is maintained relative to
        incoming values (weighted 1 - damping). This in order
        to avoid numerical oscillations when updating these
        values (messages).

    max_iter : int, default=200
        Maximum number of iterations.

    convergence_iter : int, default=15
        Number of iterations with no change in the number
        of estimated clusters that stops the convergence.

    copy : bool, default=True
        Make a copy of input data.

    preference : array-like of shape (n_samples,) or float, default=None
        Preferences for each point - points with larger values of
        preferences are more likely to be chosen as exemplars. The number
        of exemplars, i.e. of clusters, is influenced by the input
        preferences value. If the preferences are not passed as arguments,
        they will be set to the median of the input similarities.

    affinity : {'euclidean', 'precomputed'}, default='euclidean'
        Which affinity to use. At the moment 'precomputed' and
        ``euclidean`` are supported. 'euclidean' uses the
        negative squared euclidean distance between points.

    verbose : bool, default=False
        Whether to be verbose.

    random_state : int, RandomState instance or None, default=None
        Pseudo-random number generator to control the starting state.
        Use an int for reproducible results across function calls.
        See the :term:`Glossary <random_state>`.

        .. versionadded:: 0.23
            this parameter was previously hardcoded as 0.

    Attributes
    ----------
    cluster_centers_indices_ : ndarray of shape (n_clusters,)
        Indices of cluster centers.

    cluster_centers_ : ndarray of shape (n_clusters, n_features)
        Cluster centers (if affinity != ``precomputed``).

    labels_ : ndarray of shape (n_samples,)
        Labels of each point.

    affinity_matrix_ : ndarray of shape (n_samples, n_samples)
        Stores the affinity matrix used in ``fit``.

    n_iter_ : int
        Number of iterations taken to converge.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    AgglomerativeClustering : Recursively merges the pair of
        clusters that minimally increases a given linkage distance.
    FeatureAgglomeration : Similar to AgglomerativeClustering,
        but recursively merges features instead of samples.
    KMeans : K-Means clustering.
    MiniBatchKMeans : Mini-Batch K-Means clustering.
    MeanShift : Mean shift clustering using a flat kernel.
    SpectralClustering : Apply clustering to a projection
        of the normalized Laplacian.

    Notes
    -----
    For an example, see :ref:`examples/cluster/plot_affinity_propagation.py
    <sphx_glr_auto_examples_cluster_plot_affinity_propagation.py>`.

    The algorithmic complexity of affinity propagation is quadratic
    in the number of points.

    When the algorithm does not converge, it will still return arrays of
    ``cluster_center_indices`` and labels if there are any exemplars/clusters,
    however they may be degenerate and should be used with caution.

    When ``fit`` does not converge, ``cluster_centers_`` is still populated
    however it may be degenerate. In such a case, proceed with caution.
    If ``fit`` does not converge and fails to produce any ``cluster_centers_``
    then ``predict`` will label every sample as ``-1``.

    When all training samples have equal similarities and equal preferences,
    the assignment of cluster centers and labels depends on the preference.
    If the preference is smaller than the similarities, ``fit`` will result in
    a single cluster center and label ``0`` for every sample. Otherwise, every
    training sample becomes its own cluster center and is assigned a unique
    label.

    References
    ----------

    Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
    Between Data Points", Science Feb. 2007

    Examples
    --------
    >>> from sklearn.cluster import AffinityPropagation
    >>> import numpy as np
    >>> X = np.array([[1, 2], [1, 4], [1, 0],
    ...               [4, 2], [4, 4], [4, 0]])
    >>> clustering = AffinityPropagation(random_state=5).fit(X)
    >>> clustering
    AffinityPropagation(random_state=5)
    >>> clustering.labels_
    array([0, 0, 0, 1, 1, 1])
    >>> clustering.predict([[0, 0], [4, 4]])
    array([0, 1])
    >>> clustering.cluster_centers_
    array([[1, 2],
           [4, 2]])
    """

    # Declarative parameter validation consumed by ``_fit_context``.
    _parameter_constraints: dict = {
        "damping": [Interval(Real, 0.5, 1.0, closed="left")],
        "max_iter": [Interval(Integral, 1, None, closed="left")],
        "convergence_iter": [Interval(Integral, 1, None, closed="left")],
        "copy": ["boolean"],
        "preference": [
            "array-like",
            Interval(Real, None, None, closed="neither"),
            None,
        ],
        "affinity": [StrOptions({"euclidean", "precomputed"})],
        "verbose": ["verbose"],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        *,
        damping=0.5,
        max_iter=200,
        convergence_iter=15,
        copy=True,
        preference=None,
        affinity="euclidean",
        verbose=False,
        random_state=None,
    ):
        # Store parameters verbatim; validation happens at fit time.
        self.damping = damping
        self.max_iter = max_iter
        self.convergence_iter = convergence_iter
        self.copy = copy
        self.verbose = verbose
        self.preference = preference
        self.affinity = affinity
        self.random_state = random_state

    def _more_tags(self):
        # With a precomputed affinity, X is a sample-by-sample matrix.
        return {"pairwise": self.affinity == "precomputed"}

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Fit the clustering from features, or affinity matrix.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
                array-like of shape (n_samples, n_samples)
            Training instances to cluster, or similarities / affinities between
            instances if ``affinity='precomputed'``. If a sparse feature matrix
            is provided, it will be converted into a sparse ``csr_matrix``.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self
            Returns the instance itself.
        """
        if self.affinity == "precomputed":
            # The algorithm mutates the matrix in place, hence
            # force_writeable (and copy unless the user opted out).
            X = self._validate_data(X, copy=self.copy, force_writeable=True)
            self.affinity_matrix_ = X
        else:  # self.affinity == "euclidean"
            X = self._validate_data(X, accept_sparse="csr")
            self.affinity_matrix_ = -euclidean_distances(X, squared=True)

        if self.affinity_matrix_.shape[0] != self.affinity_matrix_.shape[1]:
            raise ValueError(
                "The matrix of similarities must be a square array. "
                f"Got {self.affinity_matrix_.shape} instead."
            )

        if self.preference is None:
            # Default preference: median similarity (moderate cluster count).
            preference = np.median(self.affinity_matrix_)
        else:
            preference = self.preference
        preference = np.asarray(preference)

        random_state = check_random_state(self.random_state)

        (
            self.cluster_centers_indices_,
            self.labels_,
            self.n_iter_,
        ) = _affinity_propagation(
            self.affinity_matrix_,
            max_iter=self.max_iter,
            convergence_iter=self.convergence_iter,
            preference=preference,
            damping=self.damping,
            verbose=self.verbose,
            return_n_iter=True,
            random_state=random_state,
        )

        if self.affinity != "precomputed":
            # Materialize the exemplars in feature space for predict().
            self.cluster_centers_ = X[self.cluster_centers_indices_].copy()

        return self

    def predict(self, X):
        """Predict the closest cluster each sample in X belongs to.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            New data to predict. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Cluster labels.
        """
        check_is_fitted(self)
        X = self._validate_data(X, reset=False, accept_sparse="csr")
        if not hasattr(self, "cluster_centers_"):
            # cluster_centers_ only exists when fit in feature space.
            raise ValueError(
                "Predict method is not supported when affinity='precomputed'."
            )

        if self.cluster_centers_.shape[0] > 0:
            with config_context(assume_finite=True):
                return pairwise_distances_argmin(X, self.cluster_centers_)
        else:
            warnings.warn(
                (
                    "This model does not have any cluster centers "
                    "because affinity propagation did not converge. "
                    "Labeling every sample as '-1'."
                ),
                ConvergenceWarning,
            )
            return np.array([-1] * X.shape[0])

    def fit_predict(self, X, y=None):
        """Fit clustering from features/affinity matrix; return cluster labels.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
                array-like of shape (n_samples, n_samples)
            Training instances to cluster, or similarities / affinities between
            instances if ``affinity='precomputed'``. If a sparse feature matrix
            is provided, it will be converted into a sparse ``csr_matrix``.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Cluster labels.
        """
        return super().fit_predict(X, y)
|
||||
1346
.venv/lib/python3.12/site-packages/sklearn/cluster/_agglomerative.py
Normal file
1346
.venv/lib/python3.12/site-packages/sklearn/cluster/_agglomerative.py
Normal file
File diff suppressed because it is too large
Load Diff
627
.venv/lib/python3.12/site-packages/sklearn/cluster/_bicluster.py
Normal file
627
.venv/lib/python3.12/site-packages/sklearn/cluster/_bicluster.py
Normal file
@@ -0,0 +1,627 @@
|
||||
"""Spectral biclustering algorithms."""
|
||||
|
||||
# Authors : Kemal Eren
|
||||
# License: BSD 3 clause
|
||||
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from numbers import Integral
|
||||
|
||||
import numpy as np
|
||||
from scipy.linalg import norm
|
||||
from scipy.sparse import dia_matrix, issparse
|
||||
from scipy.sparse.linalg import eigsh, svds
|
||||
|
||||
from ..base import BaseEstimator, BiclusterMixin, _fit_context
|
||||
from ..utils import check_random_state, check_scalar
|
||||
from ..utils._param_validation import Interval, StrOptions
|
||||
from ..utils.extmath import make_nonnegative, randomized_svd, safe_sparse_dot
|
||||
from ..utils.validation import assert_all_finite
|
||||
from ._kmeans import KMeans, MiniBatchKMeans
|
||||
|
||||
__all__ = ["SpectralCoclustering", "SpectralBiclustering"]
|
||||
|
||||
|
||||
def _scale_normalize(X):
|
||||
"""Normalize ``X`` by scaling rows and columns independently.
|
||||
|
||||
Returns the normalized matrix and the row and column scaling
|
||||
factors.
|
||||
"""
|
||||
X = make_nonnegative(X)
|
||||
row_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=1))).squeeze()
|
||||
col_diag = np.asarray(1.0 / np.sqrt(X.sum(axis=0))).squeeze()
|
||||
row_diag = np.where(np.isnan(row_diag), 0, row_diag)
|
||||
col_diag = np.where(np.isnan(col_diag), 0, col_diag)
|
||||
if issparse(X):
|
||||
n_rows, n_cols = X.shape
|
||||
r = dia_matrix((row_diag, [0]), shape=(n_rows, n_rows))
|
||||
c = dia_matrix((col_diag, [0]), shape=(n_cols, n_cols))
|
||||
an = r * X * c
|
||||
else:
|
||||
an = row_diag[:, np.newaxis] * X * col_diag
|
||||
return an, row_diag, col_diag
|
||||
|
||||
|
||||
def _bistochastic_normalize(X, max_iter=1000, tol=1e-5):
|
||||
"""Normalize rows and columns of ``X`` simultaneously so that all
|
||||
rows sum to one constant and all columns sum to a different
|
||||
constant.
|
||||
"""
|
||||
# According to paper, this can also be done more efficiently with
|
||||
# deviation reduction and balancing algorithms.
|
||||
X = make_nonnegative(X)
|
||||
X_scaled = X
|
||||
for _ in range(max_iter):
|
||||
X_new, _, _ = _scale_normalize(X_scaled)
|
||||
if issparse(X):
|
||||
dist = norm(X_scaled.data - X.data)
|
||||
else:
|
||||
dist = norm(X_scaled - X_new)
|
||||
X_scaled = X_new
|
||||
if dist is not None and dist < tol:
|
||||
break
|
||||
return X_scaled
|
||||
|
||||
|
||||
def _log_normalize(X):
|
||||
"""Normalize ``X`` according to Kluger's log-interactions scheme."""
|
||||
X = make_nonnegative(X, min_value=1)
|
||||
if issparse(X):
|
||||
raise ValueError(
|
||||
"Cannot compute log of a sparse matrix,"
|
||||
" because log(x) diverges to -infinity as x"
|
||||
" goes to 0."
|
||||
)
|
||||
L = np.log(X)
|
||||
row_avg = L.mean(axis=1)[:, np.newaxis]
|
||||
col_avg = L.mean(axis=0)
|
||||
avg = L.mean()
|
||||
return L - row_avg - col_avg + avg
|
||||
|
||||
|
||||
class BaseSpectral(BiclusterMixin, BaseEstimator, metaclass=ABCMeta):
|
||||
"""Base class for spectral biclustering."""
|
||||
|
||||
_parameter_constraints: dict = {
|
||||
"svd_method": [StrOptions({"randomized", "arpack"})],
|
||||
"n_svd_vecs": [Interval(Integral, 0, None, closed="left"), None],
|
||||
"mini_batch": ["boolean"],
|
||||
"init": [StrOptions({"k-means++", "random"}), np.ndarray],
|
||||
"n_init": [Interval(Integral, 1, None, closed="left")],
|
||||
"random_state": ["random_state"],
|
||||
}
|
||||
|
||||
@abstractmethod
|
||||
def __init__(
|
||||
self,
|
||||
n_clusters=3,
|
||||
svd_method="randomized",
|
||||
n_svd_vecs=None,
|
||||
mini_batch=False,
|
||||
init="k-means++",
|
||||
n_init=10,
|
||||
random_state=None,
|
||||
):
|
||||
self.n_clusters = n_clusters
|
||||
self.svd_method = svd_method
|
||||
self.n_svd_vecs = n_svd_vecs
|
||||
self.mini_batch = mini_batch
|
||||
self.init = init
|
||||
self.n_init = n_init
|
||||
self.random_state = random_state
|
||||
|
||||
@abstractmethod
|
||||
def _check_parameters(self, n_samples):
|
||||
"""Validate parameters depending on the input data."""
|
||||
|
||||
@_fit_context(prefer_skip_nested_validation=True)
|
||||
def fit(self, X, y=None):
|
||||
"""Create a biclustering for X.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Training data.
|
||||
|
||||
y : Ignored
|
||||
Not used, present for API consistency by convention.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : object
|
||||
SpectralBiclustering instance.
|
||||
"""
|
||||
X = self._validate_data(X, accept_sparse="csr", dtype=np.float64)
|
||||
self._check_parameters(X.shape[0])
|
||||
self._fit(X)
|
||||
return self
|
||||
|
||||
def _svd(self, array, n_components, n_discard):
|
||||
"""Returns first `n_components` left and right singular
|
||||
vectors u and v, discarding the first `n_discard`.
|
||||
"""
|
||||
if self.svd_method == "randomized":
|
||||
kwargs = {}
|
||||
if self.n_svd_vecs is not None:
|
||||
kwargs["n_oversamples"] = self.n_svd_vecs
|
||||
u, _, vt = randomized_svd(
|
||||
array, n_components, random_state=self.random_state, **kwargs
|
||||
)
|
||||
|
||||
elif self.svd_method == "arpack":
|
||||
u, _, vt = svds(array, k=n_components, ncv=self.n_svd_vecs)
|
||||
if np.any(np.isnan(vt)):
|
||||
# some eigenvalues of A * A.T are negative, causing
|
||||
# sqrt() to be np.nan. This causes some vectors in vt
|
||||
# to be np.nan.
|
||||
A = safe_sparse_dot(array.T, array)
|
||||
random_state = check_random_state(self.random_state)
|
||||
# initialize with [-1,1] as in ARPACK
|
||||
v0 = random_state.uniform(-1, 1, A.shape[0])
|
||||
_, v = eigsh(A, ncv=self.n_svd_vecs, v0=v0)
|
||||
vt = v.T
|
||||
if np.any(np.isnan(u)):
|
||||
A = safe_sparse_dot(array, array.T)
|
||||
random_state = check_random_state(self.random_state)
|
||||
# initialize with [-1,1] as in ARPACK
|
||||
v0 = random_state.uniform(-1, 1, A.shape[0])
|
||||
_, u = eigsh(A, ncv=self.n_svd_vecs, v0=v0)
|
||||
|
||||
assert_all_finite(u)
|
||||
assert_all_finite(vt)
|
||||
u = u[:, n_discard:]
|
||||
vt = vt[n_discard:]
|
||||
return u, vt.T
|
||||
|
||||
def _k_means(self, data, n_clusters):
|
||||
if self.mini_batch:
|
||||
model = MiniBatchKMeans(
|
||||
n_clusters,
|
||||
init=self.init,
|
||||
n_init=self.n_init,
|
||||
random_state=self.random_state,
|
||||
)
|
||||
else:
|
||||
model = KMeans(
|
||||
n_clusters,
|
||||
init=self.init,
|
||||
n_init=self.n_init,
|
||||
random_state=self.random_state,
|
||||
)
|
||||
model.fit(data)
|
||||
centroid = model.cluster_centers_
|
||||
labels = model.labels_
|
||||
return centroid, labels
|
||||
|
||||
def _more_tags(self):
|
||||
return {
|
||||
"_xfail_checks": {
|
||||
"check_estimators_dtypes": "raises nan error",
|
||||
"check_fit2d_1sample": "_scale_normalize fails",
|
||||
"check_fit2d_1feature": "raises apply_along_axis error",
|
||||
"check_estimator_sparse_matrix": "does not fail gracefully",
|
||||
"check_estimator_sparse_array": "does not fail gracefully",
|
||||
"check_methods_subset_invariance": "empty array passed inside",
|
||||
"check_dont_overwrite_parameters": "empty array passed inside",
|
||||
"check_fit2d_predict1d": "empty array passed inside",
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class SpectralCoclustering(BaseSpectral):
|
||||
"""Spectral Co-Clustering algorithm (Dhillon, 2001).
|
||||
|
||||
Clusters rows and columns of an array `X` to solve the relaxed
|
||||
normalized cut of the bipartite graph created from `X` as follows:
|
||||
the edge between row vertex `i` and column vertex `j` has weight
|
||||
`X[i, j]`.
|
||||
|
||||
The resulting bicluster structure is block-diagonal, since each
|
||||
row and each column belongs to exactly one bicluster.
|
||||
|
||||
Supports sparse matrices, as long as they are nonnegative.
|
||||
|
||||
Read more in the :ref:`User Guide <spectral_coclustering>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_clusters : int, default=3
|
||||
The number of biclusters to find.
|
||||
|
||||
svd_method : {'randomized', 'arpack'}, default='randomized'
|
||||
Selects the algorithm for finding singular vectors. May be
|
||||
'randomized' or 'arpack'. If 'randomized', use
|
||||
:func:`sklearn.utils.extmath.randomized_svd`, which may be faster
|
||||
for large matrices. If 'arpack', use
|
||||
:func:`scipy.sparse.linalg.svds`, which is more accurate, but
|
||||
possibly slower in some cases.
|
||||
|
||||
n_svd_vecs : int, default=None
|
||||
Number of vectors to use in calculating the SVD. Corresponds
|
||||
to `ncv` when `svd_method=arpack` and `n_oversamples` when
|
||||
`svd_method` is 'randomized`.
|
||||
|
||||
mini_batch : bool, default=False
|
||||
Whether to use mini-batch k-means, which is faster but may get
|
||||
different results.
|
||||
|
||||
init : {'k-means++', 'random'}, or ndarray of shape \
|
||||
(n_clusters, n_features), default='k-means++'
|
||||
Method for initialization of k-means algorithm; defaults to
|
||||
'k-means++'.
|
||||
|
||||
n_init : int, default=10
|
||||
Number of random initializations that are tried with the
|
||||
k-means algorithm.
|
||||
|
||||
If mini-batch k-means is used, the best initialization is
|
||||
chosen and the algorithm runs once. Otherwise, the algorithm
|
||||
is run for each initialization and the best solution chosen.
|
||||
|
||||
random_state : int, RandomState instance, default=None
|
||||
Used for randomizing the singular value decomposition and the k-means
|
||||
initialization. Use an int to make the randomness deterministic.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
rows_ : array-like of shape (n_row_clusters, n_rows)
|
||||
Results of the clustering. `rows[i, r]` is True if
|
||||
cluster `i` contains row `r`. Available only after calling ``fit``.
|
||||
|
||||
columns_ : array-like of shape (n_column_clusters, n_columns)
|
||||
Results of the clustering, like `rows`.
|
||||
|
||||
row_labels_ : array-like of shape (n_rows,)
|
||||
The bicluster label of each row.
|
||||
|
||||
column_labels_ : array-like of shape (n_cols,)
|
||||
The bicluster label of each column.
|
||||
|
||||
biclusters_ : tuple of two ndarrays
|
||||
The tuple contains the `rows_` and `columns_` arrays.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
See Also
|
||||
--------
|
||||
SpectralBiclustering : Partitions rows and columns under the assumption
|
||||
that the data has an underlying checkerboard structure.
|
||||
|
||||
References
|
||||
----------
|
||||
* :doi:`Dhillon, Inderjit S, 2001. Co-clustering documents and words using
|
||||
bipartite spectral graph partitioning.
|
||||
<10.1145/502512.502550>`
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.cluster import SpectralCoclustering
|
||||
>>> import numpy as np
|
||||
>>> X = np.array([[1, 1], [2, 1], [1, 0],
|
||||
... [4, 7], [3, 5], [3, 6]])
|
||||
>>> clustering = SpectralCoclustering(n_clusters=2, random_state=0).fit(X)
|
||||
>>> clustering.row_labels_ #doctest: +SKIP
|
||||
array([0, 1, 1, 0, 0, 0], dtype=int32)
|
||||
>>> clustering.column_labels_ #doctest: +SKIP
|
||||
array([0, 0], dtype=int32)
|
||||
>>> clustering
|
||||
SpectralCoclustering(n_clusters=2, random_state=0)
|
||||
"""
|
||||
|
||||
_parameter_constraints: dict = {
|
||||
**BaseSpectral._parameter_constraints,
|
||||
"n_clusters": [Interval(Integral, 1, None, closed="left")],
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
n_clusters=3,
|
||||
*,
|
||||
svd_method="randomized",
|
||||
n_svd_vecs=None,
|
||||
mini_batch=False,
|
||||
init="k-means++",
|
||||
n_init=10,
|
||||
random_state=None,
|
||||
):
|
||||
super().__init__(
|
||||
n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, random_state
|
||||
)
|
||||
|
||||
def _check_parameters(self, n_samples):
|
||||
if self.n_clusters > n_samples:
|
||||
raise ValueError(
|
||||
f"n_clusters should be <= n_samples={n_samples}. Got"
|
||||
f" {self.n_clusters} instead."
|
||||
)
|
||||
|
||||
def _fit(self, X):
|
||||
normalized_data, row_diag, col_diag = _scale_normalize(X)
|
||||
n_sv = 1 + int(np.ceil(np.log2(self.n_clusters)))
|
||||
u, v = self._svd(normalized_data, n_sv, n_discard=1)
|
||||
z = np.vstack((row_diag[:, np.newaxis] * u, col_diag[:, np.newaxis] * v))
|
||||
|
||||
_, labels = self._k_means(z, self.n_clusters)
|
||||
|
||||
n_rows = X.shape[0]
|
||||
self.row_labels_ = labels[:n_rows]
|
||||
self.column_labels_ = labels[n_rows:]
|
||||
|
||||
self.rows_ = np.vstack([self.row_labels_ == c for c in range(self.n_clusters)])
|
||||
self.columns_ = np.vstack(
|
||||
[self.column_labels_ == c for c in range(self.n_clusters)]
|
||||
)
|
||||
|
||||
|
||||
class SpectralBiclustering(BaseSpectral):
|
||||
"""Spectral biclustering (Kluger, 2003).
|
||||
|
||||
Partitions rows and columns under the assumption that the data has
|
||||
an underlying checkerboard structure. For instance, if there are
|
||||
two row partitions and three column partitions, each row will
|
||||
belong to three biclusters, and each column will belong to two
|
||||
biclusters. The outer product of the corresponding row and column
|
||||
label vectors gives this checkerboard structure.
|
||||
|
||||
Read more in the :ref:`User Guide <spectral_biclustering>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_clusters : int or tuple (n_row_clusters, n_column_clusters), default=3
|
||||
The number of row and column clusters in the checkerboard
|
||||
structure.
|
||||
|
||||
method : {'bistochastic', 'scale', 'log'}, default='bistochastic'
|
||||
Method of normalizing and converting singular vectors into
|
||||
biclusters. May be one of 'scale', 'bistochastic', or 'log'.
|
||||
The authors recommend using 'log'. If the data is sparse,
|
||||
however, log normalization will not work, which is why the
|
||||
default is 'bistochastic'.
|
||||
|
||||
.. warning::
|
||||
if `method='log'`, the data must not be sparse.
|
||||
|
||||
n_components : int, default=6
|
||||
Number of singular vectors to check.
|
||||
|
||||
n_best : int, default=3
|
||||
Number of best singular vectors to which to project the data
|
||||
for clustering.
|
||||
|
||||
svd_method : {'randomized', 'arpack'}, default='randomized'
|
||||
Selects the algorithm for finding singular vectors. May be
|
||||
'randomized' or 'arpack'. If 'randomized', uses
|
||||
:func:`~sklearn.utils.extmath.randomized_svd`, which may be faster
|
||||
for large matrices. If 'arpack', uses
|
||||
`scipy.sparse.linalg.svds`, which is more accurate, but
|
||||
possibly slower in some cases.
|
||||
|
||||
n_svd_vecs : int, default=None
|
||||
Number of vectors to use in calculating the SVD. Corresponds
|
||||
to `ncv` when `svd_method=arpack` and `n_oversamples` when
|
||||
`svd_method` is 'randomized`.
|
||||
|
||||
mini_batch : bool, default=False
|
||||
Whether to use mini-batch k-means, which is faster but may get
|
||||
different results.
|
||||
|
||||
init : {'k-means++', 'random'} or ndarray of shape (n_clusters, n_features), \
|
||||
default='k-means++'
|
||||
Method for initialization of k-means algorithm; defaults to
|
||||
'k-means++'.
|
||||
|
||||
n_init : int, default=10
|
||||
Number of random initializations that are tried with the
|
||||
k-means algorithm.
|
||||
|
||||
If mini-batch k-means is used, the best initialization is
|
||||
chosen and the algorithm runs once. Otherwise, the algorithm
|
||||
is run for each initialization and the best solution chosen.
|
||||
|
||||
random_state : int, RandomState instance, default=None
|
||||
Used for randomizing the singular value decomposition and the k-means
|
||||
initialization. Use an int to make the randomness deterministic.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
rows_ : array-like of shape (n_row_clusters, n_rows)
|
||||
Results of the clustering. `rows[i, r]` is True if
|
||||
cluster `i` contains row `r`. Available only after calling ``fit``.
|
||||
|
||||
columns_ : array-like of shape (n_column_clusters, n_columns)
|
||||
Results of the clustering, like `rows`.
|
||||
|
||||
row_labels_ : array-like of shape (n_rows,)
|
||||
Row partition labels.
|
||||
|
||||
column_labels_ : array-like of shape (n_cols,)
|
||||
Column partition labels.
|
||||
|
||||
biclusters_ : tuple of two ndarrays
|
||||
The tuple contains the `rows_` and `columns_` arrays.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
.. versionadded:: 1.0
|
||||
|
||||
See Also
|
||||
--------
|
||||
SpectralCoclustering : Spectral Co-Clustering algorithm (Dhillon, 2001).
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
* :doi:`Kluger, Yuval, et. al., 2003. Spectral biclustering of microarray
|
||||
data: coclustering genes and conditions.
|
||||
<10.1101/gr.648603>`
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.cluster import SpectralBiclustering
|
||||
>>> import numpy as np
|
||||
>>> X = np.array([[1, 1], [2, 1], [1, 0],
|
||||
... [4, 7], [3, 5], [3, 6]])
|
||||
>>> clustering = SpectralBiclustering(n_clusters=2, random_state=0).fit(X)
|
||||
>>> clustering.row_labels_
|
||||
array([1, 1, 1, 0, 0, 0], dtype=int32)
|
||||
>>> clustering.column_labels_
|
||||
array([1, 0], dtype=int32)
|
||||
>>> clustering
|
||||
SpectralBiclustering(n_clusters=2, random_state=0)
|
||||
|
||||
For a more detailed example, see
|
||||
:ref:`sphx_glr_auto_examples_bicluster_plot_spectral_biclustering.py`
|
||||
"""
|
||||
|
||||
_parameter_constraints: dict = {
|
||||
**BaseSpectral._parameter_constraints,
|
||||
"n_clusters": [Interval(Integral, 1, None, closed="left"), tuple],
|
||||
"method": [StrOptions({"bistochastic", "scale", "log"})],
|
||||
"n_components": [Interval(Integral, 1, None, closed="left")],
|
||||
"n_best": [Interval(Integral, 1, None, closed="left")],
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
n_clusters=3,
|
||||
*,
|
||||
method="bistochastic",
|
||||
n_components=6,
|
||||
n_best=3,
|
||||
svd_method="randomized",
|
||||
n_svd_vecs=None,
|
||||
mini_batch=False,
|
||||
init="k-means++",
|
||||
n_init=10,
|
||||
random_state=None,
|
||||
):
|
||||
super().__init__(
|
||||
n_clusters, svd_method, n_svd_vecs, mini_batch, init, n_init, random_state
|
||||
)
|
||||
self.method = method
|
||||
self.n_components = n_components
|
||||
self.n_best = n_best
|
||||
|
||||
def _check_parameters(self, n_samples):
|
||||
if isinstance(self.n_clusters, Integral):
|
||||
if self.n_clusters > n_samples:
|
||||
raise ValueError(
|
||||
f"n_clusters should be <= n_samples={n_samples}. Got"
|
||||
f" {self.n_clusters} instead."
|
||||
)
|
||||
else: # tuple
|
||||
try:
|
||||
n_row_clusters, n_column_clusters = self.n_clusters
|
||||
check_scalar(
|
||||
n_row_clusters,
|
||||
"n_row_clusters",
|
||||
target_type=Integral,
|
||||
min_val=1,
|
||||
max_val=n_samples,
|
||||
)
|
||||
check_scalar(
|
||||
n_column_clusters,
|
||||
"n_column_clusters",
|
||||
target_type=Integral,
|
||||
min_val=1,
|
||||
max_val=n_samples,
|
||||
)
|
||||
except (ValueError, TypeError) as e:
|
||||
raise ValueError(
|
||||
"Incorrect parameter n_clusters has value:"
|
||||
f" {self.n_clusters}. It should either be a single integer"
|
||||
" or an iterable with two integers:"
|
||||
" (n_row_clusters, n_column_clusters)"
|
||||
" And the values are should be in the"
|
||||
" range: (1, n_samples)"
|
||||
) from e
|
||||
|
||||
if self.n_best > self.n_components:
|
||||
raise ValueError(
|
||||
f"n_best={self.n_best} must be <= n_components={self.n_components}."
|
||||
)
|
||||
|
||||
def _fit(self, X):
|
||||
n_sv = self.n_components
|
||||
if self.method == "bistochastic":
|
||||
normalized_data = _bistochastic_normalize(X)
|
||||
n_sv += 1
|
||||
elif self.method == "scale":
|
||||
normalized_data, _, _ = _scale_normalize(X)
|
||||
n_sv += 1
|
||||
elif self.method == "log":
|
||||
normalized_data = _log_normalize(X)
|
||||
n_discard = 0 if self.method == "log" else 1
|
||||
u, v = self._svd(normalized_data, n_sv, n_discard)
|
||||
ut = u.T
|
||||
vt = v.T
|
||||
|
||||
try:
|
||||
n_row_clusters, n_col_clusters = self.n_clusters
|
||||
except TypeError:
|
||||
n_row_clusters = n_col_clusters = self.n_clusters
|
||||
|
||||
best_ut = self._fit_best_piecewise(ut, self.n_best, n_row_clusters)
|
||||
|
||||
best_vt = self._fit_best_piecewise(vt, self.n_best, n_col_clusters)
|
||||
|
||||
self.row_labels_ = self._project_and_cluster(X, best_vt.T, n_row_clusters)
|
||||
|
||||
self.column_labels_ = self._project_and_cluster(X.T, best_ut.T, n_col_clusters)
|
||||
|
||||
self.rows_ = np.vstack(
|
||||
[
|
||||
self.row_labels_ == label
|
||||
for label in range(n_row_clusters)
|
||||
for _ in range(n_col_clusters)
|
||||
]
|
||||
)
|
||||
self.columns_ = np.vstack(
|
||||
[
|
||||
self.column_labels_ == label
|
||||
for _ in range(n_row_clusters)
|
||||
for label in range(n_col_clusters)
|
||||
]
|
||||
)
|
||||
|
||||
def _fit_best_piecewise(self, vectors, n_best, n_clusters):
|
||||
"""Find the ``n_best`` vectors that are best approximated by piecewise
|
||||
constant vectors.
|
||||
|
||||
The piecewise vectors are found by k-means; the best is chosen
|
||||
according to Euclidean distance.
|
||||
|
||||
"""
|
||||
|
||||
def make_piecewise(v):
|
||||
centroid, labels = self._k_means(v.reshape(-1, 1), n_clusters)
|
||||
return centroid[labels].ravel()
|
||||
|
||||
piecewise_vectors = np.apply_along_axis(make_piecewise, axis=1, arr=vectors)
|
||||
dists = np.apply_along_axis(norm, axis=1, arr=(vectors - piecewise_vectors))
|
||||
result = vectors[np.argsort(dists)[:n_best]]
|
||||
return result
|
||||
|
||||
def _project_and_cluster(self, data, vectors, n_clusters):
|
||||
"""Project ``data`` to ``vectors`` and cluster the result."""
|
||||
projected = safe_sparse_dot(data, vectors)
|
||||
_, labels = self._k_means(projected, n_clusters)
|
||||
return labels
|
||||
741
.venv/lib/python3.12/site-packages/sklearn/cluster/_birch.py
Normal file
741
.venv/lib/python3.12/site-packages/sklearn/cluster/_birch.py
Normal file
@@ -0,0 +1,741 @@
|
||||
# Authors: Manoj Kumar <manojkumarsivaraj334@gmail.com>
|
||||
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
|
||||
# Joel Nothman <joel.nothman@gmail.com>
|
||||
# License: BSD 3 clause
|
||||
|
||||
import warnings
|
||||
from math import sqrt
|
||||
from numbers import Integral, Real
|
||||
|
||||
import numpy as np
|
||||
from scipy import sparse
|
||||
|
||||
from .._config import config_context
|
||||
from ..base import (
|
||||
BaseEstimator,
|
||||
ClassNamePrefixFeaturesOutMixin,
|
||||
ClusterMixin,
|
||||
TransformerMixin,
|
||||
_fit_context,
|
||||
)
|
||||
from ..exceptions import ConvergenceWarning
|
||||
from ..metrics import pairwise_distances_argmin
|
||||
from ..metrics.pairwise import euclidean_distances
|
||||
from ..utils._param_validation import Interval
|
||||
from ..utils.extmath import row_norms
|
||||
from ..utils.validation import check_is_fitted
|
||||
from . import AgglomerativeClustering
|
||||
|
||||
|
||||
def _iterate_sparse_X(X):
|
||||
"""This little hack returns a densified row when iterating over a sparse
|
||||
matrix, instead of constructing a sparse matrix for every row that is
|
||||
expensive.
|
||||
"""
|
||||
n_samples = X.shape[0]
|
||||
X_indices = X.indices
|
||||
X_data = X.data
|
||||
X_indptr = X.indptr
|
||||
|
||||
for i in range(n_samples):
|
||||
row = np.zeros(X.shape[1])
|
||||
startptr, endptr = X_indptr[i], X_indptr[i + 1]
|
||||
nonzero_indices = X_indices[startptr:endptr]
|
||||
row[nonzero_indices] = X_data[startptr:endptr]
|
||||
yield row
|
||||
|
||||
|
||||
def _split_node(node, threshold, branching_factor):
|
||||
"""The node has to be split if there is no place for a new subcluster
|
||||
in the node.
|
||||
1. Two empty nodes and two empty subclusters are initialized.
|
||||
2. The pair of distant subclusters are found.
|
||||
3. The properties of the empty subclusters and nodes are updated
|
||||
according to the nearest distance between the subclusters to the
|
||||
pair of distant subclusters.
|
||||
4. The two nodes are set as children to the two subclusters.
|
||||
"""
|
||||
new_subcluster1 = _CFSubcluster()
|
||||
new_subcluster2 = _CFSubcluster()
|
||||
new_node1 = _CFNode(
|
||||
threshold=threshold,
|
||||
branching_factor=branching_factor,
|
||||
is_leaf=node.is_leaf,
|
||||
n_features=node.n_features,
|
||||
dtype=node.init_centroids_.dtype,
|
||||
)
|
||||
new_node2 = _CFNode(
|
||||
threshold=threshold,
|
||||
branching_factor=branching_factor,
|
||||
is_leaf=node.is_leaf,
|
||||
n_features=node.n_features,
|
||||
dtype=node.init_centroids_.dtype,
|
||||
)
|
||||
new_subcluster1.child_ = new_node1
|
||||
new_subcluster2.child_ = new_node2
|
||||
|
||||
if node.is_leaf:
|
||||
if node.prev_leaf_ is not None:
|
||||
node.prev_leaf_.next_leaf_ = new_node1
|
||||
new_node1.prev_leaf_ = node.prev_leaf_
|
||||
new_node1.next_leaf_ = new_node2
|
||||
new_node2.prev_leaf_ = new_node1
|
||||
new_node2.next_leaf_ = node.next_leaf_
|
||||
if node.next_leaf_ is not None:
|
||||
node.next_leaf_.prev_leaf_ = new_node2
|
||||
|
||||
dist = euclidean_distances(
|
||||
node.centroids_, Y_norm_squared=node.squared_norm_, squared=True
|
||||
)
|
||||
n_clusters = dist.shape[0]
|
||||
|
||||
farthest_idx = np.unravel_index(dist.argmax(), (n_clusters, n_clusters))
|
||||
node1_dist, node2_dist = dist[(farthest_idx,)]
|
||||
|
||||
node1_closer = node1_dist < node2_dist
|
||||
# make sure node1 is closest to itself even if all distances are equal.
|
||||
# This can only happen when all node.centroids_ are duplicates leading to all
|
||||
# distances between centroids being zero.
|
||||
node1_closer[farthest_idx[0]] = True
|
||||
|
||||
for idx, subcluster in enumerate(node.subclusters_):
|
||||
if node1_closer[idx]:
|
||||
new_node1.append_subcluster(subcluster)
|
||||
new_subcluster1.update(subcluster)
|
||||
else:
|
||||
new_node2.append_subcluster(subcluster)
|
||||
new_subcluster2.update(subcluster)
|
||||
return new_subcluster1, new_subcluster2
|
||||
|
||||
|
||||
class _CFNode:
|
||||
"""Each node in a CFTree is called a CFNode.
|
||||
|
||||
The CFNode can have a maximum of branching_factor
|
||||
number of CFSubclusters.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
threshold : float
|
||||
Threshold needed for a new subcluster to enter a CFSubcluster.
|
||||
|
||||
branching_factor : int
|
||||
Maximum number of CF subclusters in each node.
|
||||
|
||||
is_leaf : bool
|
||||
We need to know if the CFNode is a leaf or not, in order to
|
||||
retrieve the final subclusters.
|
||||
|
||||
n_features : int
|
||||
The number of features.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
subclusters_ : list
|
||||
List of subclusters for a particular CFNode.
|
||||
|
||||
prev_leaf_ : _CFNode
|
||||
Useful only if is_leaf is True.
|
||||
|
||||
next_leaf_ : _CFNode
|
||||
next_leaf. Useful only if is_leaf is True.
|
||||
the final subclusters.
|
||||
|
||||
init_centroids_ : ndarray of shape (branching_factor + 1, n_features)
|
||||
Manipulate ``init_centroids_`` throughout rather than centroids_ since
|
||||
the centroids are just a view of the ``init_centroids_`` .
|
||||
|
||||
init_sq_norm_ : ndarray of shape (branching_factor + 1,)
|
||||
manipulate init_sq_norm_ throughout. similar to ``init_centroids_``.
|
||||
|
||||
centroids_ : ndarray of shape (branching_factor + 1, n_features)
|
||||
View of ``init_centroids_``.
|
||||
|
||||
squared_norm_ : ndarray of shape (branching_factor + 1,)
|
||||
View of ``init_sq_norm_``.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, *, threshold, branching_factor, is_leaf, n_features, dtype):
|
||||
self.threshold = threshold
|
||||
self.branching_factor = branching_factor
|
||||
self.is_leaf = is_leaf
|
||||
self.n_features = n_features
|
||||
|
||||
# The list of subclusters, centroids and squared norms
|
||||
# to manipulate throughout.
|
||||
self.subclusters_ = []
|
||||
self.init_centroids_ = np.zeros((branching_factor + 1, n_features), dtype=dtype)
|
||||
self.init_sq_norm_ = np.zeros((branching_factor + 1), dtype)
|
||||
self.squared_norm_ = []
|
||||
self.prev_leaf_ = None
|
||||
self.next_leaf_ = None
|
||||
|
||||
def append_subcluster(self, subcluster):
|
||||
n_samples = len(self.subclusters_)
|
||||
self.subclusters_.append(subcluster)
|
||||
self.init_centroids_[n_samples] = subcluster.centroid_
|
||||
self.init_sq_norm_[n_samples] = subcluster.sq_norm_
|
||||
|
||||
# Keep centroids and squared norm as views. In this way
|
||||
# if we change init_centroids and init_sq_norm_, it is
|
||||
# sufficient,
|
||||
self.centroids_ = self.init_centroids_[: n_samples + 1, :]
|
||||
self.squared_norm_ = self.init_sq_norm_[: n_samples + 1]
|
||||
|
||||
def update_split_subclusters(self, subcluster, new_subcluster1, new_subcluster2):
|
||||
"""Remove a subcluster from a node and update it with the
|
||||
split subclusters.
|
||||
"""
|
||||
ind = self.subclusters_.index(subcluster)
|
||||
self.subclusters_[ind] = new_subcluster1
|
||||
self.init_centroids_[ind] = new_subcluster1.centroid_
|
||||
self.init_sq_norm_[ind] = new_subcluster1.sq_norm_
|
||||
self.append_subcluster(new_subcluster2)
|
||||
|
||||
def insert_cf_subcluster(self, subcluster):
|
||||
"""Insert a new subcluster into the node."""
|
||||
if not self.subclusters_:
|
||||
self.append_subcluster(subcluster)
|
||||
return False
|
||||
|
||||
threshold = self.threshold
|
||||
branching_factor = self.branching_factor
|
||||
# We need to find the closest subcluster among all the
|
||||
# subclusters so that we can insert our new subcluster.
|
||||
dist_matrix = np.dot(self.centroids_, subcluster.centroid_)
|
||||
dist_matrix *= -2.0
|
||||
dist_matrix += self.squared_norm_
|
||||
closest_index = np.argmin(dist_matrix)
|
||||
closest_subcluster = self.subclusters_[closest_index]
|
||||
|
||||
# If the subcluster has a child, we need a recursive strategy.
|
||||
if closest_subcluster.child_ is not None:
|
||||
split_child = closest_subcluster.child_.insert_cf_subcluster(subcluster)
|
||||
|
||||
if not split_child:
|
||||
# If it is determined that the child need not be split, we
|
||||
# can just update the closest_subcluster
|
||||
closest_subcluster.update(subcluster)
|
||||
self.init_centroids_[closest_index] = self.subclusters_[
|
||||
closest_index
|
||||
].centroid_
|
||||
self.init_sq_norm_[closest_index] = self.subclusters_[
|
||||
closest_index
|
||||
].sq_norm_
|
||||
return False
|
||||
|
||||
# things not too good. we need to redistribute the subclusters in
|
||||
# our child node, and add a new subcluster in the parent
|
||||
# subcluster to accommodate the new child.
|
||||
else:
|
||||
new_subcluster1, new_subcluster2 = _split_node(
|
||||
closest_subcluster.child_,
|
||||
threshold,
|
||||
branching_factor,
|
||||
)
|
||||
self.update_split_subclusters(
|
||||
closest_subcluster, new_subcluster1, new_subcluster2
|
||||
)
|
||||
|
||||
if len(self.subclusters_) > self.branching_factor:
|
||||
return True
|
||||
return False
|
||||
|
||||
# good to go!
|
||||
else:
|
||||
merged = closest_subcluster.merge_subcluster(subcluster, self.threshold)
|
||||
if merged:
|
||||
self.init_centroids_[closest_index] = closest_subcluster.centroid_
|
||||
self.init_sq_norm_[closest_index] = closest_subcluster.sq_norm_
|
||||
return False
|
||||
|
||||
# not close to any other subclusters, and we still
|
||||
# have space, so add.
|
||||
elif len(self.subclusters_) < self.branching_factor:
|
||||
self.append_subcluster(subcluster)
|
||||
return False
|
||||
|
||||
# We do not have enough space nor is it closer to an
|
||||
# other subcluster. We need to split.
|
||||
else:
|
||||
self.append_subcluster(subcluster)
|
||||
return True
|
||||
|
||||
|
||||
class _CFSubcluster:
|
||||
"""Each subcluster in a CFNode is called a CFSubcluster.
|
||||
|
||||
A CFSubcluster can have a CFNode has its child.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
linear_sum : ndarray of shape (n_features,), default=None
|
||||
Sample. This is kept optional to allow initialization of empty
|
||||
subclusters.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
n_samples_ : int
|
||||
Number of samples that belong to each subcluster.
|
||||
|
||||
linear_sum_ : ndarray
|
||||
Linear sum of all the samples in a subcluster. Prevents holding
|
||||
all sample data in memory.
|
||||
|
||||
squared_sum_ : float
|
||||
Sum of the squared l2 norms of all samples belonging to a subcluster.
|
||||
|
||||
centroid_ : ndarray of shape (branching_factor + 1, n_features)
|
||||
Centroid of the subcluster. Prevent recomputing of centroids when
|
||||
``CFNode.centroids_`` is called.
|
||||
|
||||
child_ : _CFNode
|
||||
Child Node of the subcluster. Once a given _CFNode is set as the child
|
||||
of the _CFNode, it is set to ``self.child_``.
|
||||
|
||||
sq_norm_ : ndarray of shape (branching_factor + 1,)
|
||||
Squared norm of the subcluster. Used to prevent recomputing when
|
||||
pairwise minimum distances are computed.
|
||||
"""
|
||||
|
||||
def __init__(self, *, linear_sum=None):
|
||||
if linear_sum is None:
|
||||
self.n_samples_ = 0
|
||||
self.squared_sum_ = 0.0
|
||||
self.centroid_ = self.linear_sum_ = 0
|
||||
else:
|
||||
self.n_samples_ = 1
|
||||
self.centroid_ = self.linear_sum_ = linear_sum
|
||||
self.squared_sum_ = self.sq_norm_ = np.dot(
|
||||
self.linear_sum_, self.linear_sum_
|
||||
)
|
||||
self.child_ = None
|
||||
|
||||
def update(self, subcluster):
|
||||
self.n_samples_ += subcluster.n_samples_
|
||||
self.linear_sum_ += subcluster.linear_sum_
|
||||
self.squared_sum_ += subcluster.squared_sum_
|
||||
self.centroid_ = self.linear_sum_ / self.n_samples_
|
||||
self.sq_norm_ = np.dot(self.centroid_, self.centroid_)
|
||||
|
||||
def merge_subcluster(self, nominee_cluster, threshold):
|
||||
"""Check if a cluster is worthy enough to be merged. If
|
||||
yes then merge.
|
||||
"""
|
||||
new_ss = self.squared_sum_ + nominee_cluster.squared_sum_
|
||||
new_ls = self.linear_sum_ + nominee_cluster.linear_sum_
|
||||
new_n = self.n_samples_ + nominee_cluster.n_samples_
|
||||
new_centroid = (1 / new_n) * new_ls
|
||||
new_sq_norm = np.dot(new_centroid, new_centroid)
|
||||
|
||||
# The squared radius of the cluster is defined:
|
||||
# r^2 = sum_i ||x_i - c||^2 / n
|
||||
# with x_i the n points assigned to the cluster and c its centroid:
|
||||
# c = sum_i x_i / n
|
||||
# This can be expanded to:
|
||||
# r^2 = sum_i ||x_i||^2 / n - 2 < sum_i x_i / n, c> + n ||c||^2 / n
|
||||
# and therefore simplifies to:
|
||||
# r^2 = sum_i ||x_i||^2 / n - ||c||^2
|
||||
sq_radius = new_ss / new_n - new_sq_norm
|
||||
|
||||
if sq_radius <= threshold**2:
|
||||
(
|
||||
self.n_samples_,
|
||||
self.linear_sum_,
|
||||
self.squared_sum_,
|
||||
self.centroid_,
|
||||
self.sq_norm_,
|
||||
) = (new_n, new_ls, new_ss, new_centroid, new_sq_norm)
|
||||
return True
|
||||
return False
|
||||
|
||||
@property
|
||||
def radius(self):
|
||||
"""Return radius of the subcluster"""
|
||||
# Because of numerical issues, this could become negative
|
||||
sq_radius = self.squared_sum_ / self.n_samples_ - self.sq_norm_
|
||||
return sqrt(max(0, sq_radius))
|
||||
|
||||
|
||||
class Birch(
    ClassNamePrefixFeaturesOutMixin, ClusterMixin, TransformerMixin, BaseEstimator
):
    """Implements the BIRCH clustering algorithm.

    It is a memory-efficient, online-learning algorithm provided as an
    alternative to :class:`MiniBatchKMeans`. It constructs a tree
    data structure with the cluster centroids being read off the leaf.
    These can be either the final cluster centroids or can be provided as input
    to another clustering algorithm such as :class:`AgglomerativeClustering`.

    Read more in the :ref:`User Guide <birch>`.

    .. versionadded:: 0.16

    Parameters
    ----------
    threshold : float, default=0.5
        The radius of the subcluster obtained by merging a new sample and the
        closest subcluster should be lesser than the threshold. Otherwise a new
        subcluster is started. Setting this value to be very low promotes
        splitting and vice-versa.

    branching_factor : int, default=50
        Maximum number of CF subclusters in each node. If a new samples enters
        such that the number of subclusters exceed the branching_factor then
        that node is split into two nodes with the subclusters redistributed
        in each. The parent subcluster of that node is removed and two new
        subclusters are added as parents of the 2 split nodes.

    n_clusters : int, instance of sklearn.cluster model or None, default=3
        Number of clusters after the final clustering step, which treats the
        subclusters from the leaves as new samples.

        - `None` : the final clustering step is not performed and the
          subclusters are returned as they are.

        - :mod:`sklearn.cluster` Estimator : If a model is provided, the model
          is fit treating the subclusters as new samples and the initial data
          is mapped to the label of the closest subcluster.

        - `int` : the model fit is :class:`AgglomerativeClustering` with
          `n_clusters` set to be equal to the int.

    compute_labels : bool, default=True
        Whether or not to compute labels for each fit.

    copy : bool, default=True
        Whether or not to make a copy of the given data. If set to False,
        the initial data will be overwritten.

    Attributes
    ----------
    root_ : _CFNode
        Root of the CFTree.

    dummy_leaf_ : _CFNode
        Start pointer to all the leaves.

    subcluster_centers_ : ndarray
        Centroids of all subclusters read directly from the leaves.

    subcluster_labels_ : ndarray
        Labels assigned to the centroids of the subclusters after
        they are clustered globally.

    labels_ : ndarray of shape (n_samples,)
        Array of labels assigned to the input data.
        if partial_fit is used instead of fit, they are assigned to the
        last batch of data.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    MiniBatchKMeans : Alternative implementation that does incremental updates
        of the centers' positions using mini-batches.

    Notes
    -----
    The tree data structure consists of nodes with each node consisting of
    a number of subclusters. The maximum number of subclusters in a node
    is determined by the branching factor. Each subcluster maintains a
    linear sum, squared sum and the number of samples in that subcluster.
    In addition, each subcluster can also have a node as its child, if the
    subcluster is not a member of a leaf node.

    For a new point entering the root, it is merged with the subcluster closest
    to it and the linear sum, squared sum and the number of samples of that
    subcluster are updated. This is done recursively till the properties of
    the leaf node are updated.

    References
    ----------
    * Tian Zhang, Raghu Ramakrishnan, Maron Livny
      BIRCH: An efficient data clustering method for large databases.
      https://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf

    * Roberto Perdisci
      JBirch - Java implementation of BIRCH clustering algorithm
      https://code.google.com/archive/p/jbirch

    Examples
    --------
    >>> from sklearn.cluster import Birch
    >>> X = [[0, 1], [0.3, 1], [-0.3, 1], [0, -1], [0.3, -1], [-0.3, -1]]
    >>> brc = Birch(n_clusters=None)
    >>> brc.fit(X)
    Birch(n_clusters=None)
    >>> brc.predict(X)
    array([0, 0, 0, 1, 1, 1])
    """

    _parameter_constraints: dict = {
        "threshold": [Interval(Real, 0.0, None, closed="neither")],
        "branching_factor": [Interval(Integral, 1, None, closed="neither")],
        "n_clusters": [None, ClusterMixin, Interval(Integral, 1, None, closed="left")],
        "compute_labels": ["boolean"],
        "copy": ["boolean"],
    }

    def __init__(
        self,
        *,
        threshold=0.5,
        branching_factor=50,
        n_clusters=3,
        compute_labels=True,
        copy=True,
    ):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.n_clusters = n_clusters
        self.compute_labels = compute_labels
        self.copy = copy

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """
        Build a CF Tree for the input data.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input data.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self
            Fitted estimator.
        """
        return self._fit(X, partial=False)

    def _fit(self, X, partial):
        # A pre-existing tree is reused only when `partial` is True (i.e. the
        # call came through partial_fit) and a root has already been built.
        has_root = getattr(self, "root_", None)
        first_call = not (partial and has_root)

        X = self._validate_data(
            X,
            accept_sparse="csr",
            copy=self.copy,
            reset=first_call,
            dtype=[np.float64, np.float32],
        )
        threshold = self.threshold
        branching_factor = self.branching_factor

        n_samples, n_features = X.shape

        # If partial_fit is called for the first time or fit is called, we
        # start a new tree.
        if first_call:
            # The first root is the leaf. Manipulate this object throughout.
            self.root_ = _CFNode(
                threshold=threshold,
                branching_factor=branching_factor,
                is_leaf=True,
                n_features=n_features,
                dtype=X.dtype,
            )

            # To enable getting back subclusters.
            self.dummy_leaf_ = _CFNode(
                threshold=threshold,
                branching_factor=branching_factor,
                is_leaf=True,
                n_features=n_features,
                dtype=X.dtype,
            )
            self.dummy_leaf_.next_leaf_ = self.root_
            self.root_.prev_leaf_ = self.dummy_leaf_

        # Cannot vectorize. Enough to convince to use cython.
        if not sparse.issparse(X):
            iter_func = iter
        else:
            iter_func = _iterate_sparse_X

        for sample in iter_func(X):
            subcluster = _CFSubcluster(linear_sum=sample)
            split = self.root_.insert_cf_subcluster(subcluster)

            if split:
                # The root overflowed: split it and grow the tree one level by
                # installing a new non-leaf root over the two halves.
                new_subcluster1, new_subcluster2 = _split_node(
                    self.root_, threshold, branching_factor
                )
                del self.root_
                self.root_ = _CFNode(
                    threshold=threshold,
                    branching_factor=branching_factor,
                    is_leaf=False,
                    n_features=n_features,
                    dtype=X.dtype,
                )
                self.root_.append_subcluster(new_subcluster1)
                self.root_.append_subcluster(new_subcluster2)

        centroids = np.concatenate([leaf.centroids_ for leaf in self._get_leaves()])
        self.subcluster_centers_ = centroids
        self._n_features_out = self.subcluster_centers_.shape[0]

        self._global_clustering(X)
        return self

    def _get_leaves(self):
        """
        Retrieve the leaves of the CF Node.

        Returns
        -------
        leaves : list of shape (n_leaves,)
            List of the leaf nodes.
        """
        # Walk the singly-linked leaf chain that starts at the dummy leaf.
        leaf_ptr = self.dummy_leaf_.next_leaf_
        leaves = []
        while leaf_ptr is not None:
            leaves.append(leaf_ptr)
            leaf_ptr = leaf_ptr.next_leaf_
        return leaves

    @_fit_context(prefer_skip_nested_validation=True)
    def partial_fit(self, X=None, y=None):
        """
        Online learning. Prevents rebuilding of CFTree from scratch.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features), \
                default=None
            Input data. If X is not provided, only the global clustering
            step is done.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self
            Fitted estimator.
        """
        if X is None:
            # Perform just the final global clustering step.
            self._global_clustering()
            return self
        else:
            return self._fit(X, partial=True)

    def _check_fit(self, X):
        # Validate that the estimator is fitted and that X is dimensionally
        # compatible with the training data.
        check_is_fitted(self)

        if (
            hasattr(self, "subcluster_centers_")
            and X.shape[1] != self.subcluster_centers_.shape[1]
        ):
            raise ValueError(
                "Training data and predicted data do not have same number of features."
            )

    def predict(self, X):
        """
        Predict data using the ``centroids_`` of subclusters.

        Avoid computation of the row norms of X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input data.

        Returns
        -------
        labels : ndarray of shape(n_samples,)
            Labelled data.
        """
        check_is_fitted(self)
        X = self._validate_data(X, accept_sparse="csr", reset=False)
        return self._predict(X)

    def _predict(self, X):
        """Predict data using the ``centroids_`` of subclusters."""
        # Reuse the pre-computed squared norms of the subcluster centers.
        kwargs = {"Y_norm_squared": self._subcluster_norms}

        with config_context(assume_finite=True):
            argmin = pairwise_distances_argmin(
                X, self.subcluster_centers_, metric_kwargs=kwargs
            )
        return self.subcluster_labels_[argmin]

    def transform(self, X):
        """
        Transform X into subcluster centroids dimension.

        Each dimension represents the distance from the sample point to each
        cluster centroid.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Input data.

        Returns
        -------
        X_trans : {array-like, sparse matrix} of shape (n_samples, n_clusters)
            Transformed data.
        """
        check_is_fitted(self)
        X = self._validate_data(X, accept_sparse="csr", reset=False)
        with config_context(assume_finite=True):
            return euclidean_distances(X, self.subcluster_centers_)

    def _global_clustering(self, X=None):
        """
        Global clustering for the subclusters obtained after fitting
        """
        clusterer = self.n_clusters
        centroids = self.subcluster_centers_
        compute_labels = (X is not None) and self.compute_labels

        # Preprocessing for the global clustering.
        not_enough_centroids = False
        if isinstance(clusterer, Integral):
            clusterer = AgglomerativeClustering(n_clusters=self.n_clusters)
            # There is no need to perform the global clustering step.
            if len(centroids) < self.n_clusters:
                not_enough_centroids = True

        # To use in predict to avoid recalculation.
        self._subcluster_norms = row_norms(self.subcluster_centers_, squared=True)

        if clusterer is None or not_enough_centroids:
            self.subcluster_labels_ = np.arange(len(centroids))
            if not_enough_centroids:
                warnings.warn(
                    "Number of subclusters found (%d) by BIRCH is less "
                    "than (%d). Decrease the threshold."
                    % (len(centroids), self.n_clusters),
                    ConvergenceWarning,
                )
        else:
            # The global clustering step that clusters the subclusters of
            # the leaves. It assumes the centroids of the subclusters as
            # samples and finds the final centroids.
            self.subcluster_labels_ = clusterer.fit_predict(self.subcluster_centers_)

        if compute_labels:
            self.labels_ = self._predict(X)

    def _more_tags(self):
        return {"preserves_dtype": [np.float64, np.float32]}
||||
@@ -0,0 +1,530 @@
|
||||
"""Bisecting K-means clustering."""
|
||||
|
||||
# Author: Michal Krawczyk <mkrwczyk.1@gmail.com>
|
||||
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
|
||||
from ..base import _fit_context
|
||||
from ..utils._openmp_helpers import _openmp_effective_n_threads
|
||||
from ..utils._param_validation import Integral, Interval, StrOptions
|
||||
from ..utils.extmath import row_norms
|
||||
from ..utils.validation import _check_sample_weight, check_is_fitted, check_random_state
|
||||
from ._k_means_common import _inertia_dense, _inertia_sparse
|
||||
from ._kmeans import (
|
||||
_BaseKMeans,
|
||||
_kmeans_single_elkan,
|
||||
_kmeans_single_lloyd,
|
||||
_labels_inertia_threadpool_limit,
|
||||
)
|
||||
|
||||
|
||||
class _BisectingTree:
|
||||
"""Tree structure representing the hierarchical clusters of BisectingKMeans."""
|
||||
|
||||
def __init__(self, center, indices, score):
|
||||
"""Create a new cluster node in the tree.
|
||||
|
||||
The node holds the center of this cluster and the indices of the data points
|
||||
that belong to it.
|
||||
"""
|
||||
self.center = center
|
||||
self.indices = indices
|
||||
self.score = score
|
||||
|
||||
self.left = None
|
||||
self.right = None
|
||||
|
||||
def split(self, labels, centers, scores):
|
||||
"""Split the cluster node into two subclusters."""
|
||||
self.left = _BisectingTree(
|
||||
indices=self.indices[labels == 0], center=centers[0], score=scores[0]
|
||||
)
|
||||
self.right = _BisectingTree(
|
||||
indices=self.indices[labels == 1], center=centers[1], score=scores[1]
|
||||
)
|
||||
|
||||
# reset the indices attribute to save memory
|
||||
self.indices = None
|
||||
|
||||
def get_cluster_to_bisect(self):
|
||||
"""Return the cluster node to bisect next.
|
||||
|
||||
It's based on the score of the cluster, which can be either the number of
|
||||
data points assigned to that cluster or the inertia of that cluster
|
||||
(see `bisecting_strategy` for details).
|
||||
"""
|
||||
max_score = None
|
||||
|
||||
for cluster_leaf in self.iter_leaves():
|
||||
if max_score is None or cluster_leaf.score > max_score:
|
||||
max_score = cluster_leaf.score
|
||||
best_cluster_leaf = cluster_leaf
|
||||
|
||||
return best_cluster_leaf
|
||||
|
||||
def iter_leaves(self):
|
||||
"""Iterate over all the cluster leaves in the tree."""
|
||||
if self.left is None:
|
||||
yield self
|
||||
else:
|
||||
yield from self.left.iter_leaves()
|
||||
yield from self.right.iter_leaves()
|
||||
|
||||
|
||||
class BisectingKMeans(_BaseKMeans):
|
||||
"""Bisecting K-Means clustering.
|
||||
|
||||
Read more in the :ref:`User Guide <bisect_k_means>`.
|
||||
|
||||
.. versionadded:: 1.1
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_clusters : int, default=8
|
||||
The number of clusters to form as well as the number of
|
||||
centroids to generate.
|
||||
|
||||
init : {'k-means++', 'random'} or callable, default='random'
|
||||
Method for initialization:
|
||||
|
||||
'k-means++' : selects initial cluster centers for k-mean
|
||||
clustering in a smart way to speed up convergence. See section
|
||||
Notes in k_init for more details.
|
||||
|
||||
'random': choose `n_clusters` observations (rows) at random from data
|
||||
for the initial centroids.
|
||||
|
||||
If a callable is passed, it should take arguments X, n_clusters and a
|
||||
random state and return an initialization.
|
||||
|
||||
n_init : int, default=1
|
||||
Number of time the inner k-means algorithm will be run with different
|
||||
centroid seeds in each bisection.
|
||||
That will result producing for each bisection best output of n_init
|
||||
consecutive runs in terms of inertia.
|
||||
|
||||
random_state : int, RandomState instance or None, default=None
|
||||
Determines random number generation for centroid initialization
|
||||
in inner K-Means. Use an int to make the randomness deterministic.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
max_iter : int, default=300
|
||||
Maximum number of iterations of the inner k-means algorithm at each
|
||||
bisection.
|
||||
|
||||
verbose : int, default=0
|
||||
Verbosity mode.
|
||||
|
||||
tol : float, default=1e-4
|
||||
Relative tolerance with regards to Frobenius norm of the difference
|
||||
in the cluster centers of two consecutive iterations to declare
|
||||
convergence. Used in inner k-means algorithm at each bisection to pick
|
||||
best possible clusters.
|
||||
|
||||
copy_x : bool, default=True
|
||||
When pre-computing distances it is more numerically accurate to center
|
||||
the data first. If copy_x is True (default), then the original data is
|
||||
not modified. If False, the original data is modified, and put back
|
||||
before the function returns, but small numerical differences may be
|
||||
introduced by subtracting and then adding the data mean. Note that if
|
||||
the original data is not C-contiguous, a copy will be made even if
|
||||
copy_x is False. If the original data is sparse, but not in CSR format,
|
||||
a copy will be made even if copy_x is False.
|
||||
|
||||
algorithm : {"lloyd", "elkan"}, default="lloyd"
|
||||
Inner K-means algorithm used in bisection.
|
||||
The classical EM-style algorithm is `"lloyd"`.
|
||||
The `"elkan"` variation can be more efficient on some datasets with
|
||||
well-defined clusters, by using the triangle inequality. However it's
|
||||
more memory intensive due to the allocation of an extra array of shape
|
||||
`(n_samples, n_clusters)`.
|
||||
|
||||
bisecting_strategy : {"biggest_inertia", "largest_cluster"},\
|
||||
default="biggest_inertia"
|
||||
Defines how bisection should be performed:
|
||||
|
||||
- "biggest_inertia" means that BisectingKMeans will always check
|
||||
all calculated cluster for cluster with biggest SSE
|
||||
(Sum of squared errors) and bisect it. This approach concentrates on
|
||||
precision, but may be costly in terms of execution time (especially for
|
||||
larger amount of data points).
|
||||
|
||||
- "largest_cluster" - BisectingKMeans will always split cluster with
|
||||
largest amount of points assigned to it from all clusters
|
||||
previously calculated. That should work faster than picking by SSE
|
||||
('biggest_inertia') and may produce similar results in most cases.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
cluster_centers_ : ndarray of shape (n_clusters, n_features)
|
||||
Coordinates of cluster centers. If the algorithm stops before fully
|
||||
converging (see ``tol`` and ``max_iter``), these will not be
|
||||
consistent with ``labels_``.
|
||||
|
||||
labels_ : ndarray of shape (n_samples,)
|
||||
Labels of each point.
|
||||
|
||||
inertia_ : float
|
||||
Sum of squared distances of samples to their closest cluster center,
|
||||
weighted by the sample weights if provided.
|
||||
|
||||
n_features_in_ : int
|
||||
Number of features seen during :term:`fit`.
|
||||
|
||||
feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
||||
Names of features seen during :term:`fit`. Defined only when `X`
|
||||
has feature names that are all strings.
|
||||
|
||||
See Also
|
||||
--------
|
||||
KMeans : Original implementation of K-Means algorithm.
|
||||
|
||||
Notes
|
||||
-----
|
||||
It might be inefficient when n_cluster is less than 3, due to unnecessary
|
||||
calculations for that case.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.cluster import BisectingKMeans
|
||||
>>> import numpy as np
|
||||
>>> X = np.array([[1, 1], [10, 1], [3, 1],
|
||||
... [10, 0], [2, 1], [10, 2],
|
||||
... [10, 8], [10, 9], [10, 10]])
|
||||
>>> bisect_means = BisectingKMeans(n_clusters=3, random_state=0).fit(X)
|
||||
>>> bisect_means.labels_
|
||||
array([0, 2, 0, 2, 0, 2, 1, 1, 1], dtype=int32)
|
||||
>>> bisect_means.predict([[0, 0], [12, 3]])
|
||||
array([0, 2], dtype=int32)
|
||||
>>> bisect_means.cluster_centers_
|
||||
array([[ 2., 1.],
|
||||
[10., 9.],
|
||||
[10., 1.]])
|
||||
"""
|
||||
|
||||
_parameter_constraints: dict = {
|
||||
**_BaseKMeans._parameter_constraints,
|
||||
"init": [StrOptions({"k-means++", "random"}), callable],
|
||||
"n_init": [Interval(Integral, 1, None, closed="left")],
|
||||
"copy_x": ["boolean"],
|
||||
"algorithm": [StrOptions({"lloyd", "elkan"})],
|
||||
"bisecting_strategy": [StrOptions({"biggest_inertia", "largest_cluster"})],
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
n_clusters=8,
|
||||
*,
|
||||
init="random",
|
||||
n_init=1,
|
||||
random_state=None,
|
||||
max_iter=300,
|
||||
verbose=0,
|
||||
tol=1e-4,
|
||||
copy_x=True,
|
||||
algorithm="lloyd",
|
||||
bisecting_strategy="biggest_inertia",
|
||||
):
|
||||
super().__init__(
|
||||
n_clusters=n_clusters,
|
||||
init=init,
|
||||
max_iter=max_iter,
|
||||
verbose=verbose,
|
||||
random_state=random_state,
|
||||
tol=tol,
|
||||
n_init=n_init,
|
||||
)
|
||||
|
||||
self.copy_x = copy_x
|
||||
self.algorithm = algorithm
|
||||
self.bisecting_strategy = bisecting_strategy
|
||||
|
||||
def _warn_mkl_vcomp(self, n_active_threads):
|
||||
"""Warn when vcomp and mkl are both present"""
|
||||
warnings.warn(
|
||||
"BisectingKMeans is known to have a memory leak on Windows "
|
||||
"with MKL, when there are less chunks than available "
|
||||
"threads. You can avoid it by setting the environment"
|
||||
f" variable OMP_NUM_THREADS={n_active_threads}."
|
||||
)
|
||||
|
||||
def _inertia_per_cluster(self, X, centers, labels, sample_weight):
|
||||
"""Calculate the sum of squared errors (inertia) per cluster.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : {ndarray, csr_matrix} of shape (n_samples, n_features)
|
||||
The input samples.
|
||||
|
||||
centers : ndarray of shape (n_clusters=2, n_features)
|
||||
The cluster centers.
|
||||
|
||||
labels : ndarray of shape (n_samples,)
|
||||
Index of the cluster each sample belongs to.
|
||||
|
||||
sample_weight : ndarray of shape (n_samples,)
|
||||
The weights for each observation in X.
|
||||
|
||||
Returns
|
||||
-------
|
||||
inertia_per_cluster : ndarray of shape (n_clusters=2,)
|
||||
Sum of squared errors (inertia) for each cluster.
|
||||
"""
|
||||
n_clusters = centers.shape[0] # = 2 since centers comes from a bisection
|
||||
_inertia = _inertia_sparse if sp.issparse(X) else _inertia_dense
|
||||
|
||||
inertia_per_cluster = np.empty(n_clusters)
|
||||
for label in range(n_clusters):
|
||||
inertia_per_cluster[label] = _inertia(
|
||||
X, sample_weight, centers, labels, self._n_threads, single_label=label
|
||||
)
|
||||
|
||||
return inertia_per_cluster
|
||||
|
||||
def _bisect(self, X, x_squared_norms, sample_weight, cluster_to_bisect):
|
||||
"""Split a cluster into 2 subsclusters.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : {ndarray, csr_matrix} of shape (n_samples, n_features)
|
||||
Training instances to cluster.
|
||||
|
||||
x_squared_norms : ndarray of shape (n_samples,)
|
||||
Squared euclidean norm of each data point.
|
||||
|
||||
sample_weight : ndarray of shape (n_samples,)
|
||||
The weights for each observation in X.
|
||||
|
||||
cluster_to_bisect : _BisectingTree node object
|
||||
The cluster node to split.
|
||||
"""
|
||||
X = X[cluster_to_bisect.indices]
|
||||
x_squared_norms = x_squared_norms[cluster_to_bisect.indices]
|
||||
sample_weight = sample_weight[cluster_to_bisect.indices]
|
||||
|
||||
best_inertia = None
|
||||
|
||||
# Split samples in X into 2 clusters.
|
||||
# Repeating `n_init` times to obtain best clusters
|
||||
for _ in range(self.n_init):
|
||||
centers_init = self._init_centroids(
|
||||
X,
|
||||
x_squared_norms=x_squared_norms,
|
||||
init=self.init,
|
||||
random_state=self._random_state,
|
||||
n_centroids=2,
|
||||
sample_weight=sample_weight,
|
||||
)
|
||||
|
||||
labels, inertia, centers, _ = self._kmeans_single(
|
||||
X,
|
||||
sample_weight,
|
||||
centers_init,
|
||||
max_iter=self.max_iter,
|
||||
verbose=self.verbose,
|
||||
tol=self.tol,
|
||||
n_threads=self._n_threads,
|
||||
)
|
||||
|
||||
# allow small tolerance on the inertia to accommodate for
|
||||
# non-deterministic rounding errors due to parallel computation
|
||||
if best_inertia is None or inertia < best_inertia * (1 - 1e-6):
|
||||
best_labels = labels
|
||||
best_centers = centers
|
||||
best_inertia = inertia
|
||||
|
||||
if self.verbose:
|
||||
print(f"New centroids from bisection: {best_centers}")
|
||||
|
||||
if self.bisecting_strategy == "biggest_inertia":
|
||||
scores = self._inertia_per_cluster(
|
||||
X, best_centers, best_labels, sample_weight
|
||||
)
|
||||
else: # bisecting_strategy == "largest_cluster"
|
||||
# Using minlength to make sure that we have the counts for both labels even
|
||||
# if all samples are labelled 0.
|
||||
scores = np.bincount(best_labels, minlength=2)
|
||||
|
||||
cluster_to_bisect.split(best_labels, best_centers, scores)
|
||||
|
||||
@_fit_context(prefer_skip_nested_validation=True)
|
||||
def fit(self, X, y=None, sample_weight=None):
|
||||
"""Compute bisecting k-means clustering.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||||
|
||||
Training instances to cluster.
|
||||
|
||||
.. note:: The data will be converted to C ordering,
|
||||
which will cause a memory copy
|
||||
if the given data is not C-contiguous.
|
||||
|
||||
y : Ignored
|
||||
Not used, present here for API consistency by convention.
|
||||
|
||||
sample_weight : array-like of shape (n_samples,), default=None
|
||||
The weights for each observation in X. If None, all observations
|
||||
are assigned equal weight. `sample_weight` is not used during
|
||||
initialization if `init` is a callable.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self
|
||||
Fitted estimator.
|
||||
"""
|
||||
X = self._validate_data(
|
||||
X,
|
||||
accept_sparse="csr",
|
||||
dtype=[np.float64, np.float32],
|
||||
order="C",
|
||||
copy=self.copy_x,
|
||||
accept_large_sparse=False,
|
||||
)
|
||||
|
||||
self._check_params_vs_input(X)
|
||||
|
||||
self._random_state = check_random_state(self.random_state)
|
||||
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
|
||||
self._n_threads = _openmp_effective_n_threads()
|
||||
|
||||
if self.algorithm == "lloyd" or self.n_clusters == 1:
|
||||
self._kmeans_single = _kmeans_single_lloyd
|
||||
self._check_mkl_vcomp(X, X.shape[0])
|
||||
else:
|
||||
self._kmeans_single = _kmeans_single_elkan
|
||||
|
||||
# Subtract of mean of X for more accurate distance computations
|
||||
if not sp.issparse(X):
|
||||
self._X_mean = X.mean(axis=0)
|
||||
X -= self._X_mean
|
||||
|
||||
# Initialize the hierarchical clusters tree
|
||||
self._bisecting_tree = _BisectingTree(
|
||||
indices=np.arange(X.shape[0]),
|
||||
center=X.mean(axis=0),
|
||||
score=0,
|
||||
)
|
||||
|
||||
x_squared_norms = row_norms(X, squared=True)
|
||||
|
||||
for _ in range(self.n_clusters - 1):
|
||||
# Chose cluster to bisect
|
||||
cluster_to_bisect = self._bisecting_tree.get_cluster_to_bisect()
|
||||
|
||||
# Split this cluster into 2 subclusters
|
||||
self._bisect(X, x_squared_norms, sample_weight, cluster_to_bisect)
|
||||
|
||||
# Aggregate final labels and centers from the bisecting tree
|
||||
self.labels_ = np.full(X.shape[0], -1, dtype=np.int32)
|
||||
self.cluster_centers_ = np.empty((self.n_clusters, X.shape[1]), dtype=X.dtype)
|
||||
|
||||
for i, cluster_node in enumerate(self._bisecting_tree.iter_leaves()):
|
||||
self.labels_[cluster_node.indices] = i
|
||||
self.cluster_centers_[i] = cluster_node.center
|
||||
cluster_node.label = i # label final clusters for future prediction
|
||||
cluster_node.indices = None # release memory
|
||||
|
||||
# Restore original data
|
||||
if not sp.issparse(X):
|
||||
X += self._X_mean
|
||||
self.cluster_centers_ += self._X_mean
|
||||
|
||||
_inertia = _inertia_sparse if sp.issparse(X) else _inertia_dense
|
||||
self.inertia_ = _inertia(
|
||||
X, sample_weight, self.cluster_centers_, self.labels_, self._n_threads
|
||||
)
|
||||
|
||||
self._n_features_out = self.cluster_centers_.shape[0]
|
||||
|
||||
return self
|
||||
|
||||
def predict(self, X):
|
||||
"""Predict which cluster each sample in X belongs to.
|
||||
|
||||
Prediction is made by going down the hierarchical tree
|
||||
in searching of closest leaf cluster.
|
||||
|
||||
In the vector quantization literature, `cluster_centers_` is called
|
||||
the code book and each value returned by `predict` is the index of
|
||||
the closest code in the code book.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||||
New data to predict.
|
||||
|
||||
Returns
|
||||
-------
|
||||
labels : ndarray of shape (n_samples,)
|
||||
Index of the cluster each sample belongs to.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
|
||||
X = self._check_test_data(X)
|
||||
x_squared_norms = row_norms(X, squared=True)
|
||||
|
||||
# sample weights are unused but necessary in cython helpers
|
||||
sample_weight = np.ones_like(x_squared_norms)
|
||||
|
||||
labels = self._predict_recursive(X, sample_weight, self._bisecting_tree)
|
||||
|
||||
return labels
|
||||
|
||||
def _predict_recursive(self, X, sample_weight, cluster_node):
|
||||
"""Predict recursively by going down the hierarchical tree.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : {ndarray, csr_matrix} of shape (n_samples, n_features)
|
||||
The data points, currently assigned to `cluster_node`, to predict between
|
||||
the subclusters of this node.
|
||||
|
||||
sample_weight : ndarray of shape (n_samples,)
|
||||
The weights for each observation in X.
|
||||
|
||||
cluster_node : _BisectingTree node object
|
||||
The cluster node of the hierarchical tree.
|
||||
|
||||
Returns
|
||||
-------
|
||||
labels : ndarray of shape (n_samples,)
|
||||
Index of the cluster each sample belongs to.
|
||||
"""
|
||||
if cluster_node.left is None:
|
||||
# This cluster has no subcluster. Labels are just the label of the cluster.
|
||||
return np.full(X.shape[0], cluster_node.label, dtype=np.int32)
|
||||
|
||||
# Determine if data points belong to the left or right subcluster
|
||||
centers = np.vstack((cluster_node.left.center, cluster_node.right.center))
|
||||
if hasattr(self, "_X_mean"):
|
||||
centers += self._X_mean
|
||||
|
||||
cluster_labels = _labels_inertia_threadpool_limit(
|
||||
X,
|
||||
sample_weight,
|
||||
centers,
|
||||
self._n_threads,
|
||||
return_inertia=False,
|
||||
)
|
||||
mask = cluster_labels == 0
|
||||
|
||||
# Compute the labels for each subset of the data points.
|
||||
labels = np.full(X.shape[0], -1, dtype=np.int32)
|
||||
|
||||
labels[mask] = self._predict_recursive(
|
||||
X[mask], sample_weight[mask], cluster_node.left
|
||||
)
|
||||
|
||||
labels[~mask] = self._predict_recursive(
|
||||
X[~mask], sample_weight[~mask], cluster_node.right
|
||||
)
|
||||
|
||||
return labels
|
||||
|
||||
def _more_tags(self):
    # Estimator tag: computations preserve the floating dtype of the input.
    supported = [np.float64, np.float32]
    return {"preserves_dtype": supported}
|
||||
478
.venv/lib/python3.12/site-packages/sklearn/cluster/_dbscan.py
Normal file
478
.venv/lib/python3.12/site-packages/sklearn/cluster/_dbscan.py
Normal file
@@ -0,0 +1,478 @@
|
||||
"""
|
||||
DBSCAN: Density-Based Spatial Clustering of Applications with Noise
|
||||
"""
|
||||
|
||||
# Author: Robert Layton <robertlayton@gmail.com>
|
||||
# Joel Nothman <joel.nothman@gmail.com>
|
||||
# Lars Buitinck
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
import warnings
|
||||
from numbers import Integral, Real
|
||||
|
||||
import numpy as np
|
||||
from scipy import sparse
|
||||
|
||||
from ..base import BaseEstimator, ClusterMixin, _fit_context
|
||||
from ..metrics.pairwise import _VALID_METRICS
|
||||
from ..neighbors import NearestNeighbors
|
||||
from ..utils._param_validation import Interval, StrOptions, validate_params
|
||||
from ..utils.validation import _check_sample_weight
|
||||
from ._dbscan_inner import dbscan_inner
|
||||
|
||||
|
||||
@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "sample_weight": ["array-like", None],
    },
    prefer_skip_nested_validation=False,
)
def dbscan(
    X,
    eps=0.5,
    *,
    min_samples=5,
    metric="minkowski",
    metric_params=None,
    algorithm="auto",
    leaf_size=30,
    p=2,
    sample_weight=None,
    n_jobs=None,
):
    """Perform DBSCAN clustering from vector array or distance matrix.

    Functional interface to :class:`DBSCAN`: an estimator is configured with
    the given parameters, fitted on `X`, and its results are returned.

    Read more in the :ref:`User Guide <dbscan>`.

    Parameters
    ----------
    X : {array-like, sparse (CSR) matrix} of shape (n_samples, n_features) or \
            (n_samples, n_samples)
        A feature array, or a precomputed array of distances between samples
        when ``metric='precomputed'``.

    eps : float, default=0.5
        Maximum distance between two samples for one to be considered in the
        neighborhood of the other. This is not a maximum bound on the
        distances of points within a cluster; it is the most important DBSCAN
        parameter to choose appropriately for your data set and distance
        function.

    min_samples : int, default=5
        Number of samples (or total weight) in a neighborhood required for a
        point to be considered a core point, counting the point itself.

    metric : str or callable, default='minkowski'
        Distance metric used between instances in a feature array. Any option
        accepted by :func:`sklearn.metrics.pairwise_distances` for its metric
        parameter is allowed. With "precomputed", `X` is assumed to be a
        distance matrix and must be square during fit. `X` may be a
        :term:`sparse graph <sparse graph>`, in which case only "nonzero"
        elements may be considered neighbors.

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

        .. versionadded:: 0.19

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        Algorithm used by the NearestNeighbors module to compute pointwise
        distances and find nearest neighbors. See the NearestNeighbors module
        documentation for details.

    leaf_size : int, default=30
        Leaf size passed to BallTree or cKDTree; affects the speed and memory
        of tree construction and queries. The optimal value depends on the
        nature of the problem.

    p : float, default=2
        Power of the Minkowski metric used to calculate distances between
        points.

    sample_weight : array-like of shape (n_samples,), default=None
        Weight of each sample: a sample whose weight is at least
        ``min_samples`` is by itself a core sample, while a sample with a
        negative weight may inhibit its eps-neighbor from being core. Weights
        are absolute and default to 1.

    n_jobs : int, default=None
        Number of parallel jobs for the neighbors search. ``None`` means 1
        unless in a :obj:`joblib.parallel_backend` context; ``-1`` uses all
        processors. See :term:`Glossary <n_jobs>` for details. Has no effect
        when precomputed distances are used.

    Returns
    -------
    core_samples : ndarray of shape (n_core_samples,)
        Indices of core samples.

    labels : ndarray of shape (n_samples,)
        Cluster labels for each point. Noisy samples are given the label -1.

    See Also
    --------
    DBSCAN : An estimator interface for this clustering algorithm.
    OPTICS : A similar estimator interface clustering at multiple values of
        eps. Our implementation is optimized for memory usage.

    Notes
    -----
    For an example, see :ref:`examples/cluster/plot_dbscan.py
    <sphx_glr_auto_examples_cluster_plot_dbscan.py>`.

    This implementation bulk-computes all neighborhood queries, which raises
    the memory complexity to O(n.d) where d is the average number of
    neighbors, whereas the original DBSCAN had memory complexity O(n). One
    way to avoid the query complexity is to pre-compute sparse neighborhoods
    in chunks using :func:`NearestNeighbors.radius_neighbors_graph
    <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
    ``mode='distance'``, then using ``metric='precomputed'`` here. Another
    way to reduce memory and computation time is to remove (near-)duplicate
    points and use ``sample_weight`` instead.

    :class:`~sklearn.cluster.OPTICS` provides a similar clustering with lower
    memory usage.

    References
    ----------
    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based
    Algorithm for Discovering Clusters in Large Spatial Databases with Noise"
    <https://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf>`_.
    In: Proceedings of the 2nd International Conference on Knowledge Discovery
    and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996

    Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
    :doi:`"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN."
    <10.1145/3068335>`
    ACM Transactions on Database Systems (TODS), 42(3), 19.

    Examples
    --------
    >>> from sklearn.cluster import dbscan
    >>> X = [[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]]
    >>> core_samples, labels = dbscan(X, eps=3, min_samples=2)
    >>> core_samples
    array([0, 1, 2, 3, 4])
    >>> labels
    array([ 0,  0,  0,  1,  1, -1])
    """
    # All of the actual work is delegated to the estimator class.
    estimator = DBSCAN(
        eps=eps,
        min_samples=min_samples,
        metric=metric,
        metric_params=metric_params,
        algorithm=algorithm,
        leaf_size=leaf_size,
        p=p,
        n_jobs=n_jobs,
    )
    estimator.fit(X, sample_weight=sample_weight)
    return estimator.core_sample_indices_, estimator.labels_
|
||||
|
||||
|
||||
class DBSCAN(ClusterMixin, BaseEstimator):
    """Perform DBSCAN clustering from vector array or distance matrix.

    DBSCAN - Density-Based Spatial Clustering of Applications with Noise.
    Finds core samples of high density and expands clusters from them.
    Good for data which contains clusters of similar density.

    This implementation has a worst case memory complexity of :math:`O({n}^2)`,
    which can occur when the `eps` param is large and `min_samples` is low,
    while the original DBSCAN only uses linear memory.
    For further details, see the Notes below.

    Read more in the :ref:`User Guide <dbscan>`.

    Parameters
    ----------
    eps : float, default=0.5
        The maximum distance between two samples for one to be considered
        as in the neighborhood of the other. This is not a maximum bound
        on the distances of points within a cluster. This is the most
        important DBSCAN parameter to choose appropriately for your data set
        and distance function.

    min_samples : int, default=5
        The number of samples (or total weight) in a neighborhood for a point to
        be considered as a core point. This includes the point itself. If
        `min_samples` is set to a higher value, DBSCAN will find denser clusters,
        whereas if it is set to a lower value, the found clusters will be more
        sparse.

    metric : str, or callable, default='euclidean'
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by :func:`sklearn.metrics.pairwise_distances` for
        its metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. X may be a :term:`sparse graph`, in which
        case only "nonzero" elements may be considered neighbors for DBSCAN.

        .. versionadded:: 0.17
           metric *precomputed* to accept precomputed sparse matrix.

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

        .. versionadded:: 0.19

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        The algorithm to be used by the NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        See NearestNeighbors module documentation for details.

    leaf_size : int, default=30
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required
        to store the tree. The optimal value depends
        on the nature of the problem.

    p : float, default=None
        The power of the Minkowski metric to be used to calculate distance
        between points. If None, then ``p=2`` (equivalent to the Euclidean
        distance).

    n_jobs : int, default=None
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    core_sample_indices_ : ndarray of shape (n_core_samples,)
        Indices of core samples.

    components_ : ndarray of shape (n_core_samples, n_features)
        Copy of each core sample found by training.

    labels_ : ndarray of shape (n_samples)
        Cluster labels for each point in the dataset given to fit().
        Noisy samples are given the label -1.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    OPTICS : A similar clustering at multiple values of eps. Our implementation
        is optimized for memory usage.

    Notes
    -----
    For an example, see :ref:`examples/cluster/plot_dbscan.py
    <sphx_glr_auto_examples_cluster_plot_dbscan.py>`.

    This implementation bulk-computes all neighborhood queries, which increases
    the memory complexity to O(n.d) where d is the average number of neighbors,
    while original DBSCAN had memory complexity O(n). It may attract a higher
    memory complexity when querying these nearest neighborhoods, depending
    on the ``algorithm``.

    One way to avoid the query complexity is to pre-compute sparse
    neighborhoods in chunks using
    :func:`NearestNeighbors.radius_neighbors_graph
    <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>` with
    ``mode='distance'``, then using ``metric='precomputed'`` here.

    Another way to reduce memory and computation time is to remove
    (near-)duplicate points and use ``sample_weight`` instead.

    :class:`~sklearn.cluster.OPTICS` provides a similar clustering with lower memory
    usage.

    References
    ----------
    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, `"A Density-Based
    Algorithm for Discovering Clusters in Large Spatial Databases with Noise"
    <https://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf>`_.
    In: Proceedings of the 2nd International Conference on Knowledge Discovery
    and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996

    Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017).
    :doi:`"DBSCAN revisited, revisited: why and how you should (still) use DBSCAN."
    <10.1145/3068335>`
    ACM Transactions on Database Systems (TODS), 42(3), 19.

    Examples
    --------
    >>> from sklearn.cluster import DBSCAN
    >>> import numpy as np
    >>> X = np.array([[1, 2], [2, 2], [2, 3],
    ...               [8, 7], [8, 8], [25, 80]])
    >>> clustering = DBSCAN(eps=3, min_samples=2).fit(X)
    >>> clustering.labels_
    array([ 0,  0,  0,  1,  1, -1])
    >>> clustering
    DBSCAN(eps=3, min_samples=2)
    """

    # Declarative parameter validation; checked by `_fit_context` at fit time.
    _parameter_constraints: dict = {
        "eps": [Interval(Real, 0.0, None, closed="neither")],
        "min_samples": [Interval(Integral, 1, None, closed="left")],
        "metric": [
            StrOptions(set(_VALID_METRICS) | {"precomputed"}),
            callable,
        ],
        "metric_params": [dict, None],
        "algorithm": [StrOptions({"auto", "ball_tree", "kd_tree", "brute"})],
        "leaf_size": [Interval(Integral, 1, None, closed="left")],
        "p": [Interval(Real, 0.0, None, closed="left"), None],
        "n_jobs": [Integral, None],
    }

    def __init__(
        self,
        eps=0.5,
        *,
        min_samples=5,
        metric="euclidean",
        metric_params=None,
        algorithm="auto",
        leaf_size=30,
        p=None,
        n_jobs=None,
    ):
        # Per scikit-learn convention, __init__ only stores parameters;
        # all validation and computation happen in fit.
        self.eps = eps
        self.min_samples = min_samples
        self.metric = metric
        self.metric_params = metric_params
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.p = p
        self.n_jobs = n_jobs

    @_fit_context(
        # DBSCAN.metric is not validated yet
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y=None, sample_weight=None):
        """Perform DBSCAN clustering from features, or distance matrix.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
            (n_samples, n_samples)
            Training instances to cluster, or distances between instances if
            ``metric='precomputed'``. If a sparse matrix is provided, it will
            be converted into a sparse ``csr_matrix``.

        y : Ignored
            Not used, present here for API consistency by convention.

        sample_weight : array-like of shape (n_samples,), default=None
            Weight of each sample, such that a sample with a weight of at least
            ``min_samples`` is by itself a core sample; a sample with a
            negative weight may inhibit its eps-neighbor from being core.
            Note that weights are absolute, and default to 1.

        Returns
        -------
        self : object
            Returns a fitted instance of self.
        """
        X = self._validate_data(X, accept_sparse="csr")

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X)

        # Calculate neighborhood for all samples. This leaves the original
        # point in, which needs to be considered later (i.e. point i is in the
        # neighborhood of point i. While True, its useless information)
        if self.metric == "precomputed" and sparse.issparse(X):
            # set the diagonal to explicit values, as a point is its own
            # neighbor
            X = X.copy()  # copy to avoid in-place modification
            with warnings.catch_warnings():
                # setdiag on CSR emits a SparseEfficiencyWarning; it is
                # intentional here, so silence it.
                warnings.simplefilter("ignore", sparse.SparseEfficiencyWarning)
                X.setdiag(X.diagonal())

        neighbors_model = NearestNeighbors(
            radius=self.eps,
            algorithm=self.algorithm,
            leaf_size=self.leaf_size,
            metric=self.metric,
            metric_params=self.metric_params,
            p=self.p,
            n_jobs=self.n_jobs,
        )
        neighbors_model.fit(X)
        # This has worst case O(n^2) memory complexity
        neighborhoods = neighbors_model.radius_neighbors(X, return_distance=False)

        # Neighborhood "size" is a plain count when unweighted, otherwise
        # the sum of the neighbors' weights.
        if sample_weight is None:
            n_neighbors = np.array([len(neighbors) for neighbors in neighborhoods])
        else:
            n_neighbors = np.array(
                [np.sum(sample_weight[neighbors]) for neighbors in neighborhoods]
            )

        # Initially, all samples are noise. (The Cython DFS below assigns
        # cluster ids in place; unreached samples stay -1.)
        labels = np.full(X.shape[0], -1, dtype=np.intp)

        # A list of all core samples found. (uint8 mask, as expected by
        # dbscan_inner.)
        core_samples = np.asarray(n_neighbors >= self.min_samples, dtype=np.uint8)
        dbscan_inner(core_samples, neighborhoods, labels)

        self.core_sample_indices_ = np.where(core_samples)[0]
        self.labels_ = labels

        if len(self.core_sample_indices_):
            # fix for scipy sparse indexing issue
            self.components_ = X[self.core_sample_indices_].copy()
        else:
            # no core samples
            self.components_ = np.empty((0, X.shape[1]))
        return self

    def fit_predict(self, X, y=None, sample_weight=None):
        """Compute clusters from a data or distance matrix and predict labels.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features), or \
            (n_samples, n_samples)
            Training instances to cluster, or distances between instances if
            ``metric='precomputed'``. If a sparse matrix is provided, it will
            be converted into a sparse ``csr_matrix``.

        y : Ignored
            Not used, present here for API consistency by convention.

        sample_weight : array-like of shape (n_samples,), default=None
            Weight of each sample, such that a sample with a weight of at least
            ``min_samples`` is by itself a core sample; a sample with a
            negative weight may inhibit its eps-neighbor from being core.
            Note that weights are absolute, and default to 1.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Cluster labels. Noisy samples are given the label -1.
        """
        self.fit(X, sample_weight=sample_weight)
        return self.labels_

    def _more_tags(self):
        # Estimator tag: with "precomputed" the estimator consumes a
        # pairwise (sample-by-sample) matrix rather than a feature array.
        return {"pairwise": self.metric == "precomputed"}
|
||||
Binary file not shown.
@@ -0,0 +1,40 @@
|
||||
# Fast inner loop for DBSCAN.
|
||||
# Author: Lars Buitinck
|
||||
# License: 3-clause BSD
|
||||
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from ..utils._typedefs cimport uint8_t, intp_t
|
||||
|
||||
|
||||
def dbscan_inner(const uint8_t[::1] is_core,
                 object[:] neighborhoods,
                 intp_t[::1] labels):
    """Expand clusters from core points, writing cluster ids into `labels`.

    `labels` must be initialized to -1 (noise) by the caller; on return,
    each entry holds the id of the cluster its point was assigned to, or
    stays -1 for noise.  `neighborhoods[i]` is the index array of the
    points within eps of point i; `is_core[i]` flags core points.
    """
    cdef intp_t i, label_num = 0, v
    cdef intp_t[:] neighb
    cdef vector[intp_t] stack

    for i in range(labels.shape[0]):
        # Only still-unlabeled core points seed a new cluster.
        if labels[i] != -1 or not is_core[i]:
            continue

        # Depth-first search starting from i, ending at the non-core points.
        # This is very similar to the classic algorithm for computing connected
        # components, the difference being that we label non-core points as
        # part of a cluster (component), but don't expand their neighborhoods.
        # NOTE(review): `i` is deliberately reused as the DFS cursor below;
        # the outer `for i in range(...)` still visits every index because
        # Cython preserves Python range-loop semantics when the loop
        # variable is reassigned inside the body.
        while True:
            if labels[i] == -1:
                labels[i] = label_num
                if is_core[i]:
                    # Push the still-unlabeled neighbors for later expansion.
                    neighb = neighborhoods[i]
                    for i in range(neighb.shape[0]):
                        v = neighb[i]
                        if labels[v] == -1:
                            stack.push_back(v)

            if stack.size() == 0:
                break
            i = stack.back()
            stack.pop_back()

        label_num += 1
|
||||
@@ -0,0 +1,92 @@
|
||||
"""
|
||||
Feature agglomeration. Base classes and functions for performing feature
|
||||
agglomeration.
|
||||
"""
|
||||
|
||||
# Author: V. Michel, A. Gramfort
|
||||
# License: BSD 3 clause
|
||||
|
||||
|
||||
import numpy as np
|
||||
from scipy.sparse import issparse
|
||||
|
||||
from ..base import TransformerMixin
|
||||
from ..utils import metadata_routing
|
||||
from ..utils.deprecation import _deprecate_Xt_in_inverse_transform
|
||||
from ..utils.validation import check_is_fitted
|
||||
|
||||
###############################################################################
|
||||
# Mixin class for feature agglomeration.
|
||||
|
||||
|
||||
class AgglomerationTransform(TransformerMixin):
    """Mixin providing the transform interface for feature agglomeration."""

    # This prevents ``set_split_inverse_transform`` to be generated for the
    # non-standard ``Xt`` arg on ``inverse_transform``.
    # TODO(1.7): remove when Xt is removed for inverse_transform.
    __metadata_request__inverse_transform = {"Xt": metadata_routing.UNUSED}

    def transform(self, X):
        """
        Transform a new matrix using the built clustering.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or \
                (n_samples, n_samples)
            A M by N array of M observations in N dimensions or a length
            M array of M one-dimensional observations.

        Returns
        -------
        Y : ndarray of shape (n_samples, n_clusters) or (n_clusters,)
            The pooled values for each feature cluster.
        """
        check_is_fitted(self)

        X = self._validate_data(X, reset=False)

        # Fast path: the mean over a dense array can be pooled with one
        # weighted ``np.bincount`` pass per sample.
        if self.pooling_func == np.mean and not issparse(X):
            cluster_sizes = np.bincount(self.labels_)
            pooled_rows = [
                np.bincount(self.labels_, row) / cluster_sizes for row in X
            ]
            return np.array(pooled_rows)

        # Generic path: apply the pooling function one cluster at a time.
        pooled_columns = [
            self.pooling_func(X[:, self.labels_ == label], axis=1)
            for label in np.unique(self.labels_)
        ]
        return np.array(pooled_columns).T

    def inverse_transform(self, X=None, *, Xt=None):
        """
        Inverse the transformation and return a vector of size `n_features`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_clusters) or (n_clusters,)
            The values to be assigned to each cluster of samples.

        Xt : array-like of shape (n_samples, n_clusters) or (n_clusters,)
            The values to be assigned to each cluster of samples.

            .. deprecated:: 1.5
                `Xt` was deprecated in 1.5 and will be removed in 1.7. Use `X` instead.

        Returns
        -------
        X : ndarray of shape (n_samples, n_features) or (n_features,)
            A vector of size `n_samples` with the values of `Xred` assigned to
            each of the cluster of samples.
        """
        # Resolve the deprecated ``Xt`` alias first (may warn or raise).
        X = _deprecate_Xt_in_inverse_transform(X, Xt)

        check_is_fitted(self)

        # ``feature_to_cluster`` maps every original feature to the index of
        # its cluster, so fancy indexing broadcasts the pooled values back.
        _, feature_to_cluster = np.unique(self.labels_, return_inverse=True)
        return X[..., feature_to_cluster]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,272 @@
|
||||
# Minimum spanning tree single linkage implementation for hdbscan
|
||||
# Authors: Leland McInnes <leland.mcinnes@gmail.com>
|
||||
# Steve Astels <sastels@gmail.com>
|
||||
# Meekail Zain <zainmeekail@gmail.com>
|
||||
# Copyright (c) 2015, Leland McInnes
|
||||
# All rights reserved.
|
||||
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
|
||||
# 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software without
|
||||
# specific prior written permission.
|
||||
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
cimport numpy as cnp
|
||||
from libc.float cimport DBL_MAX
|
||||
|
||||
import numpy as np
|
||||
from ...metrics._dist_metrics cimport DistanceMetric64
|
||||
from ...cluster._hierarchical_fast cimport UnionFind
|
||||
from ...cluster._hdbscan._tree cimport HIERARCHY_t
|
||||
from ...cluster._hdbscan._tree import HIERARCHY_dtype
|
||||
from ...utils._typedefs cimport intp_t, float64_t, int64_t, uint8_t
|
||||
|
||||
cnp.import_array()
|
||||
|
||||
cdef extern from "numpy/arrayobject.h":
    # C-level accessor for an ndarray's raw shape array.
    intp_t * PyArray_SHAPE(cnp.PyArrayObject *)

# Numpy structured dtype representing a single ordered edge in Prim's algorithm
MST_edge_dtype = np.dtype([
    ("current_node", np.int64),
    ("next_node", np.int64),
    ("distance", np.float64),
])

# C mirror of ``MST_edge_dtype`` above; the field order and widths must
# stay in sync for the typed ndarray views to be valid.
# Packed shouldn't make a difference since they're all 8-byte quantities,
# but it's included just to be safe.
ctypedef packed struct MST_edge_t:
    int64_t current_node   # source endpoint of the edge
    int64_t next_node      # destination endpoint of the edge
    float64_t distance     # mutual-reachability distance of the edge
||||
|
||||
cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_mutual_reachability(
    cnp.ndarray[float64_t, ndim=2] mutual_reachability
):
    """Compute the Minimum Spanning Tree (MST) representation of the mutual-
    reachability graph using Prim's algorithm.

    Parameters
    ----------
    mutual_reachability : ndarray of shape (n_samples, n_samples)
        Array of mutual-reachabilities between samples.

    Returns
    -------
    mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype
        The MST representation of the mutual-reachability graph. The MST is
        represented as a collection of edges.
    """
    cdef:
        # Note: we utilize ndarray's over memory-views to make use of numpy
        # binary indexing and sub-selection below.
        cnp.ndarray[int64_t, ndim=1, mode='c'] current_labels
        cnp.ndarray[float64_t, ndim=1, mode='c'] min_reachability, left, right
        cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst

        cnp.ndarray[uint8_t, mode='c'] label_filter

        int64_t n_samples = PyArray_SHAPE(<cnp.PyArrayObject*> mutual_reachability)[0]
        int64_t current_node, new_node_index, new_node, i

    mst = np.empty(n_samples - 1, dtype=MST_edge_dtype)
    current_labels = np.arange(n_samples, dtype=np.int64)
    current_node = 0
    min_reachability = np.full(n_samples, fill_value=np.inf, dtype=np.float64)
    for i in range(0, n_samples - 1):
        # Drop the node just attached to the tree from the candidate set.
        label_filter = current_labels != current_node
        current_labels = current_labels[label_filter]
        # Best known distance-to-tree for each remaining node, refreshed
        # against the distances from the newly attached node.
        left = min_reachability[label_filter]
        right = mutual_reachability[current_node][current_labels]
        min_reachability = np.minimum(left, right)

        # Prim's step: greedily attach the closest remaining node.
        new_node_index = np.argmin(min_reachability)
        new_node = current_labels[new_node_index]
        mst[i].current_node = current_node
        mst[i].next_node = new_node
        mst[i].distance = min_reachability[new_node_index]
        current_node = new_node

    return mst
|
||||
|
||||
|
||||
cpdef cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst_from_data_matrix(
    const float64_t[:, ::1] raw_data,
    const float64_t[::1] core_distances,
    DistanceMetric64 dist_metric,
    float64_t alpha=1.0
):
    """Compute the Minimum Spanning Tree (MST) representation of the mutual-
    reachability graph generated from the provided `raw_data` and
    `core_distances` using Prim's algorithm.

    Parameters
    ----------
    raw_data : ndarray of shape (n_samples, n_features)
        Input array of data samples.

    core_distances : ndarray of shape (n_samples,)
        An array containing the core-distance calculated for each corresponding
        sample.

    dist_metric : DistanceMetric
        The distance metric to use when calculating pairwise distances for
        determining mutual-reachability.

    alpha : float, default=1.0
        Pairwise distances are divided by `alpha` before entering the
        mutual-reachability computation.

    Returns
    -------
    mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype
        The MST representation of the mutual-reachability graph. The MST is
        represented as a collection of edges.
    """

    cdef:
        uint8_t[::1] in_tree
        float64_t[::1] min_reachability
        int64_t[::1] current_sources
        cnp.ndarray[MST_edge_t, ndim=1, mode='c'] mst

        int64_t current_node, source_node, new_node, next_node_source
        int64_t i, j, n_samples, num_features

        float64_t current_node_core_dist, new_reachability, mutual_reachability_distance
        float64_t next_node_min_reach, pair_distance, next_node_core_dist

    n_samples = raw_data.shape[0]
    num_features = raw_data.shape[1]

    mst = np.empty(n_samples - 1, dtype=MST_edge_dtype)

    in_tree = np.zeros(n_samples, dtype=np.uint8)
    min_reachability = np.full(n_samples, fill_value=np.inf, dtype=np.float64)
    current_sources = np.ones(n_samples, dtype=np.int64)

    current_node = 0

    for i in range(0, n_samples - 1):

        in_tree[current_node] = 1

        current_node_core_dist = core_distances[current_node]

        new_reachability = DBL_MAX
        source_node = 0
        new_node = 0

        # Scan every node not yet in the tree, updating its best known edge
        # to the tree and tracking the overall closest candidate.
        for j in range(n_samples):
            if in_tree[j]:
                continue

            next_node_min_reach = min_reachability[j]
            next_node_source = current_sources[j]

            pair_distance = dist_metric.dist(
                &raw_data[current_node, 0],
                &raw_data[j, 0],
                num_features
            )

            pair_distance /= alpha

            next_node_core_dist = core_distances[j]
            # Mutual-reachability distance: max of the two core distances
            # and the (scaled) pairwise distance.
            mutual_reachability_distance = max(
                current_node_core_dist,
                next_node_core_dist,
                pair_distance
            )
            # The new edge can't improve on j's best known edge: only check
            # whether j's existing edge is the closest candidate overall.
            if mutual_reachability_distance > next_node_min_reach:
                if next_node_min_reach < new_reachability:
                    new_reachability = next_node_min_reach
                    source_node = next_node_source
                    new_node = j
                continue

            # Otherwise record the improved edge for j, then update the
            # overall closest candidate accordingly.
            if mutual_reachability_distance < next_node_min_reach:
                min_reachability[j] = mutual_reachability_distance
                current_sources[j] = current_node
                if mutual_reachability_distance < new_reachability:
                    new_reachability = mutual_reachability_distance
                    source_node = current_node
                    new_node = j
            else:
                if next_node_min_reach < new_reachability:
                    new_reachability = next_node_min_reach
                    source_node = next_node_source
                    new_node = j

        # Prim's step: attach the closest remaining node to the tree.
        mst[i].current_node = source_node
        mst[i].next_node = new_node
        mst[i].distance = new_reachability
        current_node = new_node

    return mst
|
||||
|
||||
cpdef cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] make_single_linkage(const MST_edge_t[::1] mst):
    """Construct a single-linkage tree from an MST.

    Parameters
    ----------
    mst : ndarray of shape (n_samples - 1,), dtype=MST_edge_dtype
        The MST representation of the mutual-reachability graph. The MST is
        represented as a collection of edges.

    Returns
    -------
    single_linkage : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype
        The single-linkage tree (dendrogram) built from the MST. Each
        element of the array represents the following:

        - left node/cluster
        - right node/cluster
        - distance
        - new cluster size
    """
    cdef:
        cnp.ndarray[HIERARCHY_t, ndim=1, mode="c"] single_linkage

        # Note mst.shape[0] is one fewer than the number of samples
        int64_t n_samples = mst.shape[0] + 1
        intp_t current_node_cluster, next_node_cluster
        int64_t current_node, next_node, i
        float64_t distance
        UnionFind U = UnionFind(n_samples)

    single_linkage = np.zeros(n_samples - 1, dtype=HIERARCHY_dtype)

    # Edges are consumed in MST order; each edge merges two current
    # clusters into a new dendrogram node.
    for i in range(n_samples - 1):

        current_node = mst[i].current_node
        next_node = mst[i].next_node
        distance = mst[i].distance

        # Map each endpoint to the cluster it currently belongs to.
        current_node_cluster = U.fast_find(current_node)
        next_node_cluster = U.fast_find(next_node)

        single_linkage[i].left_node = current_node_cluster
        single_linkage[i].right_node = next_node_cluster
        single_linkage[i].value = distance
        single_linkage[i].cluster_size = U.size[current_node_cluster] + U.size[next_node_cluster]

        # Merge the two clusters for subsequent iterations.
        U.union(current_node_cluster, next_node_cluster)

    return single_linkage
|
||||
Binary file not shown.
@@ -0,0 +1,212 @@
|
||||
# mutual reachability distance computations
|
||||
# Authors: Leland McInnes <leland.mcinnes@gmail.com>
|
||||
# Meekail Zain <zainmeekail@gmail.com>
|
||||
# Guillaume Lemaitre <g.lemaitre58@gmail.com>
|
||||
# Copyright (c) 2015, Leland McInnes
|
||||
# All rights reserved.
|
||||
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
|
||||
# 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software without
|
||||
# specific prior written permission.
|
||||
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
cimport numpy as cnp
|
||||
|
||||
import numpy as np
|
||||
from scipy.sparse import issparse
|
||||
from cython cimport floating, integral
|
||||
from libc.math cimport isfinite, INFINITY
|
||||
from ...utils._typedefs cimport intp_t
|
||||
cnp.import_array()
|
||||
|
||||
|
||||
def mutual_reachability_graph(
    distance_matrix, min_samples=5, max_distance=0.0
):
    """Compute the weighted adjacency matrix of the mutual reachability graph.

    The mutual reachability distance used to build the graph is defined as::

        max(d_core(x_p), d_core(x_q), d(x_p, x_q))

    and the core distance `d_core` is defined as the distance between a point
    `x_p` and its k-th nearest neighbor.

    Note that all computations are done in-place.

    Parameters
    ----------
    distance_matrix : {ndarray, sparse matrix} of shape (n_samples, n_samples)
        Array of distances between samples. If sparse, the array must be in
        `CSR` format.

    min_samples : int, default=5
        The parameter `k` used to calculate the distance between a point
        `x_p` and its k-th nearest neighbor.

    max_distance : float, default=0.0
        The distance which `np.inf` is replaced with. When the true mutual-
        reachability distance is measured to be infinite, it is instead
        truncated to `max_dist`. Only used when `distance_matrix` is a sparse
        matrix.

    Returns
    -------
    mutual_reachability_graph : {ndarray, sparse matrix} of shape \
            (n_samples, n_samples)
        Weighted adjacency matrix of the mutual reachability graph. This is
        the input `distance_matrix` modified in-place.

    References
    ----------
    .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April).
       Density-based clustering based on hierarchical density estimates.
       In Pacific-Asia Conference on Knowledge Discovery and Data Mining
       (pp. 160-172). Springer Berlin Heidelberg.
    """
    # `min_samples` is 1-based (the k in k-th nearest neighbor); convert it to
    # the 0-based index used with np.partition.
    further_neighbor_idx = min_samples - 1
    if issparse(distance_matrix):
        if distance_matrix.format != "csr":
            raise ValueError(
                "Only sparse CSR matrices are supported for `distance_matrix`."
            )
        _sparse_mutual_reachability_graph(
            distance_matrix.data,
            distance_matrix.indices,
            distance_matrix.indptr,
            distance_matrix.shape[0],
            further_neighbor_idx=further_neighbor_idx,
            max_distance=max_distance,
        )
    else:
        _dense_mutual_reachability_graph(
            distance_matrix, further_neighbor_idx=further_neighbor_idx
        )
    return distance_matrix
|
||||
|
||||
|
||||
def _dense_mutual_reachability_graph(
    floating[:, :] distance_matrix,
    intp_t further_neighbor_idx,
):
    """Dense implementation of mutual reachability graph.

    The computation is done in-place, i.e. the distance matrix is modified
    directly.

    Parameters
    ----------
    distance_matrix : ndarray of shape (n_samples, n_samples)
        Array of distances between samples.

    further_neighbor_idx : int
        The index of the furthest neighbor to use to define the core distances.
    """
    cdef:
        intp_t i, j, n_samples = distance_matrix.shape[0]
        floating mutual_reachibility_distance
        floating[::1] core_distances

    # We assume that the distance matrix is symmetric. We choose to sort every
    # row to have the same implementation than the sparse case that requires
    # CSR matrix.
    # np.partition places the k-th smallest value of each row at position
    # further_neighbor_idx; that value is the core distance of the row's point.
    core_distances = np.ascontiguousarray(
        np.partition(
            distance_matrix, further_neighbor_idx, axis=1
        )[:, further_neighbor_idx]
    )

    with nogil:
        # TODO: Update w/ prange with thread count based on
        # _openmp_effective_n_threads
        for i in range(n_samples):
            for j in range(n_samples):
                # Elementwise max of the two core distances and the pairwise
                # distance, written back in place.
                mutual_reachibility_distance = max(
                    core_distances[i],
                    core_distances[j],
                    distance_matrix[i, j],
                )
                distance_matrix[i, j] = mutual_reachibility_distance
|
||||
|
||||
|
||||
def _sparse_mutual_reachability_graph(
    cnp.ndarray[floating, ndim=1, mode="c"] data,
    cnp.ndarray[integral, ndim=1, mode="c"] indices,
    cnp.ndarray[integral, ndim=1, mode="c"] indptr,
    intp_t n_samples,
    intp_t further_neighbor_idx,
    floating max_distance,
):
    """Sparse implementation of mutual reachability graph.

    The computation is done in-place, i.e. the distance matrix is modified
    directly. This implementation only accepts `CSR` format sparse matrices.

    Parameters
    ----------
    distance_matrix : sparse matrix of shape (n_samples, n_samples)
        Sparse matrix of distances between samples. The sparse format should
        be `CSR`.

    further_neighbor_idx : int
        The index of the furthest neighbor to use to define the core distances.

    max_distance : float
        The distance which `np.inf` is replaced with. When the true mutual-
        reachability distance is measured to be infinite, it is instead
        truncated to `max_dist`. Only used when `distance_matrix` is a sparse
        matrix.
    """
    cdef:
        integral i, col_ind, row_ind
        floating mutual_reachibility_distance
        floating[:] core_distances
        floating[:] row_data

    if floating is float:
        dtype = np.float32
    else:
        dtype = np.float64

    core_distances = np.empty(n_samples, dtype=dtype)

    # Core distance of each point: the k-th smallest stored distance in its
    # row. Rows with fewer than k stored entries get an infinite core distance.
    for i in range(n_samples):
        row_data = data[indptr[i]:indptr[i + 1]]
        if further_neighbor_idx < row_data.size:
            core_distances[i] = np.partition(
                row_data, further_neighbor_idx
            )[further_neighbor_idx]
        else:
            core_distances[i] = INFINITY

    with nogil:
        for row_ind in range(n_samples):
            for i in range(indptr[row_ind], indptr[row_ind + 1]):
                col_ind = indices[i]
                mutual_reachibility_distance = max(
                    core_distances[row_ind], core_distances[col_ind], data[i]
                )
                if isfinite(mutual_reachibility_distance):
                    data[i] = mutual_reachibility_distance
                elif max_distance > 0:
                    # Truncate infinite distances to the user-provided cap.
                    # NOTE(review): when max_distance == 0, an infinite value
                    # leaves data[i] unchanged — presumably intentional.
                    data[i] = max_distance
|
||||
Binary file not shown.
@@ -0,0 +1,49 @@
|
||||
# Copyright (c) 2015, Leland McInnes
|
||||
# All rights reserved.
|
||||
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
|
||||
# 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software without
|
||||
# specific prior written permission.
|
||||
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
from ...utils._typedefs cimport intp_t, float64_t, uint8_t
|
||||
cimport numpy as cnp
|
||||
|
||||
# This corresponds to the scipy.cluster.hierarchy format
ctypedef packed struct HIERARCHY_t:
    intp_t left_node
    intp_t right_node
    float64_t value
    intp_t cluster_size

# Effectively an edgelist encoding a parent/child pair, along with a value and
# the corresponding cluster_size in each row providing a tree structure.
ctypedef packed struct CONDENSED_t:
    intp_t parent
    intp_t child
    float64_t value
    intp_t cluster_size

# Direct C-level access to an ndarray's shape (usable without the GIL).
cdef extern from "numpy/arrayobject.h":
    intp_t * PyArray_SHAPE(cnp.PyArrayObject *)
|
||||
@@ -0,0 +1,799 @@
|
||||
# Tree handling (condensing, finding stable clusters) for hdbscan
|
||||
# Authors: Leland McInnes
|
||||
# Copyright (c) 2015, Leland McInnes
|
||||
# All rights reserved.
|
||||
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
|
||||
# 1. Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
|
||||
# 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
# may be used to endorse or promote products derived from this software without
|
||||
# specific prior written permission.
|
||||
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
cimport numpy as cnp
|
||||
from libc.math cimport isinf
|
||||
import cython
|
||||
|
||||
import numpy as np
|
||||
|
||||
cnp.import_array()
|
||||
|
||||
# Direct C-level access to an ndarray's shape (usable without the GIL).
cdef extern from "numpy/arrayobject.h":
    intp_t * PyArray_SHAPE(cnp.PyArrayObject *)

cdef cnp.float64_t INFTY = np.inf
# Sentinel label for points not assigned to any cluster.
cdef cnp.intp_t NOISE = -1

# Structured dtype mirroring the HIERARCHY_t packed struct
# (scipy.cluster.hierarchy-style row plus an explicit cluster_size).
HIERARCHY_dtype = np.dtype([
    ("left_node", np.intp),
    ("right_node", np.intp),
    ("value", np.float64),
    ("cluster_size", np.intp),
])

# Structured dtype mirroring the CONDENSED_t packed struct: one parent/child
# edge per row with its lambda value and cluster size.
CONDENSED_dtype = np.dtype([
    ("parent", np.intp),
    ("child", np.intp),
    ("value", np.float64),
    ("cluster_size", np.intp),
])
|
||||
|
||||
cpdef tuple tree_to_labels(
    const HIERARCHY_t[::1] single_linkage_tree,
    cnp.intp_t min_cluster_size=10,
    cluster_selection_method="eom",
    bint allow_single_cluster=False,
    cnp.float64_t cluster_selection_epsilon=0.0,
    max_cluster_size=None,
):
    """Condense a single-linkage tree and extract a flat clustering.

    This is the top-level pipeline: condense the tree with
    `_condense_tree`, score clusters with `_compute_stability`, then select
    clusters and assign labels/probabilities with `_get_clusters`.

    Parameters
    ----------
    single_linkage_tree : ndarray of shape (n_samples - 1,), dtype=HIERARCHY_dtype
        The single-linkage tree (dendrogram) in scipy.cluster.hierarchy
        format.

    min_cluster_size : int, default=10
        Clusters smaller than this are pruned when condensing the tree.

    cluster_selection_method : str, default="eom"
        Either "eom" (Excess of Mass) or "leaf" selection.

    allow_single_cluster : bool, default=False
        Whether the root is allowed to be selected as the single cluster.

    cluster_selection_epsilon : float, default=0.0
        A distance threshold for cluster splits.

    max_cluster_size : int or None, default=None
        The maximum size for clusters located by the EOM clusterer.

    Returns
    -------
    (labels, probabilities) : tuple of ndarray of shape (n_samples,)
        Per-sample cluster label (-1 denotes noise) and cluster membership
        strength.
    """
    cdef:
        cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree
        cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labels
        cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] probabilities

    condensed_tree = _condense_tree(single_linkage_tree, min_cluster_size)
    labels, probabilities = _get_clusters(
        condensed_tree,
        _compute_stability(condensed_tree),
        cluster_selection_method,
        allow_single_cluster,
        cluster_selection_epsilon,
        max_cluster_size,
    )

    return (labels, probabilities)
|
||||
|
||||
cdef list bfs_from_hierarchy(
    const HIERARCHY_t[::1] hierarchy,
    cnp.intp_t bfs_root
):
    """
    Perform a breadth first search on a tree in scipy hclust format.

    Returns the list of all node ids reachable from `bfs_root`, in BFS
    order. Node ids below `n_samples` are leaves (individual samples);
    larger ids are internal merge nodes.
    """

    cdef list process_queue, next_queue, result
    cdef cnp.intp_t n_samples = hierarchy.shape[0] + 1
    cdef cnp.intp_t node
    process_queue = [bfs_root]
    result = []

    while process_queue:
        result.extend(process_queue)
        # By construction, node i is formed by the union of nodes
        # hierarchy[i - n_samples, 0] and hierarchy[i - n_samples, 1].
        # Only internal nodes (>= n_samples) have children to expand.
        process_queue = [
            x - n_samples
            for x in process_queue
            if x >= n_samples
        ]
        if process_queue:
            next_queue = []
            for node in process_queue:
                next_queue.extend(
                    [
                        hierarchy[node].left_node,
                        hierarchy[node].right_node,
                    ]
                )
            process_queue = next_queue
    return result
|
||||
|
||||
|
||||
cpdef cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] _condense_tree(
    const HIERARCHY_t[::1] hierarchy,
    cnp.intp_t min_cluster_size=10
):
    """Condense a tree according to a minimum cluster size. This is akin
    to the runt pruning procedure of Stuetzle. The result is a much simpler
    tree that is easier to visualize. We include extra information on the
    lambda value at which individual points depart clusters for later
    analysis and computation.

    Parameters
    ----------
    hierarchy : ndarray of shape (n_samples,), dtype=HIERARCHY_dtype
        A single linkage hierarchy in scipy.cluster.hierarchy format.

    min_cluster_size : int, optional (default 10)
        The minimum size of clusters to consider. Clusters smaller than this
        are pruned from the tree.

    Returns
    -------
    condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype
        Effectively an edgelist encoding a parent/child pair, along with a
        value and the corresponding cluster_size in each row providing a tree
        structure.
    """

    cdef:
        # The root is the last merge node; node ids run 0..2*(n_samples - 1).
        cnp.intp_t root = 2 * hierarchy.shape[0]
        cnp.intp_t n_samples = hierarchy.shape[0] + 1
        # Condensed-cluster ids start just above the id reserved for the root.
        cnp.intp_t next_label = n_samples + 1
        list result_list, node_list = bfs_from_hierarchy(hierarchy, root)

        # relabel maps original tree node ids to condensed-tree cluster ids.
        cnp.intp_t[::1] relabel
        # ignore marks nodes already emitted as part of a pruned subtree.
        cnp.uint8_t[::1] ignore

        cnp.intp_t node, sub_node, left, right
        cnp.float64_t lambda_value, distance
        cnp.intp_t left_count, right_count
        HIERARCHY_t children

    relabel = np.empty(root + 1, dtype=np.intp)
    relabel[root] = n_samples
    result_list = []
    ignore = np.zeros(len(node_list), dtype=bool)

    for node in node_list:
        # Skip leaves and nodes already folded into an emitted subtree.
        if ignore[node] or node < n_samples:
            continue

        children = hierarchy[node - n_samples]
        left = children.left_node
        right = children.right_node
        distance = children.value
        # lambda = 1 / distance; a zero merge distance maps to infinity.
        if distance > 0.0:
            lambda_value = 1.0 / distance
        else:
            lambda_value = INFTY

        if left >= n_samples:
            left_count = hierarchy[left - n_samples].cluster_size
        else:
            left_count = 1

        if right >= n_samples:
            right_count = <cnp.intp_t> hierarchy[right - n_samples].cluster_size
        else:
            right_count = 1

        if left_count >= min_cluster_size and right_count >= min_cluster_size:
            # True split: both children become new condensed clusters.
            relabel[left] = next_label
            next_label += 1
            result_list.append(
                (relabel[node], relabel[left], lambda_value, left_count)
            )

            relabel[right] = next_label
            next_label += 1
            result_list.append(
                (relabel[node], relabel[right], lambda_value, right_count)
            )

        elif left_count < min_cluster_size and right_count < min_cluster_size:
            # Both children are runts: all their points fall out of the
            # current cluster at this lambda.
            for sub_node in bfs_from_hierarchy(hierarchy, left):
                if sub_node < n_samples:
                    result_list.append(
                        (relabel[node], sub_node, lambda_value, 1)
                    )
                ignore[sub_node] = True

            for sub_node in bfs_from_hierarchy(hierarchy, right):
                if sub_node < n_samples:
                    result_list.append(
                        (relabel[node], sub_node, lambda_value, 1)
                    )
                ignore[sub_node] = True

        elif left_count < min_cluster_size:
            # Only the left child is a runt: the right child inherits the
            # current cluster label and the left child's points fall out.
            relabel[right] = relabel[node]
            for sub_node in bfs_from_hierarchy(hierarchy, left):
                if sub_node < n_samples:
                    result_list.append(
                        (relabel[node], sub_node, lambda_value, 1)
                    )
                ignore[sub_node] = True

        else:
            # Only the right child is a runt: mirror of the branch above.
            relabel[left] = relabel[node]
            for sub_node in bfs_from_hierarchy(hierarchy, right):
                if sub_node < n_samples:
                    result_list.append(
                        (relabel[node], sub_node, lambda_value, 1)
                    )
                ignore[sub_node] = True

    return np.array(result_list, dtype=CONDENSED_dtype)
|
||||
|
||||
|
||||
cdef dict _compute_stability(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree
):
    """Compute the stability of each cluster in the condensed tree.

    Stability of a cluster is the sum over its children of
    (lambda_child - lambda_birth) * cluster_size.

    Returns a dict mapping cluster id -> stability value.
    """

    cdef:
        cnp.float64_t[::1] result, births
        cnp.intp_t[:] parents = condensed_tree['parent']

        cnp.intp_t parent, cluster_size, result_index, idx
        cnp.float64_t lambda_val
        CONDENSED_t condensed_node
        cnp.intp_t largest_child = condensed_tree['child'].max()
        # Cluster ids form a contiguous range starting at the root cluster id.
        cnp.intp_t smallest_cluster = np.min(parents)
        cnp.intp_t num_clusters = np.max(parents) - smallest_cluster + 1
        dict stability_dict = {}

    largest_child = max(largest_child, smallest_cluster)
    # births[c] is the lambda at which node c first appears as a child.
    births = np.full(largest_child + 1, np.nan, dtype=np.float64)

    for idx in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
        condensed_node = condensed_tree[idx]
        births[condensed_node.child] = condensed_node.value

    # The root cluster never appears as a child; it is born at lambda = 0.
    births[smallest_cluster] = 0.0

    result = np.zeros(num_clusters, dtype=np.float64)
    for idx in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
        condensed_node = condensed_tree[idx]
        parent = condensed_node.parent
        lambda_val = condensed_node.value
        cluster_size = condensed_node.cluster_size

        # Accumulate stability into the parent's slot (offset by the root id).
        result_index = parent - smallest_cluster
        result[result_index] += (lambda_val - births[parent]) * cluster_size

    for idx in range(num_clusters):
        stability_dict[idx + smallest_cluster] = result[idx]

    return stability_dict
|
||||
|
||||
|
||||
cdef list bfs_from_cluster_tree(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
    cnp.intp_t bfs_root
):
    """Breadth-first search over the condensed (edgelist) tree.

    Returns the list of all node ids reachable from `bfs_root`,
    including the root itself, in BFS order.
    """

    cdef:
        list result = []
        cnp.ndarray[cnp.intp_t, ndim=1] process_queue = (
            np.array([bfs_root], dtype=np.intp)
        )
        cnp.ndarray[cnp.intp_t, ndim=1] children = condensed_tree['child']
        cnp.intp_t[:] parents = condensed_tree['parent']

    while len(process_queue) > 0:
        result.extend(process_queue.tolist())
        # Next frontier: every child whose parent is in the current frontier.
        process_queue = children[np.isin(parents, process_queue)]

    return result
|
||||
|
||||
|
||||
cdef cnp.float64_t[::1] max_lambdas(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree):
    """Return, for each parent cluster, the maximum lambda of its edges.

    This is the "death" lambda of the cluster. Relies on the condensed tree
    rows being grouped by parent, so a single pass with a running maximum
    per parent suffices.
    """

    cdef:
        cnp.intp_t parent, current_parent, idx
        cnp.float64_t lambda_val, max_lambda
        cnp.float64_t[::1] deaths
        cnp.intp_t largest_parent = condensed_tree['parent'].max()

    deaths = np.zeros(largest_parent + 1, dtype=np.float64)
    current_parent = condensed_tree[0].parent
    max_lambda = condensed_tree[0].value

    for idx in range(1, PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
        parent = condensed_tree[idx].parent
        lambda_val = condensed_tree[idx].value

        if parent == current_parent:
            max_lambda = max(max_lambda, lambda_val)
        else:
            # Parent changed: flush the running maximum and start a new one.
            deaths[current_parent] = max_lambda
            current_parent = parent
            max_lambda = lambda_val

    deaths[current_parent] = max_lambda  # value for last parent
    return deaths
|
||||
|
||||
|
||||
@cython.final
cdef class TreeUnionFind:
    """Union-find (disjoint set) with union by rank and path compression."""

    # data[:, 0] is each node's parent pointer; data[:, 1] is its rank.
    cdef cnp.intp_t[:, ::1] data
    # is_component[x] remains True only while x is still its own root.
    cdef cnp.uint8_t[::1] is_component

    def __init__(self, size):
        cdef cnp.intp_t idx
        self.data = np.zeros((size, 2), dtype=np.intp)
        # Initially every node is its own parent (a singleton component).
        for idx in range(size):
            self.data[idx, 0] = idx
        self.is_component = np.ones(size, dtype=np.uint8)

    cdef void union(self, cnp.intp_t x, cnp.intp_t y):
        # Attach the lower-rank root beneath the higher-rank root; on a tie,
        # x's root wins and its rank increases.
        cdef cnp.intp_t x_root = self.find(x)
        cdef cnp.intp_t y_root = self.find(y)

        if self.data[x_root, 1] < self.data[y_root, 1]:
            self.data[x_root, 0] = y_root
        elif self.data[x_root, 1] > self.data[y_root, 1]:
            self.data[y_root, 0] = x_root
        else:
            self.data[y_root, 0] = x_root
            self.data[x_root, 1] += 1
        return

    cdef cnp.intp_t find(self, cnp.intp_t x):
        # Recursive path compression: point x directly at its root.
        if self.data[x, 0] != x:
            self.data[x, 0] = self.find(self.data[x, 0])
            self.is_component[x] = False
        return self.data[x, 0]
|
||||
|
||||
|
||||
cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labelling_at_cut(
    const HIERARCHY_t[::1] linkage,
    cnp.float64_t cut,
    cnp.intp_t min_cluster_size
):
    """Given a single linkage tree and a cut value, return the
    vector of cluster labels at that cut value. This is useful
    for Robust Single Linkage, and extracting DBSCAN results
    from a single HDBSCAN run.

    Parameters
    ----------
    linkage : ndarray of shape (n_samples,), dtype=HIERARCHY_dtype
        The single linkage tree in scipy.cluster.hierarchy format.

    cut : double
        The cut value at which to find clusters.

    min_cluster_size : int
        The minimum cluster size; clusters below this size at
        the cut will be considered noise.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        The cluster labels for each point in the data set;
        a label of -1 denotes a noise assignment.
    """

    cdef:
        cnp.intp_t n, cluster, root, n_samples, cluster_label
        cnp.intp_t[::1] unique_labels, cluster_size
        cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result
        TreeUnionFind union_find
        dict cluster_label_map
        HIERARCHY_t node

    root = 2 * linkage.shape[0]
    n_samples = root // 2 + 1
    result = np.empty(n_samples, dtype=np.intp)
    union_find = TreeUnionFind(root + 1)

    # Merge only the linkage rows below the cut; each row i corresponds to
    # internal node n_samples + i, tracked by the running `cluster` counter.
    cluster = n_samples
    for node in linkage:
        if node.value < cut:
            union_find.union(node.left_node, cluster)
            union_find.union(node.right_node, cluster)
        cluster += 1

    # First pass: provisional label = union-find root of each sample.
    cluster_size = np.zeros(cluster, dtype=np.intp)
    for n in range(n_samples):
        cluster = union_find.find(n)
        cluster_size[cluster] += 1
        result[n] = cluster

    # Relabel to consecutive ids, demoting undersized clusters to noise.
    cluster_label_map = {-1: NOISE}
    cluster_label = 0
    unique_labels = np.unique(result)

    for cluster in unique_labels:
        if cluster_size[cluster] < min_cluster_size:
            cluster_label_map[cluster] = NOISE
        else:
            cluster_label_map[cluster] = cluster_label
            cluster_label += 1

    for n in range(n_samples):
        result[n] = cluster_label_map[result[n]]

    return result
|
||||
|
||||
|
||||
cpdef cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] _do_labelling(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
    set clusters,
    dict cluster_label_map,
    cnp.intp_t allow_single_cluster,
    cnp.float64_t cluster_selection_epsilon
):
    """Given a condensed tree, clusters and a labeling map for the clusters,
    return an array containing the labels of each point based on cluster
    membership. Note that this is where points may be marked as noisy
    outliers. The determination of some points as noise in large, single-
    cluster datasets is controlled by the `allow_single_cluster` and
    `cluster_selection_epsilon` parameters.

    Parameters
    ----------
    condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype
        Effectively an edgelist encoding a parent/child pair, along with a
        value and the corresponding cluster_size in each row providing a tree
        structure.

    clusters : set
        The set of nodes corresponding to identified clusters. These node
        values should be the same as those present in `condensed_tree`.

    cluster_label_map : dict
        A mapping from the node values present in `clusters` to the labels
        which will be returned.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        The cluster labels for each point in the data set;
        a label of -1 denotes a noise assignment.
    """

    cdef:
        cnp.intp_t root_cluster
        cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] result
        cnp.ndarray[cnp.intp_t, ndim=1] parent_array, child_array
        cnp.ndarray[cnp.float64_t, ndim=1] lambda_array
        TreeUnionFind union_find
        cnp.intp_t n, parent, child, cluster
        cnp.float64_t threshold

    child_array = condensed_tree['child']
    parent_array = condensed_tree['parent']
    lambda_array = condensed_tree['value']

    # The smallest parent id is the root cluster; sample ids are below it.
    root_cluster = np.min(parent_array)
    result = np.empty(root_cluster, dtype=np.intp)
    union_find = TreeUnionFind(np.max(parent_array) + 1)

    # Union each point/cluster with its parent unless the child is itself a
    # selected cluster (selected clusters stay separate components).
    for n in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
        child = child_array[n]
        parent = parent_array[n]
        if child not in clusters:
            union_find.union(parent, child)

    for n in range(root_cluster):
        cluster = union_find.find(n)
        label = NOISE
        if cluster != root_cluster:
            label = cluster_label_map[cluster]
        elif len(clusters) == 1 and allow_single_cluster:
            # There can only be one edge with this particular child hence this
            # expression extracts a unique, scalar lambda value.
            parent_lambda = lambda_array[child_array == n]
            if cluster_selection_epsilon != 0.0:
                threshold = 1 / cluster_selection_epsilon
            else:
                # The threshold should be calculated per-sample based on the
                # largest lambda of any sibling node.
                threshold = lambda_array[parent_array == cluster].max()
            if parent_lambda >= threshold:
                label = cluster_label_map[cluster]

        result[n] = label

    return result
|
||||
|
||||
|
||||
cdef cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] get_probabilities(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
    dict cluster_map,
    cnp.intp_t[::1] labels
):
    """Compute per-point membership strength in the assigned cluster.

    Strength is the point's lambda relative to the cluster's death lambda
    (clipped to [0, 1]). Noise points (label -1) keep probability 0.
    """

    cdef:
        cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] result
        cnp.float64_t[:] lambda_array
        cnp.float64_t[::1] deaths
        cnp.intp_t[:] child_array, parent_array
        cnp.intp_t root_cluster, n, point, cluster_num, cluster
        cnp.float64_t max_lambda, lambda_val

    child_array = condensed_tree['child']
    parent_array = condensed_tree['parent']
    lambda_array = condensed_tree['value']

    result = np.zeros(labels.shape[0])
    deaths = max_lambdas(condensed_tree)
    root_cluster = np.min(parent_array)

    for n in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
        point = child_array[n]
        # Children >= root_cluster are clusters, not individual points.
        if point >= root_cluster:
            continue

        cluster_num = labels[point]
        if cluster_num == -1:
            continue

        cluster = cluster_map[cluster_num]
        max_lambda = deaths[cluster]
        if max_lambda == 0.0 or isinf(lambda_array[n]):
            result[point] = 1.0
        else:
            # Clip to the cluster's death lambda, then normalize.
            lambda_val = min(lambda_array[n], max_lambda)
            result[point] = lambda_val / max_lambda

    return result
|
||||
|
||||
|
||||
cpdef list recurse_leaf_dfs(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree,
    cnp.intp_t current_node
):
    """Return the leaf clusters of the subtree rooted at `current_node`.

    A node with no children in `cluster_tree` is itself a leaf.
    """
    cdef cnp.intp_t[:] children
    cdef cnp.intp_t child

    children = cluster_tree[cluster_tree['parent'] == current_node]['child']
    if children.shape[0] == 0:
        return [current_node,]
    else:
        # Depth-first: concatenate the leaves found under each child.
        return sum([recurse_leaf_dfs(cluster_tree, child) for child in children], [])
|
||||
|
||||
|
||||
cpdef list get_cluster_tree_leaves(cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree):
    """Return all leaf clusters of the cluster tree (empty tree -> [])."""
    cdef cnp.intp_t root
    if PyArray_SHAPE(<cnp.PyArrayObject*> cluster_tree)[0] == 0:
        return []
    # The root cluster has the smallest parent id.
    root = cluster_tree['parent'].min()
    return recurse_leaf_dfs(cluster_tree, root)
|
||||
|
||||
cdef cnp.intp_t traverse_upwards(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree,
    cnp.float64_t cluster_selection_epsilon,
    cnp.intp_t leaf,
    cnp.intp_t allow_single_cluster
):
    """Walk up from `leaf` to the first ancestor whose epsilon
    (1 / lambda) exceeds `cluster_selection_epsilon`.

    Stops at the root: returns the root only when `allow_single_cluster`
    is set, otherwise returns the node closest to the root.
    """
    cdef cnp.intp_t root, parent
    cdef cnp.float64_t parent_eps

    root = cluster_tree['parent'].min()
    parent = cluster_tree[cluster_tree['child'] == leaf]['parent']
    if parent == root:
        if allow_single_cluster:
            return parent
        else:
            return leaf  # return node closest to root

    # epsilon of the parent edge is the inverse of its lambda value.
    parent_eps = 1 / cluster_tree[cluster_tree['child'] == parent]['value']
    if parent_eps > cluster_selection_epsilon:
        return parent
    else:
        return traverse_upwards(
            cluster_tree,
            cluster_selection_epsilon,
            parent,
            allow_single_cluster
        )
|
||||
|
||||
cdef set epsilon_search(
    set leaves,
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree,
    cnp.float64_t cluster_selection_epsilon,
    cnp.intp_t allow_single_cluster
):
    """Replace leaves whose epsilon is below the threshold by the nearest
    ancestor satisfying it (epsilon-based cluster selection).

    Leaves already covered by a promoted ancestor are skipped via the
    `processed` list.
    """
    cdef:
        list selected_clusters = list()
        list processed = list()
        cnp.intp_t leaf, epsilon_child, sub_node
        cnp.float64_t eps
        cnp.uint8_t[:] leaf_nodes
        cnp.ndarray[cnp.intp_t, ndim=1] children = cluster_tree['child']
        cnp.ndarray[cnp.float64_t, ndim=1] distances = cluster_tree['value']

    for leaf in leaves:
        leaf_nodes = children == leaf
        # epsilon of the leaf's incoming edge is the inverse of its lambda.
        eps = 1 / distances[leaf_nodes][0]
        if eps < cluster_selection_epsilon:
            if leaf not in processed:
                epsilon_child = traverse_upwards(
                    cluster_tree,
                    cluster_selection_epsilon,
                    leaf,
                    allow_single_cluster
                )
                selected_clusters.append(epsilon_child)

                # Mark the promoted ancestor's whole subtree as handled so
                # sibling leaves are not promoted again.
                for sub_node in bfs_from_cluster_tree(cluster_tree, epsilon_child):
                    if sub_node != epsilon_child:
                        processed.append(sub_node)
        else:
            selected_clusters.append(leaf)

    return set(selected_clusters)
|
||||
|
||||
|
||||
@cython.wraparound(True)
cdef tuple _get_clusters(
    cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] condensed_tree,
    dict stability,
    cluster_selection_method='eom',
    cnp.uint8_t allow_single_cluster=False,
    cnp.float64_t cluster_selection_epsilon=0.0,
    max_cluster_size=None
):
    """Given a tree and stability dict, produce the cluster labels
    (and probabilities) for a flat clustering based on the chosen
    cluster selection method.

    Parameters
    ----------
    condensed_tree : ndarray of shape (n_samples,), dtype=CONDENSED_dtype
        Effectively an edgelist encoding a parent/child pair, along with a
        value and the corresponding cluster_size in each row providing a tree
        structure.

    stability : dict
        A dictionary mapping cluster_ids to stability values.
        NOTE: modified in place — a node rejected by the EOM criterion has
        its stability replaced by the sum of its children's stabilities.

    cluster_selection_method : string, optional (default 'eom')
        The method of selecting clusters. The default is the
        Excess of Mass algorithm specified by 'eom'. The alternate
        option is 'leaf'.

    allow_single_cluster : boolean, optional (default False)
        Whether to allow a single cluster to be selected by the
        Excess of Mass algorithm.

    cluster_selection_epsilon: double, optional (default 0.0)
        A distance threshold for cluster splits.

    max_cluster_size: int, default=None
        The maximum size for clusters located by the EOM clusterer. Can
        be overridden by the cluster_selection_epsilon parameter in
        rare cases.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        An integer array of cluster labels, with -1 denoting noise.

    probabilities : ndarray (n_samples,)
        The cluster membership strength of each sample.
    """
    cdef:
        list node_list
        cnp.ndarray[CONDENSED_t, ndim=1, mode='c'] cluster_tree
        cnp.uint8_t[::1] child_selection
        cnp.ndarray[cnp.intp_t, ndim=1, mode='c'] labels
        dict is_cluster, cluster_sizes
        cnp.float64_t subtree_stability
        cnp.intp_t node, sub_node, cluster, n_samples
        cnp.ndarray[cnp.float64_t, ndim=1, mode='c'] probs

    # Assume clusters are ordered by numeric id equivalent to
    # a topological sort of the tree; This is valid given the
    # current implementation above, so don't change that ... or
    # if you do, change this accordingly!
    if allow_single_cluster:
        node_list = sorted(stability.keys(), reverse=True)
    else:
        node_list = sorted(stability.keys(), reverse=True)[:-1]
        # (exclude root)

    # Rows with cluster_size > 1 describe cluster/cluster edges; rows with
    # cluster_size == 1 attach individual samples ('child' is a sample index).
    cluster_tree = condensed_tree[condensed_tree['cluster_size'] > 1]
    is_cluster = {cluster: True for cluster in node_list}
    n_samples = np.max(condensed_tree[condensed_tree['cluster_size'] == 1]['child']) + 1

    if max_cluster_size is None:
        max_cluster_size = n_samples + 1  # Set to a value that will never be triggered
    cluster_sizes = {
        child: cluster_size for child, cluster_size
        in zip(cluster_tree['child'], cluster_tree['cluster_size'])
    }
    if allow_single_cluster:
        # Compute cluster size for the root node
        cluster_sizes[node_list[-1]] = np.sum(
            cluster_tree[cluster_tree['parent'] == node_list[-1]]['cluster_size'])

    if cluster_selection_method == 'eom':
        # Bottom-up (node_list is sorted descending): a node survives only if
        # it is at least as stable as the sum of its children.
        for node in node_list:
            child_selection = (cluster_tree['parent'] == node)
            subtree_stability = np.sum([
                stability[child] for
                child in cluster_tree['child'][child_selection]])
            if subtree_stability > stability[node] or cluster_sizes[node] > max_cluster_size:
                is_cluster[node] = False
                stability[node] = subtree_stability
            else:
                # Node wins over its subtree: deselect every descendant.
                for sub_node in bfs_from_cluster_tree(cluster_tree, node):
                    if sub_node != node:
                        is_cluster[sub_node] = False

        # Optional epsilon refinement of the EOM selection (skipped when the
        # cluster tree is empty).
        if cluster_selection_epsilon != 0.0 and PyArray_SHAPE(<cnp.PyArrayObject*> cluster_tree)[0] > 0:
            eom_clusters = [c for c in is_cluster if is_cluster[c]]
            selected_clusters = []
            # first check if eom_clusters only has root node, which skips epsilon check.
            if (len(eom_clusters) == 1 and eom_clusters[0] == cluster_tree['parent'].min()):
                if allow_single_cluster:
                    selected_clusters = eom_clusters
            else:
                selected_clusters = epsilon_search(
                    set(eom_clusters),
                    cluster_tree,
                    cluster_selection_epsilon,
                    allow_single_cluster
                )
            for c in is_cluster:
                if c in selected_clusters:
                    is_cluster[c] = True
                else:
                    is_cluster[c] = False

    elif cluster_selection_method == 'leaf':
        leaves = set(get_cluster_tree_leaves(cluster_tree))
        if len(leaves) == 0:
            # Degenerate tree: fall back to selecting only the root.
            for c in is_cluster:
                is_cluster[c] = False
            is_cluster[condensed_tree['parent'].min()] = True

        if cluster_selection_epsilon != 0.0:
            selected_clusters = epsilon_search(
                leaves,
                cluster_tree,
                cluster_selection_epsilon,
                allow_single_cluster
            )
        else:
            selected_clusters = leaves

        for c in is_cluster:
            if c in selected_clusters:
                is_cluster[c] = True
            else:
                is_cluster[c] = False

    # Re-number the surviving clusters as contiguous labels 0..n_clusters-1.
    clusters = set([c for c in is_cluster if is_cluster[c]])
    cluster_map = {c: n for n, c in enumerate(sorted(list(clusters)))}
    reverse_cluster_map = {n: c for c, n in cluster_map.items()}

    labels = _do_labelling(
        condensed_tree,
        clusters,
        cluster_map,
        allow_single_cluster,
        cluster_selection_epsilon
    )
    probs = get_probabilities(condensed_tree, reverse_cluster_map, labels)

    return (labels, probs)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,16 @@
|
||||
# Map each Cython extension of sklearn/cluster/_hdbscan to its source files.
# '_linkage' additionally lists metrics_cython_tree (defined elsewhere in the
# build) so it tracks the shared distance-metric Cython sources.
cluster_hdbscan_extension_metadata = {
  '_linkage': {'sources': ['_linkage.pyx', metrics_cython_tree]},
  '_reachability': {'sources': ['_reachability.pyx']},
  '_tree': {'sources': ['_tree.pyx']}
}

# Build and install one Python extension module per entry above, all linked
# against NumPy and compiled with the project-wide Cython arguments.
foreach ext_name, ext_dict : cluster_hdbscan_extension_metadata
  py.extension_module(
    ext_name,
    ext_dict.get('sources'),
    dependencies: [np_dep],
    cython_args: cython_args,
    subdir: 'sklearn/cluster/_hdbscan',
    install: true
  )
endforeach
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,63 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.cluster._hdbscan._reachability import mutual_reachability_graph
|
||||
from sklearn.utils._testing import (
|
||||
_convert_container,
|
||||
assert_allclose,
|
||||
)
|
||||
|
||||
|
||||
def test_mutual_reachability_graph_error_sparse_format():
    """Non-CSR sparse input must be rejected with an informative ValueError."""
    random_state = np.random.RandomState(0)
    raw = random_state.randn(10, 10)
    # Symmetric positive semi-definite matrix with a zero diagonal, i.e. a
    # plausible pairwise-distance matrix.
    distance_matrix = raw.T @ raw
    np.fill_diagonal(distance_matrix, 0.0)
    distance_matrix = _convert_container(distance_matrix, "sparse_csc")

    with pytest.raises(ValueError, match="Only sparse CSR matrices are supported"):
        mutual_reachability_graph(distance_matrix)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("array_type", ["array", "sparse_csr"])
def test_mutual_reachability_graph_inplace(array_type):
    """The graph must be computed in place: output is the input object."""
    random_state = np.random.RandomState(0)
    raw = random_state.randn(10, 10)
    distance_matrix = raw.T @ raw
    np.fill_diagonal(distance_matrix, 0.0)
    distance_matrix = _convert_container(distance_matrix, array_type)

    mr_graph = mutual_reachability_graph(distance_matrix)

    # Identity check (equivalent to comparing id()): no copy was made.
    assert mr_graph is distance_matrix
|
||||
|
||||
|
||||
def test_mutual_reachability_graph_equivalence_dense_sparse():
    """Dense and sparse code paths must agree numerically."""
    random_state = np.random.RandomState(0)
    raw = random_state.randn(5, 5)
    X_dense = raw.T @ raw
    X_sparse = _convert_container(X_dense, "sparse_csr")

    dense_result = mutual_reachability_graph(X_dense, min_samples=3)
    sparse_result = mutual_reachability_graph(X_sparse, min_samples=3)

    assert_allclose(dense_result, sparse_result.toarray())
|
||||
|
||||
|
||||
@pytest.mark.parametrize("array_type", ["array", "sparse_csr"])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_mutual_reachability_graph_preserve_dtype(array_type, dtype):
    """Fused types should make the computation preserve the input dtype."""
    random_state = np.random.RandomState(0)
    raw = random_state.randn(10, 10)
    distance_matrix = (raw.T @ raw).astype(dtype)
    np.fill_diagonal(distance_matrix, 0.0)
    distance_matrix = _convert_container(distance_matrix, array_type)
    assert distance_matrix.dtype == dtype

    mr_graph = mutual_reachability_graph(distance_matrix)

    assert mr_graph.dtype == dtype
|
||||
Binary file not shown.
@@ -0,0 +1,9 @@
|
||||
from ..utils._typedefs cimport intp_t
|
||||
|
||||
cdef class UnionFind:
    # Label that will be assigned to the next internal node created by union().
    cdef intp_t next_label
    # parent[i] is the parent of node i; -1 marks a current root
    # (see fast_find in _hierarchy.pyx).
    cdef intp_t[:] parent
    # size[i] is the number of original points contained under node i.
    cdef intp_t[:] size

    # Merge the two roots m and n under a freshly labelled internal node.
    cdef void union(self, intp_t m, intp_t n) noexcept
    # Return the current root of n, compressing the path along the way.
    cdef intp_t fast_find(self, intp_t n) noexcept
|
||||
@@ -0,0 +1,506 @@
|
||||
# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
|
||||
|
||||
import numpy as np
|
||||
cimport cython
|
||||
|
||||
from ..metrics._dist_metrics cimport DistanceMetric64
|
||||
from ..utils._fast_dict cimport IntFloatDict
|
||||
from ..utils._typedefs cimport float64_t, intp_t, uint8_t
|
||||
|
||||
# C++
|
||||
from cython.operator cimport dereference as deref, preincrement as inc
|
||||
from libcpp.map cimport map as cpp_map
|
||||
from libc.math cimport fmax, INFINITY
|
||||
|
||||
|
||||
###############################################################################
|
||||
# Utilities for computing the ward momentum
|
||||
|
||||
def compute_ward_dist(
    const float64_t[::1] m_1,
    const float64_t[:, ::1] m_2,
    const intp_t[::1] coord_row,
    const intp_t[::1] coord_col,
    float64_t[::1] res
):
    """Compute the Ward merge cost for each requested pair of clusters.

    For every k, the pair ``(coord_row[k], coord_col[k])`` is scored as
    ``w * ||m_2[row] / m_1[row] - m_2[col] / m_1[col]||^2`` where
    ``w = m_1[row] * m_1[col] / (m_1[row] + m_1[col])``; the score is
    written into ``res[k]`` in place.
    """
    cdef intp_t n_pairs = coord_row.shape[0]
    cdef intp_t n_features = m_2.shape[1]
    cdef intp_t pair_idx, feature_idx, row, col
    cdef float64_t squared_dist, pair_weight

    for pair_idx in range(n_pairs):
        row = coord_row[pair_idx]
        col = coord_col[pair_idx]
        # Harmonic-mean style weight of the two cluster masses.
        pair_weight = (m_1[row] * m_1[col]) / (m_1[row] + m_1[col])
        squared_dist = 0.
        for feature_idx in range(n_features):
            squared_dist += (
                m_2[row, feature_idx] / m_1[row]
                - m_2[col, feature_idx] / m_1[col]
            ) ** 2
        res[pair_idx] = squared_dist * pair_weight
|
||||
|
||||
|
||||
###############################################################################
|
||||
# Utilities for cutting and exploring a hierarchical tree
|
||||
|
||||
def _hc_get_descendent(intp_t node, children, intp_t n_leaves):
    """
    Return all descendent leaves of ``node`` in the tree.

    Parameters
    ----------
    node : integer
        The node whose descendent leaves are requested.

    children : list of pairs, length n_nodes
        The children of each non-leaf node. Values less than `n_samples` refer
        to leaves of the tree. A greater value `i` indicates a node with
        children `children[i - n_samples]`.

    n_leaves : integer
        Number of leaves.

    Returns
    -------
    descendent : list of int
    """
    stack = [node]
    if node < n_leaves:
        return stack
    leaves_found = []

    # Track the pending count manually instead of calling len() on the
    # frequently mutated list inside the loop.
    cdef intp_t current, n_pending = 1

    while n_pending:
        current = stack.pop()
        if current < n_leaves:
            leaves_found.append(current)
            n_pending -= 1
        else:
            # Replace one internal node by its two children on the stack.
            stack.extend(children[current - n_leaves])
            n_pending += 1
    return leaves_found
|
||||
|
||||
|
||||
def hc_get_heads(intp_t[:] parents, copy=True):
    """Return the head of the forest tree containing each node.

    Parameters
    ----------
    parents : array of integers
        The parent structure defining the forest (ensemble of trees);
        a head is its own parent.
    copy : boolean
        If copy is False, the input 'parents' array is modified inplace.

    Returns
    -------
    heads : array of integers of same shape as parents
        The indices in the 'parents' of the tree heads.

    """
    cdef intp_t ancestor, start, current, n_nodes
    if copy:
        parents = np.copy(parents)
    n_nodes = parents.size

    # Start from the top of the tree and go down: follow parent pointers
    # until reaching a fixed point (a node that is its own parent),
    # rewriting parents[start] along the way.
    for start in range(n_nodes - 1, -1, -1):
        current = start
        ancestor = parents[current]
        while ancestor != current:
            parents[start] = ancestor
            current = ancestor
            ancestor = parents[current]
    return parents
|
||||
|
||||
|
||||
def _get_parents(
    nodes,
    heads,
    const intp_t[:] parents,
    uint8_t[::1] not_visited
):
    """Append to ``heads`` the head of each tree containing a node of ``nodes``.

    Modifies 'heads' and 'not_visited' in-place.

    Parameters
    ----------
    nodes : list of integers
        The nodes to start from.
    heads : list of integers
        A list to hold the results (modified inplace).
    parents : array of integers
        The parent structure defining the tree.
    not_visited : array of 0/1 flags
        The tree nodes to consider (modified inplace).

    """
    cdef intp_t ancestor, current

    for current in nodes:
        # Climb until reaching a fixed point: the head of this tree.
        ancestor = parents[current]
        while ancestor != current:
            current = ancestor
            ancestor = parents[current]
        # Report each head at most once.
        if not_visited[current]:
            not_visited[current] = 0
            heads.append(current)
|
||||
|
||||
|
||||
###############################################################################
|
||||
# merge strategies implemented on IntFloatDicts
|
||||
|
||||
# These are used in the hierarchical clustering code, to implement
|
||||
# merging between two clusters, defined as a dict containing node number
|
||||
# as keys and edge weights as values.
|
||||
|
||||
|
||||
def max_merge(
    IntFloatDict a,
    IntFloatDict b,
    const intp_t[:] mask,
    intp_t n_a,
    intp_t n_b
):
    """Merge two IntFloatDicts with the max strategy: when the same key is
    present in the two dicts, the max of the two values is used.

    Parameters
    ==========
    a, b : IntFloatDict object
        The IntFloatDicts to merge
    mask : ndarray array of dtype integer and of dimension 1
        a mask for keys to ignore: if not mask[key] the corresponding key
        is skipped in the output dictionary
    n_a, n_b : float
        n_a and n_b are weights for a and b for the merge strategy.
        They are not used in the case of a max merge.

    Returns
    =======
    out : IntFloatDict object
        The IntFloatDict resulting from the merge
    """
    # __new__ skips IntFloatDict.__init__: out starts with an empty C++ map.
    cdef IntFloatDict out_obj = IntFloatDict.__new__(IntFloatDict)
    cdef cpp_map[intp_t, float64_t].iterator a_it = a.my_map.begin()
    cdef cpp_map[intp_t, float64_t].iterator a_end = a.my_map.end()
    cdef intp_t key
    cdef float64_t value
    # First copy a into out
    while a_it != a_end:
        key = deref(a_it).first
        if mask[key]:
            out_obj.my_map[key] = deref(a_it).second
        inc(a_it)

    # Then merge b into out
    cdef cpp_map[intp_t, float64_t].iterator out_it = out_obj.my_map.begin()
    cdef cpp_map[intp_t, float64_t].iterator out_end = out_obj.my_map.end()
    cdef cpp_map[intp_t, float64_t].iterator b_it = b.my_map.begin()
    cdef cpp_map[intp_t, float64_t].iterator b_end = b.my_map.end()
    while b_it != b_end:
        key = deref(b_it).first
        value = deref(b_it).second
        if mask[key]:
            out_it = out_obj.my_map.find(key)
            if out_it == out_end:
                # Key not found
                out_obj.my_map[key] = value
            else:
                # Key present in both dicts: keep the larger value.
                deref(out_it).second = fmax(deref(out_it).second, value)
        inc(b_it)
    return out_obj
|
||||
|
||||
|
||||
def average_merge(
    IntFloatDict a,
    IntFloatDict b,
    const intp_t[:] mask,
    intp_t n_a,
    intp_t n_b
):
    """Merge two IntFloatDicts with the average strategy: when the
    same key is present in the two dicts, the weighted average of the two
    values is used.

    Parameters
    ==========
    a, b : IntFloatDict object
        The IntFloatDicts to merge
    mask : ndarray array of dtype integer and of dimension 1
        a mask for keys to ignore: if not mask[key] the corresponding key
        is skipped in the output dictionary
    n_a, n_b : float
        n_a and n_b are weights for a and b for the merge strategy.
        They are used for a weighted mean.

    Returns
    =======
    out : IntFloatDict object
        The IntFloatDict resulting from the merge
    """
    # __new__ skips IntFloatDict.__init__: out starts with an empty C++ map.
    cdef IntFloatDict out_obj = IntFloatDict.__new__(IntFloatDict)
    cdef cpp_map[intp_t, float64_t].iterator a_it = a.my_map.begin()
    cdef cpp_map[intp_t, float64_t].iterator a_end = a.my_map.end()
    cdef intp_t key
    cdef float64_t value
    # Total weight used as the denominator of the weighted mean below.
    cdef float64_t n_out = <float64_t> (n_a + n_b)
    # First copy a into out
    while a_it != a_end:
        key = deref(a_it).first
        if mask[key]:
            out_obj.my_map[key] = deref(a_it).second
        inc(a_it)

    # Then merge b into out
    cdef cpp_map[intp_t, float64_t].iterator out_it = out_obj.my_map.begin()
    cdef cpp_map[intp_t, float64_t].iterator out_end = out_obj.my_map.end()
    cdef cpp_map[intp_t, float64_t].iterator b_it = b.my_map.begin()
    cdef cpp_map[intp_t, float64_t].iterator b_end = b.my_map.end()
    while b_it != b_end:
        key = deref(b_it).first
        value = deref(b_it).second
        if mask[key]:
            out_it = out_obj.my_map.find(key)
            if out_it == out_end:
                # Key not found
                out_obj.my_map[key] = value
            else:
                # Key present in both dicts: take the n_a/n_b weighted mean.
                deref(out_it).second = (n_a * deref(out_it).second
                                        + n_b * value) / n_out
        inc(b_it)
    return out_obj
|
||||
|
||||
|
||||
###############################################################################
|
||||
# An edge object for fast comparisons
|
||||
|
||||
cdef class WeightedEdge:
    """An edge (a, b) with a float weight, ordered solely by weight.

    Comparisons ignore the endpoints, so edges can be sorted or heapified
    by weight directly.
    """
    cdef public intp_t a
    cdef public intp_t b
    cdef public float64_t weight

    def __init__(self, float64_t weight, intp_t a, intp_t b):
        self.weight = weight
        self.a = a
        self.b = b

    def __richcmp__(self, WeightedEdge other, int op):
        """Cython-specific comparison method.

        op is the comparison code::
            <   0
            ==  2
            >   4
            <=  1
            !=  3
            >=  5
        """
        # Dispatch on the integer opcode; all comparisons delegate to weight.
        if op == 0:
            return self.weight < other.weight
        elif op == 1:
            return self.weight <= other.weight
        elif op == 2:
            return self.weight == other.weight
        elif op == 3:
            return self.weight != other.weight
        elif op == 4:
            return self.weight > other.weight
        elif op == 5:
            return self.weight >= other.weight

    def __repr__(self):
        return "%s(weight=%f, a=%i, b=%i)" % (self.__class__.__name__,
                                              self.weight,
                                              self.a, self.b)
|
||||
|
||||
|
||||
################################################################################
|
||||
# Efficient labelling/conversion of MSTs to single linkage hierarchies
|
||||
|
||||
cdef class UnionFind(object):
    """Union-find over the nodes of a growing single-linkage tree.

    Leaves are labelled 0..N-1; each call to union() creates a new internal
    node labelled next_label (starting at N), matching the numbering used by
    scipy.cluster.hierarchy linkage output.
    """

    def __init__(self, N):
        # parent == -1 marks a node that currently has no parent (a root).
        self.parent = np.full(2 * N - 1, -1., dtype=np.intp, order='C')
        self.next_label = N
        # Leaves carry size 1; internal-node sizes are filled in by union().
        self.size = np.hstack((np.ones(N, dtype=np.intp),
                               np.zeros(N - 1, dtype=np.intp)))

    cdef void union(self, intp_t m, intp_t n) noexcept:
        # Attach both roots m and n under a freshly labelled internal node.
        self.parent[m] = self.next_label
        self.parent[n] = self.next_label
        self.size[self.next_label] = self.size[m] + self.size[n]
        self.next_label += 1
        return

    @cython.wraparound(True)
    cdef intp_t fast_find(self, intp_t n) noexcept:
        # Find the root of n, then compress the path from the original n
        # (kept in p) so later lookups are O(1)-ish.
        cdef intp_t p
        p = n
        # find the highest node in the linkage graph so far
        while self.parent[n] != -1:
            n = self.parent[n]
        # provide a shortcut up to the highest node
        while self.parent[p] != n:
            p, self.parent[p] = self.parent[p], n
        return n
|
||||
|
||||
|
||||
def _single_linkage_label(const float64_t[:, :] L):
    """
    Convert an linkage array or MST to a tree by labelling clusters at merges.
    This is done by using a Union find structure to keep track of merges
    efficiently. This is the private version of the function that assumes that
    ``L`` has been properly validated. See ``single_linkage_label`` for the
    user facing version of this function.

    Parameters
    ----------
    L: array of shape (n_samples - 1, 3)
        The linkage array or MST where each row specifies two samples
        to be merged and a distance or weight at which the merge occurs. This
        array is assumed to be sorted by the distance/weight.

    Returns
    -------
    A tree in the format used by scipy.cluster.hierarchy.
    """

    cdef float64_t[:, ::1] result_arr

    cdef intp_t left, left_cluster, right, right_cluster, index
    cdef float64_t delta

    # Output rows are (left_cluster, right_cluster, distance, new_size),
    # i.e. the scipy linkage-matrix layout.
    result_arr = np.zeros((L.shape[0], 4), dtype=np.float64)
    U = UnionFind(L.shape[0] + 1)

    for index in range(L.shape[0]):

        left = <intp_t> L[index, 0]
        right = <intp_t> L[index, 1]
        delta = L[index, 2]

        # Resolve each endpoint to the cluster it currently belongs to.
        left_cluster = U.fast_find(left)
        right_cluster = U.fast_find(right)

        result_arr[index][0] = left_cluster
        result_arr[index][1] = right_cluster
        result_arr[index][2] = delta
        result_arr[index][3] = U.size[left_cluster] + U.size[right_cluster]

        # Record the merge so subsequent rows see the combined cluster.
        U.union(left_cluster, right_cluster)

    return np.asarray(result_arr)
|
||||
|
||||
|
||||
@cython.wraparound(True)
def single_linkage_label(L):
    """
    Convert an linkage array or MST to a tree by labelling clusters at merges.
    This is done by using a Union find structure to keep track of merges
    efficiently.

    Parameters
    ----------
    L: array of shape (n_samples - 1, 3)
        The linkage array or MST where each row specifies two samples
        to be merged and a distance or weight at which the merge occurs. This
        array is assumed to be sorted by the distance/weight.

    Returns
    -------
    A tree in the format used by scipy.cluster.hierarchy.

    Raises
    ------
    ValueError
        If the sample indices in ``L`` are out of range, or the rows are not
        sorted by weight.
    """
    # Validate L: every index must refer to a leaf or a merged node, of which
    # there are at most 2 * n_samples - 1.
    if L[:, :2].min() < 0 or L[:, :2].max() >= 2 * L.shape[0] + 1:
        raise ValueError("Input MST array is not a validly formatted MST array")

    # Merges must come in nondecreasing weight order.  (Direct vectorized
    # check; the previous code assigned a lambda to a name, PEP 8 E731.)
    if not np.all(L[:-1, 2] <= L[1:, 2]):
        raise ValueError("Input MST array must be sorted by weight")

    return _single_linkage_label(L)
|
||||
|
||||
|
||||
# Implements MST-LINKAGE-CORE from https://arxiv.org/abs/1109.2378
def mst_linkage_core(
        const float64_t [:, ::1] raw_data,
        DistanceMetric64 dist_metric):
    """
    Compute the necessary elements of a minimum spanning
    tree for computation of single linkage clustering. This
    represents the MST-LINKAGE-CORE algorithm (Figure 6) from
    :arxiv:`Daniel Mullner, "Modern hierarchical, agglomerative clustering
    algorithms" <1109.2378>`.

    In contrast to the scipy implementation is never computes
    a full distance matrix, generating distances only as they
    are needed and releasing them when no longer needed.

    Parameters
    ----------
    raw_data: array of shape (n_samples, n_features)
        The array of feature data to be clustered. Must be C-aligned

    dist_metric: DistanceMetric64
        A DistanceMetric64 object conforming to the API from
        ``sklearn.metrics._dist_metrics.pxd`` that will be
        used to compute distances.

    Returns
    -------
    mst_core_data: array of shape (n_samples, 3)
        An array providing information from which one
        can either compute an MST, or the linkage hierarchy
        very efficiently. See :arxiv:`Daniel Mullner, "Modern hierarchical,
        agglomerative clustering algorithms" <1109.2378>` algorithm
        MST-LINKAGE-CORE for more details.
    """
    cdef:
        intp_t n_samples = raw_data.shape[0]
        # 1 once a sample has been added to the growing tree.
        uint8_t[:] in_tree = np.zeros(n_samples, dtype=bool)
        # One row (from_node, to_node, distance) per tree edge.
        float64_t[:, ::1] result = np.zeros((n_samples - 1, 3))

        intp_t current_node = 0
        intp_t new_node
        intp_t i
        intp_t j
        intp_t num_features = raw_data.shape[1]

        float64_t right_value
        float64_t left_value
        float64_t new_distance

        # current_distances[j] caches the smallest distance seen so far from
        # any in-tree node to the out-of-tree sample j.
        float64_t[:] current_distances = np.full(n_samples, INFINITY)

    # Grow the tree one node per iteration, always attaching the closest
    # remaining sample (Prim-style construction per the cited algorithm).
    for i in range(n_samples - 1):

        in_tree[current_node] = 1

        new_distance = INFINITY
        new_node = 0

        for j in range(n_samples):
            if in_tree[j]:
                continue

            # Tighten the cached distance of j using the newest tree node.
            right_value = current_distances[j]
            left_value = dist_metric.dist(&raw_data[current_node, 0],
                                          &raw_data[j, 0],
                                          num_features)

            if left_value < right_value:
                current_distances[j] = left_value

            # Track the overall closest candidate to attach next.
            if current_distances[j] < new_distance:
                new_distance = current_distances[j]
                new_node = j

        result[i, 0] = current_node
        result[i, 1] = new_node
        result[i, 2] = new_distance
        current_node = new_node

    return np.array(result)
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user