library packages

2024-09-28 22:56:00 -07:00
parent 64d9b78b3a
commit 1973934e95
4893 changed files with 1184173 additions and 31 deletions
--- a/.venv/lib/python3.12/site-packages/sklearn/neural_network/__init__.py
+++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/__init__.py
@@ -0,0 +1,8 @@
+"""Models based on neural networks."""
+
+# License: BSD 3 clause
+
+from ._multilayer_perceptron import MLPClassifier, MLPRegressor
+from ._rbm import BernoulliRBM
+
+# Names exported by `from sklearn.neural_network import *`.
+__all__ = ["BernoulliRBM", "MLPClassifier", "MLPRegressor"]
--- a/.venv/lib/python3.12/site-packages/sklearn/neural_network/__pycache__/__init__.cpython-312.pyc
+++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/__pycache__/__init__.cpython-312.pyc
--- a/.venv/lib/python3.12/site-packages/sklearn/neural_network/__pycache__/_base.cpython-312.pyc
+++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/__pycache__/_base.cpython-312.pyc
--- a/.venv/lib/python3.12/site-packages/sklearn/neural_network/__pycache__/_multilayer_perceptron.cpython-312.pyc
+++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/__pycache__/_multilayer_perceptron.cpython-312.pyc
--- a/.venv/lib/python3.12/site-packages/sklearn/neural_network/__pycache__/_rbm.cpython-312.pyc
+++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/__pycache__/_rbm.cpython-312.pyc
--- a/.venv/lib/python3.12/site-packages/sklearn/neural_network/__pycache__/_stochastic_optimizers.cpython-312.pyc
+++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/__pycache__/_stochastic_optimizers.cpython-312.pyc
--- a/.venv/lib/python3.12/site-packages/sklearn/neural_network/_base.py
+++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/_base.py
@@ -0,0 +1,235 @@
+"""Utilities for the neural network modules"""
+
+# Author: Issam H. Laradji <issam.laradji@gmail.com>
+# License: BSD 3 clause
+
+import numpy as np
+from scipy.special import expit as logistic_sigmoid
+from scipy.special import xlogy
+
+
+def inplace_identity(X):
+    """Identity activation: a deliberate no-op that leaves `X` untouched.
+
+    Parameters
+    ----------
+    X : {array-like, sparse matrix}, shape (n_samples, n_features)
+        Data, where `n_samples` is the number of samples
+        and `n_features` is the number of features.
+    """
+    return None
+
+
+def inplace_logistic(X):
+    """Overwrite `X` with its element-wise logistic sigmoid.
+
+    Parameters
+    ----------
+    X : {array-like, sparse matrix}, shape (n_samples, n_features)
+        The input data. It is used as the output buffer, so it is
+        modified in place; nothing is returned.
+    """
+    logistic_sigmoid(X, X)
+
+
+def inplace_tanh(X):
+    """Overwrite `X` with its element-wise hyperbolic tangent.
+
+    Parameters
+    ----------
+    X : {array-like, sparse matrix}, shape (n_samples, n_features)
+        The input data. It is used as the output buffer, so it is
+        modified in place; nothing is returned.
+    """
+    np.tanh(X, X)
+
+
+def inplace_relu(X):
+    """Overwrite `X` with its rectified linear unit, ``max(X, 0)``.
+
+    Parameters
+    ----------
+    X : {array-like, sparse matrix}, shape (n_samples, n_features)
+        The input data. It is used as the output buffer, so it is
+        modified in place; nothing is returned.
+    """
+    np.maximum(X, 0, X)
+
+
+def inplace_softmax(X):
+    """Overwrite every row of `X` with its K-way softmax.
+
+    Parameters
+    ----------
+    X : {array-like, sparse matrix}, shape (n_samples, n_features)
+        The input data; modified in place, nothing is returned.
+    """
+    # Shift each row by its maximum before exponentiating so that the
+    # largest exponent is exp(0).
+    row_max = X.max(axis=1)
+    shifted = X - row_max[:, np.newaxis]
+    np.exp(shifted, out=X)
+    # Normalise each row so that it sums to one.
+    row_sum = X.sum(axis=1)
+    X /= row_sum[:, np.newaxis]
+
+
+# Lookup table from activation name to the function that applies that
+# activation in place.
+ACTIVATIONS = {
+    "identity": inplace_identity,
+    "tanh": inplace_tanh,
+    "logistic": inplace_logistic,
+    "relu": inplace_relu,
+    "softmax": inplace_softmax,
+}
+
+
+def inplace_identity_derivative(Z, delta):
+    """Derivative of the identity activation: `delta` is left as it is.
+
+    Parameters
+    ----------
+    Z : {array-like, sparse matrix}, shape (n_samples, n_features)
+        The data which was output from the identity activation function during
+        the forward pass.
+
+    delta : {array-like}, shape (n_samples, n_features)
+         The backpropagated error signal; unchanged by this function.
+    """
+    return None
+
+
+def inplace_logistic_derivative(Z, delta):
+    """Scale `delta` in place by the derivative of the logistic sigmoid.
+
+    The derivative is expressed through the activation output itself:
+    ``Z * (1 - Z)``.
+
+    Parameters
+    ----------
+    Z : {array-like, sparse matrix}, shape (n_samples, n_features)
+        The data which was output from the logistic activation function during
+        the forward pass.
+
+    delta : {array-like}, shape (n_samples, n_features)
+         The backpropagated error signal to be modified inplace.
+    """
+    np.multiply(delta, Z, out=delta)
+    np.multiply(delta, 1 - Z, out=delta)
+
+
+def inplace_tanh_derivative(Z, delta):
+    """Scale `delta` in place by the derivative of the hyperbolic tangent.
+
+    The derivative is expressed through the activation output itself:
+    ``1 - Z ** 2``.
+
+    Parameters
+    ----------
+    Z : {array-like, sparse matrix}, shape (n_samples, n_features)
+        The data which was output from the hyperbolic tangent activation
+        function during the forward pass.
+
+    delta : {array-like}, shape (n_samples, n_features)
+         The backpropagated error signal to be modified inplace.
+    """
+    slope = 1 - Z**2
+    np.multiply(delta, slope, out=delta)
+
+
+def inplace_relu_derivative(Z, delta):
+    """Zero `delta` in place wherever the relu output `Z` equals zero.
+
+    Entries of `delta` whose matching entry in `Z` is non-zero are kept
+    as they are.
+
+    Parameters
+    ----------
+    Z : {array-like, sparse matrix}, shape (n_samples, n_features)
+        The data which was output from the rectified linear units activation
+        function during the forward pass.
+
+    delta : {array-like}, shape (n_samples, n_features)
+         The backpropagated error signal to be modified inplace.
+    """
+    inactive = Z == 0
+    delta[inactive] = 0
+
+
+# Lookup table from activation name to the function that applies that
+# activation's derivative to the error signal in place. There is no
+# "softmax" entry here.
+DERIVATIVES = {
+    "identity": inplace_identity_derivative,
+    "tanh": inplace_tanh_derivative,
+    "logistic": inplace_logistic_derivative,
+    "relu": inplace_relu_derivative,
+}
+
+
+def squared_loss(y_true, y_pred):
+    """Return half of the mean squared difference between targets and predictions.
+
+    Parameters
+    ----------
+    y_true : array-like or label indicator matrix
+        Ground truth (correct) values.
+
+    y_pred : array-like or label indicator matrix
+        Predicted values, as returned by a regression estimator.
+
+    Returns
+    -------
+    loss : float
+        ``mean((y_true - y_pred) ** 2) / 2``, taken over all elements.
+    """
+    residual = y_true - y_pred
+    squared = residual**2
+    return squared.mean() / 2
+
+
+def log_loss(y_true, y_prob):
+    """Compute the logistic (cross-entropy) loss for classification.
+
+    Parameters
+    ----------
+    y_true : array-like or label indicator matrix
+        Ground truth (correct) labels.
+
+    y_prob : array-like of float, shape = (n_samples, n_classes)
+        Predicted probabilities, as returned by a classifier's
+        predict_proba method.
+
+    Returns
+    -------
+    loss : float
+        Negative sum of ``y_true * log(y_prob)`` divided by the number of
+        samples.
+    """
+    # Keep probabilities strictly inside (0, 1) so the logarithm is finite.
+    eps = np.finfo(y_prob.dtype).eps
+    proba = np.clip(y_prob, eps, 1 - eps)
+
+    # A single column is expanded to two columns: (1 - p, p).
+    if proba.shape[1] == 1:
+        proba = np.concatenate((1 - proba, proba), axis=1)
+    if y_true.shape[1] == 1:
+        y_true = np.concatenate((1 - y_true, y_true), axis=1)
+
+    n_samples = proba.shape[0]
+    return -xlogy(y_true, proba).sum() / n_samples
+
+
+def binary_log_loss(y_true, y_prob):
+    """Compute the binary logistic loss for classification.
+
+    This is identical to log_loss in binary classification case,
+    but is kept for its use in multilabel case.
+
+    Parameters
+    ----------
+    y_true : array-like or label indicator matrix
+        Ground truth (correct) labels.
+
+    y_prob : array-like of float, shape = (n_samples, 1)
+        Predicted probabilities, as returned by a classifier's
+        predict_proba method.
+
+    Returns
+    -------
+    loss : float
+        Negative sum of ``y * log(p) + (1 - y) * log(1 - p)`` divided by the
+        number of samples.
+    """
+    # Keep probabilities strictly inside (0, 1) so both logarithms are finite.
+    eps = np.finfo(y_prob.dtype).eps
+    proba = np.clip(y_prob, eps, 1 - eps)
+
+    positive_term = xlogy(y_true, proba).sum()
+    negative_term = xlogy(1 - y_true, 1 - proba).sum()
+    return -(positive_term + negative_term) / proba.shape[0]
+
+
+# Lookup table from loss name to the function that computes that loss.
+LOSS_FUNCTIONS = {
+    "squared_error": squared_loss,
+    "log_loss": log_loss,
+    "binary_log_loss": binary_log_loss,
+}
--- a/.venv/lib/python3.12/site-packages/sklearn/neural_network/_multilayer_perceptron.py
+++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/_multilayer_perceptron.py
--- a/.venv/lib/python3.12/site-packages/sklearn/neural_network/_rbm.py
+++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/_rbm.py
@@ -0,0 +1,455 @@
+"""Restricted Boltzmann Machine"""
+
+# Authors: Yann N. Dauphin <dauphiya@iro.umontreal.ca>
+#          Vlad Niculae
+#          Gabriel Synnaeve
+#          Lars Buitinck
+# License: BSD 3 clause
+
+import time
+from numbers import Integral, Real
+
+import numpy as np
+import scipy.sparse as sp
+from scipy.special import expit  # logistic function
+
+from ..base import (
+    BaseEstimator,
+    ClassNamePrefixFeaturesOutMixin,
+    TransformerMixin,
+    _fit_context,
+)
+from ..utils import check_random_state, gen_even_slices
+from ..utils._param_validation import Interval
+from ..utils.extmath import safe_sparse_dot
+from ..utils.validation import check_is_fitted
+
+
+class BernoulliRBM(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
+    """Bernoulli Restricted Boltzmann Machine (RBM).
+
+    A Restricted Boltzmann Machine with binary visible units and
+    binary hidden units. Parameters are estimated using Stochastic Maximum
+    Likelihood (SML), also known as Persistent Contrastive Divergence (PCD)
+    [2].
+
+    The time complexity of this implementation is ``O(d ** 2)`` assuming
+    d ~ n_features ~ n_components.
+
+    Read more in the :ref:`User Guide <rbm>`.
+
+    Parameters
+    ----------
+    n_components : int, default=256
+        Number of binary hidden units.
+
+    learning_rate : float, default=0.1
+        The learning rate for weight updates. It is *highly* recommended
+        to tune this hyper-parameter. Reasonable values are in the
+        10**[0., -3.] range.
+
+    batch_size : int, default=10
+        Number of examples per minibatch.
+
+    n_iter : int, default=10
+        Number of iterations/sweeps over the training dataset to perform
+        during training.
+
+    verbose : int, default=0
+        The verbosity level. The default, zero, means silent mode. Range
+        of values is [0, inf].
+
+    random_state : int, RandomState instance or None, default=None
+        Determines random number generation for:
+
+        - Gibbs sampling from visible and hidden layers.
+
+        - Initializing components, sampling from layers during fit.
+
+        - Corrupting the data when scoring samples.
+
+        Pass an int for reproducible results across multiple function calls.
+        See :term:`Glossary <random_state>`.
+
+    Attributes
+    ----------
+    intercept_hidden_ : array-like of shape (n_components,)
+        Biases of the hidden units.
+
+    intercept_visible_ : array-like of shape (n_features,)
+        Biases of the visible units.
+
+    components_ : array-like of shape (n_components, n_features)
+        Weight matrix, where `n_features` is the number of
+        visible units and `n_components` is the number of hidden units.
+
+    h_samples_ : array-like of shape (batch_size, n_components)
+        Hidden Activation sampled from the model distribution,
+        where `batch_size` is the number of examples per minibatch and
+        `n_components` is the number of hidden units.
+
+    n_features_in_ : int
+        Number of features seen during :term:`fit`.
+
+        .. versionadded:: 0.24
+
+    feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Names of features seen during :term:`fit`. Defined only when `X`
+        has feature names that are all strings.
+
+        .. versionadded:: 1.0
+
+    See Also
+    --------
+    sklearn.neural_network.MLPRegressor : Multi-layer Perceptron regressor.
+    sklearn.neural_network.MLPClassifier : Multi-layer Perceptron classifier.
+    sklearn.decomposition.PCA : An unsupervised linear dimensionality
+        reduction model.
+
+    References
+    ----------
+
+    [1] Hinton, G. E., Osindero, S. and Teh, Y. A fast learning algorithm for
+        deep belief nets. Neural Computation 18, pp 1527-1554.
+        https://www.cs.toronto.edu/~hinton/absps/fastnc.pdf
+
+    [2] Tieleman, T. Training Restricted Boltzmann Machines using
+        Approximations to the Likelihood Gradient. International Conference
+        on Machine Learning (ICML) 2008
+
+    Examples
+    --------
+
+    >>> import numpy as np
+    >>> from sklearn.neural_network import BernoulliRBM
+    >>> X = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
+    >>> model = BernoulliRBM(n_components=2)
+    >>> model.fit(X)
+    BernoulliRBM(n_components=2)
+
+    For a more detailed example usage, see
+    :ref:`sphx_glr_auto_examples_neural_networks_plot_rbm_logistic_classification.py`.
+    """
+
+    # Declarative constraints for each constructor parameter.
+    # NOTE(review): presumably consumed by the validation machinery behind
+    # `_fit_context`; confirm against `sklearn.utils._param_validation`.
+    _parameter_constraints: dict = {
+        "n_components": [Interval(Integral, 1, None, closed="left")],
+        "learning_rate": [Interval(Real, 0, None, closed="neither")],
+        "batch_size": [Interval(Integral, 1, None, closed="left")],
+        "n_iter": [Interval(Integral, 0, None, closed="left")],
+        "verbose": ["verbose"],
+        "random_state": ["random_state"],
+    }
+
+    def __init__(
+        self,
+        n_components=256,
+        *,
+        learning_rate=0.1,
+        batch_size=10,
+        n_iter=10,
+        verbose=0,
+        random_state=None,
+    ):
+        # Store the hyper-parameters exactly as given; nothing is validated
+        # or computed here.
+        self.n_components = n_components
+        self.learning_rate = learning_rate
+        self.batch_size = batch_size
+        self.n_iter = n_iter
+        self.verbose = verbose
+        self.random_state = random_state
+
+    def transform(self, X):
+        """Return the hidden layer activation probabilities, P(h=1|v=X).
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The data to be transformed.
+
+        Returns
+        -------
+        h : ndarray of shape (n_samples, n_components)
+            Latent representations of the data.
+        """
+        check_is_fitted(self)
+
+        accepted_dtypes = (np.float64, np.float32)
+        X_checked = self._validate_data(
+            X, accept_sparse="csr", reset=False, dtype=accepted_dtypes
+        )
+        return self._mean_hiddens(X_checked)
+
+    def _mean_hiddens(self, v):
+        """Compute the probabilities P(h=1|v).
+
+        Parameters
+        ----------
+        v : ndarray of shape (n_samples, n_features)
+            Values of the visible layer.
+
+        Returns
+        -------
+        h : ndarray of shape (n_samples, n_components)
+            Corresponding mean field values for the hidden layer.
+        """
+        hidden_input = safe_sparse_dot(v, self.components_.T)
+        hidden_input += self.intercept_hidden_
+        # Apply the logistic function in place and hand back the same buffer.
+        expit(hidden_input, out=hidden_input)
+        return hidden_input
+
+    def _sample_hiddens(self, v, rng):
+        """Draw a sample from the distribution P(h|v).
+
+        Parameters
+        ----------
+        v : ndarray of shape (n_samples, n_features)
+            Values of the visible layer to sample from.
+
+        rng : RandomState instance
+            Random number generator to use.
+
+        Returns
+        -------
+        h : ndarray of shape (n_samples, n_components)
+            Values of the hidden layer.
+        """
+        hidden_prob = self._mean_hiddens(v)
+        # A unit is switched on when a uniform draw falls below its probability.
+        noise = rng.uniform(size=hidden_prob.shape)
+        return noise < hidden_prob
+
+    def _sample_visibles(self, h, rng):
+        """Draw a sample from the distribution P(v|h).
+
+        Parameters
+        ----------
+        h : ndarray of shape (n_samples, n_components)
+            Values of the hidden layer to sample from.
+
+        rng : RandomState instance
+            Random number generator to use.
+
+        Returns
+        -------
+        v : ndarray of shape (n_samples, n_features)
+            Values of the visible layer.
+        """
+        visible_prob = np.dot(h, self.components_)
+        visible_prob += self.intercept_visible_
+        expit(visible_prob, out=visible_prob)
+        # A unit is switched on when a uniform draw falls below its probability.
+        noise = rng.uniform(size=visible_prob.shape)
+        return noise < visible_prob
+
+    def _free_energy(self, v):
+        """Compute the free energy F(v) = - log sum_h exp(-E(v,h)).
+
+        Parameters
+        ----------
+        v : ndarray of shape (n_samples, n_features)
+            Values of the visible layer.
+
+        Returns
+        -------
+        free_energy : ndarray of shape (n_samples,)
+            The value of the free energy.
+        """
+        visible_term = -safe_sparse_dot(v, self.intercept_visible_)
+        hidden_input = (
+            safe_sparse_dot(v, self.components_.T) + self.intercept_hidden_
+        )
+        # logaddexp(0, x) is log(1 + exp(x)), summed over the hidden units.
+        hidden_term = np.logaddexp(0, hidden_input).sum(axis=1)
+        return visible_term - hidden_term
+
+    def gibbs(self, v):
+        """Run a single Gibbs sampling step: visible -> hidden -> visible.
+
+        Parameters
+        ----------
+        v : ndarray of shape (n_samples, n_features)
+            Values of the visible layer to start from.
+
+        Returns
+        -------
+        v_new : ndarray of shape (n_samples, n_features)
+            Values of the visible layer after one Gibbs step.
+        """
+        check_is_fitted(self)
+        # Create the generator on first use and keep it on the instance.
+        if not hasattr(self, "random_state_"):
+            self.random_state_ = check_random_state(self.random_state)
+        rng = self.random_state_
+
+        hidden_sample = self._sample_hiddens(v, rng)
+        return self._sample_visibles(hidden_sample, rng)
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def partial_fit(self, X, y=None):
+        """Fit the model to the partial segment of the data X.
+
+        On the first call the weights, both intercepts and the persistent
+        hidden samples are created; later calls reuse whatever is already
+        stored on the estimator.
+
+        Parameters
+        ----------
+        X : ndarray of shape (n_samples, n_features)
+            Training data.
+
+        y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None
+            Target values (None for unsupervised transformations).
+
+        Returns
+        -------
+        self : BernoulliRBM
+            The fitted model.
+        """
+        # `components_` only exists once a fit has happened.
+        first_pass = not hasattr(self, "components_")
+        X = self._validate_data(
+            X, accept_sparse="csr", dtype=np.float64, reset=first_pass
+        )
+        if not hasattr(self, "random_state_"):
+            self.random_state_ = check_random_state(self.random_state)
+        if first_pass:
+            self.components_ = np.asarray(
+                self.random_state_.normal(0, 0.01, (self.n_components, X.shape[1])),
+                order="F",
+            )
+            self._n_features_out = self.components_.shape[0]
+        if not hasattr(self, "intercept_hidden_"):
+            self.intercept_hidden_ = np.zeros(
+                self.n_components,
+            )
+        if not hasattr(self, "intercept_visible_"):
+            self.intercept_visible_ = np.zeros(
+                X.shape[1],
+            )
+        if not hasattr(self, "h_samples_"):
+            self.h_samples_ = np.zeros((self.batch_size, self.n_components))
+
+        self._fit(X, self.random_state_)
+
+        # Return the estimator, as the docstring promises, so calls can be
+        # chained.
+        return self
+
+    def _fit(self, v_pos, rng):
+        """Inner fit for one mini-batch.
+
+        Adjust the parameters to maximize the likelihood of v using
+        Stochastic Maximum Likelihood (SML).
+
+        Updates `components_`, `intercept_hidden_`, `intercept_visible_` and
+        `h_samples_` on the estimator; nothing is returned.
+
+        Parameters
+        ----------
+        v_pos : ndarray of shape (n_samples, n_features)
+            The data to use for training.
+
+        rng : RandomState instance
+            Random number generator to use for sampling.
+        """
+        # Positive phase: hidden probabilities given the data.
+        h_pos = self._mean_hiddens(v_pos)
+        # Negative phase: visible samples drawn from the hidden samples kept
+        # from the previous call, then the hidden probabilities they induce.
+        v_neg = self._sample_visibles(self.h_samples_, rng)
+        h_neg = self._mean_hiddens(v_neg)
+
+        # The step size is divided by the number of samples in this batch.
+        lr = float(self.learning_rate) / v_pos.shape[0]
+        # Weight update: positive statistics minus negative statistics.
+        update = safe_sparse_dot(v_pos.T, h_pos, dense_output=True).T
+        update -= np.dot(h_neg.T, v_neg)
+        self.components_ += lr * update
+        self.intercept_hidden_ += lr * (h_pos.sum(axis=0) - h_neg.sum(axis=0))
+        self.intercept_visible_ += lr * (
+            np.asarray(v_pos.sum(axis=0)).squeeze() - v_neg.sum(axis=0)
+        )
+
+        h_neg[rng.uniform(size=h_neg.shape) < h_neg] = 1.0  # sample binomial
+        # Floor in place: entries set to 1.0 above stay 1, the remaining
+        # values below 1 become 0. The result is kept for the next call.
+        self.h_samples_ = np.floor(h_neg, h_neg)
+
+    def score_samples(self, X):
+        """Compute the pseudo-likelihood of X.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Values of the visible layer. Must be all-boolean (not checked).
+
+        Returns
+        -------
+        pseudo_likelihood : ndarray of shape (n_samples,)
+            Value of the pseudo-likelihood (proxy for likelihood).
+
+        Notes
+        -----
+        This method is not deterministic: it computes a quantity called the
+        free energy on X, then on a randomly corrupted version of X, and
+        returns the log of the logistic function of the difference.
+        """
+        check_is_fitted(self)
+
+        v = self._validate_data(X, accept_sparse="csr", reset=False)
+        # A generator is derived from `random_state` on every call.
+        rng = check_random_state(self.random_state)
+
+        # Randomly corrupt one feature in each sample in v.
+        # `ind` pairs every row index with one randomly drawn column index.
+        ind = (np.arange(v.shape[0]), rng.randint(0, v.shape[1], v.shape[0]))
+        if sp.issparse(v):
+            # -2 * x + 1 is +1 for x == 0 and -1 for x == 1, so adding it to
+            # v flips the selected entries.
+            data = -2 * v[ind] + 1
+            if isinstance(data, np.matrix):  # v is a sparse matrix
+                v_ = v + sp.csr_matrix((data.A.ravel(), ind), shape=v.shape)
+            else:  # v is a sparse array
+                v_ = v + sp.csr_array((data.ravel(), ind), shape=v.shape)
+        else:
+            # Dense input: flip the selected entries on a copy.
+            v_ = v.copy()
+            v_[ind] = 1 - v_[ind]
+
+        fe = self._free_energy(v)
+        fe_ = self._free_energy(v_)
+        # log(expit(x)) = log(1 / (1 + exp(-x))) = -np.logaddexp(0, -x)
+        return -v.shape[1] * np.logaddexp(0, -(fe_ - fe))
+
+    @_fit_context(prefer_skip_nested_validation=True)
+    def fit(self, X, y=None):
+        """Fit the model to the data X.
+
+        All learned attributes are re-initialised, then `n_iter` sweeps over
+        the mini-batches of `X` are performed.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Training data.
+
+        y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None
+            Target values (None for unsupervised transformations).
+
+        Returns
+        -------
+        self : BernoulliRBM
+            The fitted model.
+        """
+        X = self._validate_data(X, accept_sparse="csr", dtype=(np.float64, np.float32))
+        n_samples = X.shape[0]
+        n_features = X.shape[1]
+        rng = check_random_state(self.random_state)
+
+        # Small random weights; intercepts and hidden samples start at zero.
+        initial_weights = rng.normal(0, 0.01, (self.n_components, n_features))
+        self.components_ = np.asarray(initial_weights, order="F", dtype=X.dtype)
+        self._n_features_out = self.components_.shape[0]
+        self.intercept_hidden_ = np.zeros(self.n_components, dtype=X.dtype)
+        self.intercept_visible_ = np.zeros(n_features, dtype=X.dtype)
+        self.h_samples_ = np.zeros((self.batch_size, self.n_components), dtype=X.dtype)
+
+        n_batches = int(np.ceil(float(n_samples) / self.batch_size))
+        slices_gen = gen_even_slices(
+            n_batches * self.batch_size, n_batches, n_samples=n_samples
+        )
+        batch_slices = list(slices_gen)
+
+        verbose = self.verbose
+        begin = time.time()
+        for iteration in range(1, self.n_iter + 1):
+            for batch_slice in batch_slices:
+                self._fit(X[batch_slice], rng)
+
+            if not verbose:
+                continue
+
+            # Report the time spent on this sweep and the current score.
+            end = time.time()
+            report = (
+                type(self).__name__,
+                iteration,
+                self.score_samples(X).mean(),
+                end - begin,
+            )
+            print(
+                "[%s] Iteration %d, pseudo-likelihood = %.2f, time = %.2fs" % report
+            )
+            begin = end
+
+        return self
+
+    def _more_tags(self):
+        # Estimator tags: the checks listed as expected failures (with the
+        # stated reason) and the dtypes declared as preserved.
+        expected_failures = {
+            "check_methods_subset_invariance": (
+                "fails for the decision_function method"
+            ),
+            "check_methods_sample_order_invariance": (
+                "fails for the score_samples method"
+            ),
+        }
+        tags = {
+            "_xfail_checks": expected_failures,
+            "preserves_dtype": [np.float64, np.float32],
+        }
+        return tags
--- a/.venv/lib/python3.12/site-packages/sklearn/neural_network/_stochastic_optimizers.py
+++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/_stochastic_optimizers.py
@@ -0,0 +1,287 @@
+"""Stochastic optimization methods for MLP"""
+
+# Authors: Jiyuan Qian <jq401@nyu.edu>
+# License: BSD 3 clause
+
+import numpy as np
+
+
+class BaseOptimizer:
+    """Base (Stochastic) gradient descent optimizer
+
+    Subclasses provide ``_get_updates(grads)``, which `update_params` calls
+    to obtain the values added to the parameters.
+
+    Parameters
+    ----------
+    learning_rate_init : float, default=0.1
+        The initial learning rate used. It controls the step-size in updating
+        the weights
+
+    Attributes
+    ----------
+    learning_rate : float
+        the current learning rate
+    """
+
+    def __init__(self, learning_rate_init=0.1):
+        self.learning_rate_init = learning_rate_init
+        self.learning_rate = float(learning_rate_init)
+
+    def update_params(self, params, grads):
+        """Update parameters with given gradients
+
+        Parameters
+        ----------
+        params : list of length = len(coefs_) + len(intercepts_)
+            The concatenated list containing coefs_ and intercepts_ in MLP
+            model. Used for initializing velocities and updating params
+
+        grads : list of length = len(params)
+            Containing gradients with respect to coefs_ and intercepts_ in MLP
+            model. So length should be aligned with params
+        """
+        updates = self._get_updates(grads)
+        # `+=` updates array parameters in place, so the arrays held by the
+        # caller change as well.
+        for param, update in zip(params, updates):
+            param += update
+
+    def iteration_ends(self, time_step):
+        """Perform update to learning rate and potentially other states at the
+        end of an iteration
+
+        The base implementation does nothing.
+        """
+        pass
+
+    def trigger_stopping(self, msg, verbose):
+        """Decides whether it is time to stop training
+
+        The base implementation always stops.
+
+        Parameters
+        ----------
+        msg : str
+            Message passed in for verbose output
+
+        verbose : bool
+            Print message to stdout if True
+
+        Returns
+        -------
+        is_stopping : bool
+            True if training needs to stop
+        """
+        if verbose:
+            print(msg + " Stopping.")
+        return True
+
+
+class SGDOptimizer(BaseOptimizer):
+    """Stochastic gradient descent optimizer with momentum
+
+    Parameters
+    ----------
+    params : list, length = len(coefs_) + len(intercepts_)
+        The concatenated list containing coefs_ and intercepts_ in MLP model.
+        Used for initializing velocities and updating params
+
+    learning_rate_init : float, default=0.1
+        The initial learning rate used. It controls the step-size in updating
+        the weights
+
+    lr_schedule : {'constant', 'adaptive', 'invscaling'}, default='constant'
+        Learning rate schedule for weight updates.
+
+        -'constant', is a constant learning rate given by
+         'learning_rate_init'.
+
+        -'invscaling' gradually decreases the learning rate 'learning_rate_' at
+          each time step 't' using an inverse scaling exponent of 'power_t'.
+          learning_rate_ = learning_rate_init / pow(t, power_t)
+
+        -'adaptive', keeps the learning rate constant to
+         'learning_rate_init' as long as the training keeps decreasing.
+         Each time 2 consecutive epochs fail to decrease the training loss by
+         tol, or fail to increase validation score by tol if 'early_stopping'
+         is on, the current learning rate is divided by 5.
+
+    momentum : float, default=0.9
+        Value of momentum used, must be larger than or equal to 0
+
+    nesterov : bool, default=True
+        Whether to use nesterov's momentum or not. Use nesterov's if True
+
+    power_t : float, default=0.5
+        Power of time step 't' in inverse scaling. See `lr_schedule` for
+        more details.
+
+    Attributes
+    ----------
+    learning_rate : float
+        the current learning rate
+
+    velocities : list, length = len(params)
+        velocities that are used to update params
+    """
+
+    def __init__(
+        self,
+        params,
+        learning_rate_init=0.1,
+        lr_schedule="constant",
+        momentum=0.9,
+        nesterov=True,
+        power_t=0.5,
+    ):
+        super().__init__(learning_rate_init)
+
+        self.lr_schedule = lr_schedule
+        self.momentum = momentum
+        self.nesterov = nesterov
+        self.power_t = power_t
+        # One zero-filled velocity per parameter.
+        self.velocities = list(map(np.zeros_like, params))
+
+    def iteration_ends(self, time_step):
+        """Refresh the learning rate at the end of an iteration.
+
+        Only the 'invscaling' schedule changes anything here.
+
+        Parameters
+        ----------
+        time_step : int
+            number of training samples trained on so far, used to update
+            learning rate for 'invscaling'
+        """
+        if self.lr_schedule != "invscaling":
+            return
+        decay = (time_step + 1) ** self.power_t
+        self.learning_rate = float(self.learning_rate_init) / decay
+
+    def trigger_stopping(self, msg, verbose):
+        """Decide whether training stops or the learning rate is reduced.
+
+        With the 'adaptive' schedule and a learning rate above 1e-6, the
+        learning rate is divided by 5 and training continues. In every other
+        case training stops.
+
+        Parameters
+        ----------
+        msg : str
+            Message passed in for verbose output
+
+        verbose : bool
+            Print message to stdout if True
+
+        Returns
+        -------
+        is_stopping : bool
+            True if training needs to stop
+        """
+        adaptive = self.lr_schedule == "adaptive"
+        too_small = adaptive and self.learning_rate <= 1e-6
+
+        if adaptive and not too_small:
+            self.learning_rate /= 5.0
+            if verbose:
+                print(msg + " Setting learning rate to %f" % self.learning_rate)
+            return False
+
+        if verbose:
+            if too_small:
+                print(msg + " Learning rate too small. Stopping.")
+            else:
+                print(msg + " Stopping.")
+        return True
+
+    def _get_updates(self, grads):
+        """Get the values used to update params with given gradients
+
+        Parameters
+        ----------
+        grads : list, length = len(coefs_) + len(intercepts_)
+            Containing gradients with respect to coefs_ and intercepts_ in MLP
+            model. So length should be aligned with params
+
+        Returns
+        -------
+        updates : list, length = len(grads)
+            The values to add to params
+        """
+        # The momentum step becomes the new velocities.
+        self.velocities = [
+            self.momentum * velocity - self.learning_rate * grad
+            for velocity, grad in zip(self.velocities, grads)
+        ]
+        if not self.nesterov:
+            return self.velocities
+
+        # Nesterov: apply the same step once more from the new velocities.
+        return [
+            self.momentum * velocity - self.learning_rate * grad
+            for velocity, grad in zip(self.velocities, grads)
+        ]
+
+
+class AdamOptimizer(BaseOptimizer):
+    """Stochastic gradient descent optimizer with Adam
+
+    Note: All default values are from the original Adam paper
+
+    Parameters
+    ----------
+    params : list, length = len(coefs_) + len(intercepts_)
+        The concatenated list containing coefs_ and intercepts_ in MLP model.
+        Used for initializing velocities and updating params
+
+    learning_rate_init : float, default=0.001
+        The initial learning rate used. It controls the step-size in updating
+        the weights
+
+    beta_1 : float, default=0.9
+        Exponential decay rate for estimates of first moment vector, should be
+        in [0, 1)
+
+    beta_2 : float, default=0.999
+        Exponential decay rate for estimates of second moment vector, should be
+        in [0, 1)
+
+    epsilon : float, default=1e-8
+        Value for numerical stability
+
+    Attributes
+    ----------
+    learning_rate : float
+        The current learning rate
+
+    t : int
+        Timestep
+
+    ms : list, length = len(params)
+        First moment vectors
+
+    vs : list, length = len(params)
+        Second moment vectors
+
+    References
+    ----------
+    :arxiv:`Kingma, Diederik, and Jimmy Ba (2014) "Adam: A method for
+    stochastic optimization." <1412.6980>`
+    """
+
+    def __init__(
+        self, params, learning_rate_init=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8
+    ):
+        super().__init__(learning_rate_init)
+
+        self.beta_1 = beta_1
+        self.beta_2 = beta_2
+        self.epsilon = epsilon
+        self.t = 0
+        # Zero-filled first and second moment estimates, one per parameter.
+        self.ms = list(map(np.zeros_like, params))
+        self.vs = list(map(np.zeros_like, params))
+
+    def _get_updates(self, grads):
+        """Get the values used to update params with given gradients
+
+        Parameters
+        ----------
+        grads : list, length = len(coefs_) + len(intercepts_)
+            Containing gradients with respect to coefs_ and intercepts_ in MLP
+            model. So length should be aligned with params
+
+        Returns
+        -------
+        updates : list, length = len(grads)
+            The values to add to params
+        """
+        self.t += 1
+        beta_1, beta_2 = self.beta_1, self.beta_2
+
+        # Exponential moving averages of the gradient and its square.
+        self.ms = [beta_1 * m + (1 - beta_1) * grad for m, grad in zip(self.ms, grads)]
+        self.vs = [
+            beta_2 * v + (1 - beta_2) * (grad**2) for v, grad in zip(self.vs, grads)
+        ]
+
+        # Step size with both bias-correction factors folded in.
+        scaled_init = self.learning_rate_init * np.sqrt(1 - beta_2**self.t)
+        self.learning_rate = scaled_init / (1 - beta_1**self.t)
+
+        step = -self.learning_rate
+        return [
+            step * m / (np.sqrt(v) + self.epsilon) for m, v in zip(self.ms, self.vs)
+        ]
--- a/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/__init__.py
+++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/__init__.py
--- a/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/__pycache__/__init__.cpython-312.pyc
+++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/__pycache__/__init__.cpython-312.pyc
--- a/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/__pycache__/test_base.cpython-312.pyc
+++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/__pycache__/test_base.cpython-312.pyc
--- a/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/__pycache__/test_mlp.cpython-312.pyc
+++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/__pycache__/test_mlp.cpython-312.pyc
--- a/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/__pycache__/test_rbm.cpython-312.pyc
+++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/__pycache__/test_rbm.cpython-312.pyc
--- a/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/__pycache__/test_stochastic_optimizers.cpython-312.pyc
+++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/__pycache__/test_stochastic_optimizers.cpython-312.pyc
--- a/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_base.py
+++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_base.py
@@ -0,0 +1,29 @@
+import numpy as np
+import pytest
+
+from sklearn.neural_network._base import binary_log_loss, log_loss
+
+
+def test_binary_log_loss_1_prob_finite():
+    # y_proba is equal to one should result in a finite logloss
+    y_true = np.array([[0, 0, 1]]).T
+    y_prob = np.array([[0.9, 1.0, 1.0]]).T
+
+    loss = binary_log_loss(y_true, y_prob)
+    assert np.isfinite(loss)
+
+
+@pytest.mark.parametrize(
+    "y_true, y_prob",
+    [
+        (
+            np.array([[1, 0, 0], [0, 1, 0]]),
+            np.array([[0.0, 1.0, 0.0], [0.9, 0.05, 0.05]]),
+        ),
+        (np.array([[0, 0, 1]]).T, np.array([[0.9, 1.0, 1.0]]).T),
+    ],
+)
+def test_log_loss_1_prob_finite(y_true, y_prob):
+    # y_proba is equal to 1 should result in a finite logloss
+    loss = log_loss(y_true, y_prob)
+    assert np.isfinite(loss)
--- a/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_mlp.py
+++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_mlp.py
@@ -0,0 +1,968 @@
+"""
+Testing for Multi-layer Perceptron module (sklearn.neural_network)
+"""
+
+# Author: Issam H. Laradji
+# License: BSD 3 clause
+
+import re
+import sys
+import warnings
+from io import StringIO
+
+import joblib
+import numpy as np
+import pytest
+from numpy.testing import (
+    assert_allclose,
+    assert_almost_equal,
+    assert_array_equal,
+)
+
+from sklearn.datasets import (
+    load_digits,
+    load_iris,
+    make_multilabel_classification,
+    make_regression,
+)
+from sklearn.exceptions import ConvergenceWarning
+from sklearn.metrics import roc_auc_score
+from sklearn.neural_network import MLPClassifier, MLPRegressor
+from sklearn.preprocessing import LabelBinarizer, MinMaxScaler, scale
+from sklearn.utils._testing import ignore_warnings
+from sklearn.utils.fixes import CSR_CONTAINERS
+
+# Activation names that the loop-based tests below iterate over.
+ACTIVATION_TYPES = ["identity", "logistic", "tanh", "relu"]
+
+# Three-class digits data; only the first 200 samples are kept (min-max scaled).
+X_digits, y_digits = load_digits(n_class=3, return_X_y=True)
+
+X_digits_multi = MinMaxScaler().fit_transform(X_digits[:200])
+y_digits_multi = y_digits[:200]
+
+# NOTE: this rebinds X_digits / y_digits to the two-class data, so any test that
+# reads the unscaled X_digits / y_digits afterwards sees the binary problem.
+X_digits, y_digits = load_digits(n_class=2, return_X_y=True)
+
+X_digits_binary = MinMaxScaler().fit_transform(X_digits[:200])
+y_digits_binary = y_digits[:200]
+
+# (X, y) pairs used to parametrize the classification tests.
+classification_datasets = [
+    (X_digits_multi, y_digits_multi),
+    (X_digits_binary, y_digits_binary),
+]
+
+# Synthetic regression problem with a fixed seed; the target is standardised
+# with `scale` before use.
+X_reg, y_reg = make_regression(
+    n_samples=200, n_features=10, bias=20.0, noise=100.0, random_state=7
+)
+y_reg = scale(y_reg)
+regression_datasets = [(X_reg, y_reg)]
+
+# Iris data, used unscaled by the warm-start tests.
+iris = load_iris()
+
+X_iris = iris.data
+y_iris = iris.target
+
+
+def test_alpha():
+    # Test that larger alpha yields weights closer to zero
+    X = X_digits_binary[:100]
+    y = y_digits_binary[:100]
+
+    alpha_vectors = []
+    alpha_values = np.arange(2)
+    absolute_sum = lambda x: np.sum(np.abs(x))
+
+    for alpha in alpha_values:
+        mlp = MLPClassifier(hidden_layer_sizes=10, alpha=alpha, random_state=1)
+        with ignore_warnings(category=ConvergenceWarning):
+            mlp.fit(X, y)
+        alpha_vectors.append(
+            np.array([absolute_sum(mlp.coefs_[0]), absolute_sum(mlp.coefs_[1])])
+        )
+
+    for i in range(len(alpha_values) - 1):
+        assert (alpha_vectors[i] > alpha_vectors[i + 1]).all()
+
+
+def test_fit():
+    # Test that the algorithm solution is equal to a worked out example.
+    X = np.array([[0.6, 0.8, 0.7]])
+    y = np.array([0])
+    mlp = MLPClassifier(
+        solver="sgd",
+        learning_rate_init=0.1,
+        alpha=0.1,
+        activation="logistic",
+        random_state=1,
+        max_iter=1,
+        hidden_layer_sizes=2,
+        momentum=0,
+    )
+    # set weights
+    mlp.coefs_ = [0] * 2
+    mlp.intercepts_ = [0] * 2
+    mlp.n_outputs_ = 1
+    mlp.coefs_[0] = np.array([[0.1, 0.2], [0.3, 0.1], [0.5, 0]])
+    mlp.coefs_[1] = np.array([[0.1], [0.2]])
+    mlp.intercepts_[0] = np.array([0.1, 0.1])
+    mlp.intercepts_[1] = np.array([1.0])
+    mlp._coef_grads = [] * 2
+    mlp._intercept_grads = [] * 2
+    mlp.n_features_in_ = 3
+
+    # Initialize parameters
+    mlp.n_iter_ = 0
+    mlp.learning_rate_ = 0.1
+
+    # Compute the number of layers
+    mlp.n_layers_ = 3
+
+    # Pre-allocate gradient matrices
+    mlp._coef_grads = [0] * (mlp.n_layers_ - 1)
+    mlp._intercept_grads = [0] * (mlp.n_layers_ - 1)
+
+    mlp.out_activation_ = "logistic"
+    mlp.t_ = 0
+    mlp.best_loss_ = np.inf
+    mlp.loss_curve_ = []
+    mlp._no_improvement_count = 0
+    mlp._intercept_velocity = [
+        np.zeros_like(intercepts) for intercepts in mlp.intercepts_
+    ]
+    mlp._coef_velocity = [np.zeros_like(coefs) for coefs in mlp.coefs_]
+
+    mlp.partial_fit(X, y, classes=[0, 1])
+    # Manually worked out example
+    # h1 = g(X1 * W_i1 + b11) = g(0.6 * 0.1 + 0.8 * 0.3 + 0.7 * 0.5 + 0.1)
+    #       =  0.679178699175393
+    # h2 = g(X2 * W_i2 + b12) = g(0.6 * 0.2 + 0.8 * 0.1 + 0.7 * 0 + 0.1)
+    #         = 0.574442516811659
+    # o1 = g(h * W2 + b21) = g(0.679 * 0.1 + 0.574 * 0.2 + 1)
+    #       = 0.7654329236196236
+    # d21 = -(0 - 0.765) = 0.765
+    # d11 = (1 - 0.679) * 0.679 * 0.765 * 0.1 = 0.01667
+    # d12 = (1 - 0.574) * 0.574 * 0.765 * 0.2 = 0.0374
+    # W1grad11 = X1 * d11 + alpha * W11 = 0.6 * 0.01667 + 0.1 * 0.1 = 0.0200
+    # W1grad11 = X1 * d12 + alpha * W12 = 0.6 * 0.0374 + 0.1 * 0.2 = 0.04244
+    # W1grad21 = X2 * d11 + alpha * W13 = 0.8 * 0.01667 + 0.1 * 0.3 = 0.043336
+    # W1grad22 = X2 * d12 + alpha * W14 = 0.8 * 0.0374 + 0.1 * 0.1 = 0.03992
+    # W1grad31 = X3 * d11 + alpha * W15 = 0.6 * 0.01667 + 0.1 * 0.5 = 0.060002
+    # W1grad32 = X3 * d12 + alpha * W16 = 0.6 * 0.0374 + 0.1 * 0 = 0.02244
+    # W2grad1 = h1 * d21 + alpha * W21 = 0.679 * 0.765 + 0.1 * 0.1 = 0.5294
+    # W2grad2 = h2 * d21 + alpha * W22 = 0.574 * 0.765 + 0.1 * 0.2 = 0.45911
+    # b1grad1 = d11 = 0.01667
+    # b1grad2 = d12 = 0.0374
+    # b2grad = d21 = 0.765
+    # W1 = W1 - eta * [W1grad11, .., W1grad32] = [[0.1, 0.2], [0.3, 0.1],
+    #          [0.5, 0]] - 0.1 * [[0.0200, 0.04244], [0.043336, 0.03992],
+    #          [0.060002, 0.02244]] = [[0.098, 0.195756], [0.2956664,
+    #          0.096008], [0.4939998, -0.002244]]
+    # W2 = W2 - eta * [W2grad1, W2grad2] = [[0.1], [0.2]] - 0.1 *
+    #        [[0.5294], [0.45911]] = [[0.04706], [0.154089]]
+    # b1 = b1 - eta * [b1grad1, b1grad2] = 0.1 - 0.1 * [0.01667, 0.0374]
+    #         = [0.098333, 0.09626]
+    # b2 = b2 - eta * b2grad = 1.0 - 0.1 * 0.765 = 0.9235
+    assert_almost_equal(
+        mlp.coefs_[0],
+        np.array([[0.098, 0.195756], [0.2956664, 0.096008], [0.4939998, -0.002244]]),
+        decimal=3,
+    )
+    assert_almost_equal(mlp.coefs_[1], np.array([[0.04706], [0.154089]]), decimal=3)
+    assert_almost_equal(mlp.intercepts_[0], np.array([0.098333, 0.09626]), decimal=3)
+    assert_almost_equal(mlp.intercepts_[1], np.array(0.9235), decimal=3)
+    # Testing output
+    #  h1 = g(X1 * W_i1 + b11) = g(0.6 * 0.098 + 0.8 * 0.2956664 +
+    #               0.7 * 0.4939998 + 0.098333) = 0.677
+    #  h2 = g(X2 * W_i2 + b12) = g(0.6 * 0.195756 + 0.8 * 0.096008 +
+    #            0.7 * -0.002244 + 0.09626) = 0.572
+    #  o1 = h * W2 + b21 = 0.677 * 0.04706 +
+    #             0.572 * 0.154089 + 0.9235 = 1.043
+    #  prob = sigmoid(o1) = 0.739
+    assert_almost_equal(mlp.predict_proba(X)[0, 1], 0.739, decimal=3)
+
+
+def test_gradient():
+    # Test gradient.
+
+    # This makes sure that the activation functions and their derivatives
+    # are correct. The numerical and analytical computation of the gradient
+    # should be close.
+    for n_labels in [2, 3]:
+        n_samples = 5
+        n_features = 10
+        random_state = np.random.RandomState(seed=42)
+        X = random_state.rand(n_samples, n_features)
+        # Labels cycle through 1..n_labels so that every class is present.
+        y = 1 + np.mod(np.arange(n_samples) + 1, n_labels)
+        Y = LabelBinarizer().fit_transform(y)
+
+        for activation in ACTIVATION_TYPES:
+            mlp = MLPClassifier(
+                activation=activation,
+                hidden_layer_sizes=10,
+                solver="lbfgs",
+                alpha=1e-5,
+                learning_rate_init=0.2,
+                max_iter=1,
+                random_state=1,
+            )
+            mlp.fit(X, y)
+
+            # Flatten all weight matrices followed by all intercept vectors
+            # into a single parameter vector.
+            theta = np.hstack([l.ravel() for l in mlp.coefs_ + mlp.intercepts_])
+
+            layer_units = [X.shape[1]] + [mlp.hidden_layer_sizes] + [mlp.n_outputs_]
+
+            # Work buffers handed to `_loss_grad_lbfgs`; their contents are
+            # uninitialised (np.empty) until that call fills them.
+            activations = []
+            deltas = []
+            coef_grads = []
+            intercept_grads = []
+
+            activations.append(X)
+            for i in range(mlp.n_layers_ - 1):
+                activations.append(np.empty((X.shape[0], layer_units[i + 1])))
+                deltas.append(np.empty((X.shape[0], layer_units[i + 1])))
+
+                fan_in = layer_units[i]
+                fan_out = layer_units[i + 1]
+                coef_grads.append(np.empty((fan_in, fan_out)))
+                intercept_grads.append(np.empty(fan_out))
+
+            # analytically compute the gradients
+            def loss_grad_fun(t):
+                return mlp._loss_grad_lbfgs(
+                    t, X, Y, activations, deltas, coef_grads, intercept_grads
+                )
+
+            # `value` (the loss) is not used below; only `grad` is compared.
+            [value, grad] = loss_grad_fun(theta)
+            numgrad = np.zeros(np.size(theta))
+            n = np.size(theta, 0)
+            E = np.eye(n)
+            epsilon = 1e-5
+            # numerically compute the gradients
+            # (central difference along each coordinate of theta)
+            for i in range(n):
+                dtheta = E[:, i] * epsilon
+                numgrad[i] = (
+                    loss_grad_fun(theta + dtheta)[0] - loss_grad_fun(theta - dtheta)[0]
+                ) / (epsilon * 2.0)
+            assert_almost_equal(numgrad, grad)
+
+
+@pytest.mark.parametrize("X,y", classification_datasets)
+def test_lbfgs_classification(X, y):
+    # Test lbfgs on classification.
+    # It should achieve a score higher than 0.95 for the binary and multi-class
+    # versions of the digits dataset.
+    # Note: the score is measured on the training split; the held-out split is
+    # only used to check the shape and dtype kind of the predictions.
+    X_train = X[:150]
+    y_train = y[:150]
+    X_test = X[150:]
+    expected_shape_dtype = (X_test.shape[0], y_train.dtype.kind)
+
+    for activation in ACTIVATION_TYPES:
+        mlp = MLPClassifier(
+            solver="lbfgs",
+            hidden_layer_sizes=50,
+            max_iter=150,
+            shuffle=True,
+            random_state=1,
+            activation=activation,
+        )
+        mlp.fit(X_train, y_train)
+        y_predict = mlp.predict(X_test)
+        assert mlp.score(X_train, y_train) > 0.95
+        assert (y_predict.shape[0], y_predict.dtype.kind) == expected_shape_dtype
+
+
+@pytest.mark.parametrize("X,y", regression_datasets)
+def test_lbfgs_regression(X, y):
+    # Test lbfgs on the regression dataset.
+    # The score is computed on the same data the model was fitted on.
+    for activation in ACTIVATION_TYPES:
+        mlp = MLPRegressor(
+            solver="lbfgs",
+            hidden_layer_sizes=50,
+            max_iter=150,
+            shuffle=True,
+            random_state=1,
+            activation=activation,
+        )
+        mlp.fit(X, y)
+        if activation == "identity":
+            # A purely linear network is held to a lower threshold.
+            assert mlp.score(X, y) > 0.80
+        else:
+            # Non linear models perform much better than linear bottleneck:
+            assert mlp.score(X, y) > 0.98
+
+
+@pytest.mark.parametrize("X,y", classification_datasets)
+def test_lbfgs_classification_maxfun(X, y):
+    # Test lbfgs parameter max_fun.
+    # It should independently limit the number of iterations for lbfgs.
+    # max_fun (10) is far below max_iter (150), so the fit is expected to stop
+    # early and emit a ConvergenceWarning.
+    max_fun = 10
+    # classification tests
+    for activation in ACTIVATION_TYPES:
+        mlp = MLPClassifier(
+            solver="lbfgs",
+            hidden_layer_sizes=50,
+            max_iter=150,
+            max_fun=max_fun,
+            shuffle=True,
+            random_state=1,
+            activation=activation,
+        )
+        with pytest.warns(ConvergenceWarning):
+            mlp.fit(X, y)
+            assert max_fun >= mlp.n_iter_
+
+
+@pytest.mark.parametrize("X,y", regression_datasets)
+def test_lbfgs_regression_maxfun(X, y):
+    # Test lbfgs parameter max_fun.
+    # It should independently limit the number of iterations for lbfgs.
+    # tol=0.0 is passed here, unlike in the classification variant; presumably
+    # this prevents the solver from converging before max_fun is reached.
+    max_fun = 10
+    # regression tests
+    for activation in ACTIVATION_TYPES:
+        mlp = MLPRegressor(
+            solver="lbfgs",
+            hidden_layer_sizes=50,
+            tol=0.0,
+            max_iter=150,
+            max_fun=max_fun,
+            shuffle=True,
+            random_state=1,
+            activation=activation,
+        )
+        with pytest.warns(ConvergenceWarning):
+            mlp.fit(X, y)
+            assert max_fun >= mlp.n_iter_
+
+
+def test_learning_rate_warmstart():
+    # Tests that warm_start reuse past solutions.
+    # The optimizer's learning rate is read after the first and after the
+    # second (warm-started) fit and compared per learning-rate schedule.
+    X = [[3, 2], [1, 6], [5, 6], [-2, -4]]
+    y = [1, 1, 1, 0]
+    for learning_rate in ["invscaling", "constant"]:
+        mlp = MLPClassifier(
+            solver="sgd",
+            hidden_layer_sizes=4,
+            learning_rate=learning_rate,
+            max_iter=1,
+            power_t=0.25,
+            warm_start=True,
+        )
+        with ignore_warnings(category=ConvergenceWarning):
+            mlp.fit(X, y)
+            prev_eta = mlp._optimizer.learning_rate
+            mlp.fit(X, y)
+            post_eta = mlp._optimizer.learning_rate
+
+        if learning_rate == "constant":
+            assert prev_eta == post_eta
+        elif learning_rate == "invscaling":
+            # NOTE(review): the 8 presumably is the number of samples seen
+            # (2 fits x 1 epoch x 4 samples) - confirm against the optimizer.
+            assert mlp.learning_rate_init / pow(8 + 1, mlp.power_t) == post_eta
+
+
+def test_multilabel_classification():
+    # Test that multi-label classification works as expected.
+    # test fit method
+    X, y = make_multilabel_classification(
+        n_samples=50, random_state=0, return_indicator=True
+    )
+    mlp = MLPClassifier(
+        solver="lbfgs",
+        hidden_layer_sizes=50,
+        alpha=1e-5,
+        max_iter=150,
+        random_state=0,
+        activation="logistic",
+        learning_rate_init=0.2,
+    )
+    mlp.fit(X, y)
+    assert mlp.score(X, y) > 0.97
+
+    # test partial fit method
+    mlp = MLPClassifier(
+        solver="sgd",
+        hidden_layer_sizes=50,
+        max_iter=150,
+        random_state=0,
+        activation="logistic",
+        alpha=1e-5,
+        learning_rate_init=0.2,
+    )
+    # 100 incremental passes over the same data; the loop index is unused.
+    for i in range(100):
+        mlp.partial_fit(X, y, classes=[0, 1, 2, 3, 4])
+    assert mlp.score(X, y) > 0.9
+
+    # Make sure early stopping still work now that splitting is stratified by
+    # default (it is disabled for multilabel classification)
+    # Only the absence of an exception is checked here.
+    mlp = MLPClassifier(early_stopping=True)
+    mlp.fit(X, y).predict(X)
+
+
+def test_multioutput_regression():
+    # Test that multi-output regression works as expected
+    # NOTE(review): make_regression is called without random_state, so the
+    # data differ between runs; only the estimator itself is seeded.
+    X, y = make_regression(n_samples=200, n_targets=5)
+    mlp = MLPRegressor(
+        solver="lbfgs", hidden_layer_sizes=50, max_iter=200, random_state=1
+    )
+    mlp.fit(X, y)
+    # Score is measured on the training data.
+    assert mlp.score(X, y) > 0.9
+
+
+def test_partial_fit_classes_error():
+    # Tests that passing different classes to partial_fit raises an error
+    X = [[3, 2]]
+    y = [0]
+    clf = MLPClassifier(solver="sgd")
+    clf.partial_fit(X, y, classes=[0, 1])
+    with pytest.raises(ValueError):
+        clf.partial_fit(X, y, classes=[1, 2])
+
+
+def test_partial_fit_classification():
+    # Test partial_fit on classification.
+    # `partial_fit` should yield the same results as 'fit' for binary and
+    # multi-class classification.
+    for X, y in classification_datasets:
+        # Reference model: a single fit() call limited to 100 iterations, with
+        # tol=0.
+        mlp = MLPClassifier(
+            solver="sgd",
+            max_iter=100,
+            random_state=1,
+            tol=0,
+            alpha=1e-5,
+            learning_rate_init=0.2,
+        )
+
+        with ignore_warnings(category=ConvergenceWarning):
+            mlp.fit(X, y)
+        pred1 = mlp.predict(X)
+        # Same seed and hyper-parameters, trained through 100 partial_fit calls.
+        mlp = MLPClassifier(
+            solver="sgd", random_state=1, alpha=1e-5, learning_rate_init=0.2
+        )
+        for i in range(100):
+            mlp.partial_fit(X, y, classes=np.unique(y))
+        pred2 = mlp.predict(X)
+        assert_array_equal(pred1, pred2)
+        assert mlp.score(X, y) > 0.95
+
+
+def test_partial_fit_unseen_classes():
+    # Non regression test for bug 6994
+    # Tests for labeling errors in partial fit
+
+    clf = MLPClassifier(random_state=0)
+    clf.partial_fit([[1], [2], [3]], ["a", "b", "c"], classes=["a", "b", "c", "d"])
+    clf.partial_fit([[4]], ["d"])
+    assert clf.score([[1], [2], [3], [4]], ["a", "b", "c", "d"]) > 0
+
+
+def test_partial_fit_regression():
+    # Test partial_fit on regression.
+    # `partial_fit` should yield the same results as 'fit' for regression.
+    X = X_reg
+    y = y_reg
+
+    # Checked both without momentum and with momentum=0.9.
+    for momentum in [0, 0.9]:
+        # batch_size equals the number of samples, i.e. one batch per epoch.
+        mlp = MLPRegressor(
+            solver="sgd",
+            max_iter=100,
+            activation="relu",
+            random_state=1,
+            learning_rate_init=0.01,
+            batch_size=X.shape[0],
+            momentum=momentum,
+        )
+        with warnings.catch_warnings(record=True):
+            # catch convergence warning
+            mlp.fit(X, y)
+        pred1 = mlp.predict(X)
+        mlp = MLPRegressor(
+            solver="sgd",
+            activation="relu",
+            learning_rate_init=0.01,
+            random_state=1,
+            batch_size=X.shape[0],
+            momentum=momentum,
+        )
+        for i in range(100):
+            mlp.partial_fit(X, y)
+
+        pred2 = mlp.predict(X)
+        assert_allclose(pred1, pred2)
+        score = mlp.score(X, y)
+        assert score > 0.65
+
+
+def test_partial_fit_errors():
+    # Test partial_fit error handling.
+    X = [[3, 2], [1, 6]]
+    y = [1, 0]
+
+    # `classes` ([2]) does not contain the labels present in y ([1, 0])
+    with pytest.raises(ValueError):
+        MLPClassifier(solver="sgd").partial_fit(X, y, classes=[2])
+
+    # lbfgs doesn't support partial_fit
+    assert not hasattr(MLPClassifier(solver="lbfgs"), "partial_fit")
+
+
+def test_nonfinite_params():
+    # Check that MLPRegressor throws ValueError when dealing with non-finite
+    # parameter values
+    rng = np.random.RandomState(0)
+    n_samples = 10
+    # Features are scaled by the largest finite float64 so that training
+    # overflows.
+    fmax = np.finfo(np.float64).max
+    X = fmax * rng.uniform(size=(n_samples, 2))
+    y = rng.standard_normal(size=n_samples)
+
+    clf = MLPRegressor()
+    # `msg` is used as a regular expression by pytest.raises(match=...); the
+    # "." characters therefore match any character, including a literal dot.
+    msg = (
+        "Solver produced non-finite parameter weights. The input data may contain large"
+        " values and need to be preprocessed."
+    )
+    with pytest.raises(ValueError, match=msg):
+        clf.fit(X, y)
+
+
+def test_predict_proba_binary():
+    # Test that predict_proba works as expected for binary class.
+    X = X_digits_binary[:50]
+    y = y_digits_binary[:50]
+
+    clf = MLPClassifier(hidden_layer_sizes=5, activation="logistic", random_state=1)
+    with ignore_warnings(category=ConvergenceWarning):
+        clf.fit(X, y)
+    y_proba = clf.predict_proba(X)
+    y_log_proba = clf.predict_log_proba(X)
+
+    (n_samples, n_classes) = y.shape[0], 2
+
+    proba_max = y_proba.argmax(axis=1)
+    proba_log_max = y_log_proba.argmax(axis=1)
+
+    # One column per class, and log-probabilities consistent with
+    # probabilities (same argmax, same values after np.log).
+    assert y_proba.shape == (n_samples, n_classes)
+    assert_array_equal(proba_max, proba_log_max)
+    assert_allclose(y_log_proba, np.log(y_proba))
+
+    # The second column must rank the training samples perfectly.
+    assert roc_auc_score(y, y_proba[:, 1]) == 1.0
+
+
+def test_predict_proba_multiclass():
+    # Test that predict_proba works as expected for multi class.
+    X = X_digits_multi[:10]
+    y = y_digits_multi[:10]
+
+    # No random_state is given here, so the fitted weights vary between runs;
+    # the assertions below only check shape and proba/log-proba consistency.
+    clf = MLPClassifier(hidden_layer_sizes=5)
+    with ignore_warnings(category=ConvergenceWarning):
+        clf.fit(X, y)
+    y_proba = clf.predict_proba(X)
+    y_log_proba = clf.predict_log_proba(X)
+
+    (n_samples, n_classes) = y.shape[0], np.unique(y).size
+
+    proba_max = y_proba.argmax(axis=1)
+    proba_log_max = y_log_proba.argmax(axis=1)
+
+    assert y_proba.shape == (n_samples, n_classes)
+    assert_array_equal(proba_max, proba_log_max)
+    assert_allclose(y_log_proba, np.log(y_proba))
+
+
+def test_predict_proba_multilabel():
+    # Test that predict_proba works as expected for multilabel.
+    # Multilabel should not use softmax which makes probabilities sum to 1
+    X, Y = make_multilabel_classification(
+        n_samples=50, random_state=0, return_indicator=True
+    )
+    n_samples, n_classes = Y.shape
+
+    clf = MLPClassifier(solver="lbfgs", hidden_layer_sizes=30, random_state=0)
+    clf.fit(X, Y)
+    y_proba = clf.predict_proba(X)
+
+    # Thresholding the probabilities at 0.5 must reproduce the training labels.
+    assert y_proba.shape == (n_samples, n_classes)
+    assert_array_equal(y_proba > 0.5, Y)
+
+    y_log_proba = clf.predict_log_proba(X)
+    proba_max = y_proba.argmax(axis=1)
+    proba_log_max = y_log_proba.argmax(axis=1)
+
+    # Sum of squared deviations of the row sums from 1: it must be non-zero,
+    # i.e. the rows are not normalised to sum to one.
+    assert (y_proba.sum(1) - 1).dot(y_proba.sum(1) - 1) > 1e-10
+    assert_array_equal(proba_max, proba_log_max)
+    assert_allclose(y_log_proba, np.log(y_proba))
+
+
+def test_shuffle():
+    # Test that the shuffle parameter affects the training process (it should)
+    X, y = make_regression(n_samples=50, n_features=5, n_targets=1, random_state=0)
+
+    # The coefficients will be identical if both do or do not shuffle
+    # (both estimators share random_state=0 and all other settings).
+    for shuffle in [True, False]:
+        mlp1 = MLPRegressor(
+            hidden_layer_sizes=1,
+            max_iter=1,
+            batch_size=1,
+            random_state=0,
+            shuffle=shuffle,
+        )
+        mlp2 = MLPRegressor(
+            hidden_layer_sizes=1,
+            max_iter=1,
+            batch_size=1,
+            random_state=0,
+            shuffle=shuffle,
+        )
+        mlp1.fit(X, y)
+        mlp2.fit(X, y)
+
+        assert np.array_equal(mlp1.coefs_[0], mlp2.coefs_[0])
+
+    # The coefficients will be slightly different if shuffle=True
+    # for one estimator and shuffle=False for the other.
+    mlp1 = MLPRegressor(
+        hidden_layer_sizes=1, max_iter=1, batch_size=1, random_state=0, shuffle=True
+    )
+    mlp2 = MLPRegressor(
+        hidden_layer_sizes=1, max_iter=1, batch_size=1, random_state=0, shuffle=False
+    )
+    mlp1.fit(X, y)
+    mlp2.fit(X, y)
+
+    assert not np.array_equal(mlp1.coefs_[0], mlp2.coefs_[0])
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_sparse_matrices(csr_container):
+    # Test that sparse and dense input matrices output the same results.
+    X = X_digits_binary[:50]
+    y = y_digits_binary[:50]
+    X_sparse = csr_container(X)
+    mlp = MLPClassifier(solver="lbfgs", hidden_layer_sizes=15, random_state=1)
+    # First comparison: model fitted on dense data vs. the same estimator
+    # refitted on the sparse copy.
+    mlp.fit(X, y)
+    pred1 = mlp.predict(X)
+    mlp.fit(X_sparse, y)
+    pred2 = mlp.predict(X_sparse)
+    assert_almost_equal(pred1, pred2)
+    # Second comparison: the sparse-fitted model predicting on dense and on
+    # sparse input.
+    pred1 = mlp.predict(X)
+    pred2 = mlp.predict(X_sparse)
+    assert_array_equal(pred1, pred2)
+
+
+def test_tolerance():
+    # Test tolerance.
+    # It should force the solver to exit the loop when it converges.
+    X = [[3, 2], [1, 6]]
+    y = [1, 0]
+    clf = MLPClassifier(tol=0.5, max_iter=3000, solver="sgd")
+    clf.fit(X, y)
+    assert clf.max_iter > clf.n_iter_
+
+
+def test_verbose_sgd():
+    # Test verbose.
+    X = [[3, 2], [1, 6]]
+    y = [1, 0]
+    clf = MLPClassifier(solver="sgd", max_iter=2, verbose=10, hidden_layer_sizes=2)
+    old_stdout = sys.stdout
+    sys.stdout = output = StringIO()
+
+    with ignore_warnings(category=ConvergenceWarning):
+        clf.fit(X, y)
+    clf.partial_fit(X, y)
+
+    sys.stdout = old_stdout
+    assert "Iteration" in output.getvalue()
+
+
+@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor])
+def test_early_stopping(MLPEstimator):
+    # With early_stopping=True training must stop before max_iter and expose
+    # validation scores instead of a best training loss.
+    X = X_digits_binary[:100]
+    y = y_digits_binary[:100]
+    tol = 0.2
+    mlp_estimator = MLPEstimator(
+        tol=tol, max_iter=3000, solver="sgd", early_stopping=True
+    )
+    mlp_estimator.fit(X, y)
+    assert mlp_estimator.max_iter > mlp_estimator.n_iter_
+
+    assert mlp_estimator.best_loss_ is None
+    assert isinstance(mlp_estimator.validation_scores_, list)
+
+    valid_scores = mlp_estimator.validation_scores_
+    best_valid_score = mlp_estimator.best_validation_score_
+    assert max(valid_scores) == best_valid_score
+    # The last two validation scores did not beat the best one by `tol`.
+    assert best_valid_score + tol > valid_scores[-2]
+    assert best_valid_score + tol > valid_scores[-1]
+
+    # check that the attributes `validation_scores_` and `best_validation_score_`
+    # are set to None when `early_stopping=False`
+    mlp_estimator = MLPEstimator(
+        tol=tol, max_iter=3000, solver="sgd", early_stopping=False
+    )
+    mlp_estimator.fit(X, y)
+    assert mlp_estimator.validation_scores_ is None
+    assert mlp_estimator.best_validation_score_ is None
+    assert mlp_estimator.best_loss_ is not None
+
+
+def test_adaptive_learning_rate():
+    X = [[3, 2], [1, 6]]
+    y = [1, 0]
+    clf = MLPClassifier(tol=0.5, max_iter=3000, solver="sgd", learning_rate="adaptive")
+    clf.fit(X, y)
+    assert clf.max_iter > clf.n_iter_
+    assert 1e-6 > clf._optimizer.learning_rate
+
+
+@ignore_warnings(category=RuntimeWarning)
+def test_warm_start():
+    # Warm-started refits are accepted when `y` has the same set of classes as
+    # the previous fit and rejected with a ValueError otherwise.
+    X = X_iris
+    y = y_iris
+
+    # Alternative label vectors of length 150; only y_3classes uses exactly
+    # the labels {0, 1, 2}.
+    y_2classes = np.array([0] * 75 + [1] * 75)
+    y_3classes = np.array([0] * 40 + [1] * 40 + [2] * 70)
+    y_3classes_alt = np.array([0] * 50 + [1] * 50 + [3] * 50)
+    y_4classes = np.array([0] * 37 + [1] * 37 + [2] * 38 + [3] * 38)
+    y_5classes = np.array([0] * 30 + [1] * 30 + [2] * 30 + [3] * 30 + [4] * 30)
+
+    # No error raised
+    clf = MLPClassifier(hidden_layer_sizes=2, solver="lbfgs", warm_start=True).fit(X, y)
+    clf.fit(X, y)
+    clf.fit(X, y_3classes)
+
+    for y_i in (y_2classes, y_3classes_alt, y_4classes, y_5classes):
+        clf = MLPClassifier(hidden_layer_sizes=2, solver="lbfgs", warm_start=True).fit(
+            X, y
+        )
+        # The message is escaped below because it contains regex
+        # metacharacters such as "[", "]" and ".".
+        message = (
+            "warm_start can only be used where `y` has the same "
+            "classes as in the previous call to fit."
+            " Previously got [0 1 2], `y` has %s" % np.unique(y_i)
+        )
+        with pytest.raises(ValueError, match=re.escape(message)):
+            clf.fit(X, y_i)
+
+
+@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor])
+def test_warm_start_full_iteration(MLPEstimator):
+    # Non-regression test for:
+    # https://github.com/scikit-learn/scikit-learn/issues/16812
+    # Check that the MLP estimator accomplish `max_iter` with a
+    # warm started estimator.
+    X, y = X_iris, y_iris
+    max_iter = 3
+    clf = MLPEstimator(
+        hidden_layer_sizes=2, solver="sgd", warm_start=True, max_iter=max_iter
+    )
+    clf.fit(X, y)
+    assert max_iter == clf.n_iter_
+    clf.fit(X, y)
+    assert max_iter == clf.n_iter_
+
+
+def test_n_iter_no_change():
+    # test n_iter_no_change using binary data set
+    # the classifying fitting process is not prone to loss curve fluctuations
+    X = X_digits_binary[:100]
+    y = y_digits_binary[:100]
+    tol = 0.01
+    max_iter = 3000
+
+    # test multiple n_iter_no_change
+    for n_iter_no_change in [2, 5, 10, 50, 100]:
+        clf = MLPClassifier(
+            tol=tol, max_iter=max_iter, solver="sgd", n_iter_no_change=n_iter_no_change
+        )
+        clf.fit(X, y)
+
+        # validate n_iter_no_change
+        # (the private counter ends one above the configured patience, and
+        # training stopped before max_iter)
+        assert clf._no_improvement_count == n_iter_no_change + 1
+        assert max_iter > clf.n_iter_
+
+
+@ignore_warnings(category=ConvergenceWarning)
+def test_n_iter_no_change_inf():
+    # test n_iter_no_change using binary data set
+    # the fitting process should go to max_iter iterations
+    X = X_digits_binary[:100]
+    y = y_digits_binary[:100]
+
+    # set a ridiculous tolerance
+    # this should always trigger _update_no_improvement_count()
+    tol = 1e9
+
+    # fit
+    # (an infinite patience means the no-improvement counter can never
+    # exceed it)
+    n_iter_no_change = np.inf
+    max_iter = 3000
+    clf = MLPClassifier(
+        tol=tol, max_iter=max_iter, solver="sgd", n_iter_no_change=n_iter_no_change
+    )
+    clf.fit(X, y)
+
+    # validate n_iter_no_change doesn't cause early stopping
+    assert clf.n_iter_ == max_iter
+
+    # validate _update_no_improvement_count() was always triggered
+    assert clf._no_improvement_count == clf.n_iter_ - 1
+
+
+def test_early_stopping_stratified():
+    # Make sure data splitting for early stopping is stratified
+    X = [[1, 2], [2, 3], [3, 4], [4, 5]]
+    y = [0, 0, 0, 1]
+
+    mlp = MLPClassifier(early_stopping=True)
+    with pytest.raises(
+        ValueError, match="The least populated class in y has only 1 member"
+    ):
+        mlp.fit(X, y)
+
+
+def test_mlp_classifier_dtypes_casting():
+    # Compare predictions for different dtypes
+    # Two identically configured classifiers: one is fitted on X_digits as
+    # loaded, the other on a float32 copy of the same rows.
+    mlp_64 = MLPClassifier(
+        alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=50
+    )
+    mlp_64.fit(X_digits[:300], y_digits[:300])
+    pred_64 = mlp_64.predict(X_digits[300:])
+    proba_64 = mlp_64.predict_proba(X_digits[300:])
+
+    mlp_32 = MLPClassifier(
+        alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=50
+    )
+    mlp_32.fit(X_digits[:300].astype(np.float32), y_digits[:300])
+    pred_32 = mlp_32.predict(X_digits[300:].astype(np.float32))
+    proba_32 = mlp_32.predict_proba(X_digits[300:].astype(np.float32))
+
+    # Predicted labels must match exactly; probabilities within 1% relative.
+    assert_array_equal(pred_64, pred_32)
+    assert_allclose(proba_64, proba_32, rtol=1e-02)
+
+
+def test_mlp_regressor_dtypes_casting():
+    # Compare regression predictions for X_digits as loaded and for a float32
+    # copy of the same rows, using two identically configured regressors.
+    mlp_64 = MLPRegressor(
+        alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=50
+    )
+    mlp_64.fit(X_digits[:300], y_digits[:300])
+    pred_64 = mlp_64.predict(X_digits[300:])
+
+    mlp_32 = MLPRegressor(
+        alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=50
+    )
+    mlp_32.fit(X_digits[:300].astype(np.float32), y_digits[:300])
+    pred_32 = mlp_32.predict(X_digits[300:].astype(np.float32))
+
+    assert_allclose(pred_64, pred_32, rtol=1e-04)
+
+
+@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+@pytest.mark.parametrize("Estimator", [MLPClassifier, MLPRegressor])
+def test_mlp_param_dtypes(dtype, Estimator):
+    # Checks if input dtype is used for network parameters
+    # and predictions
+    X, y = X_digits.astype(dtype), y_digits
+    mlp = Estimator(alpha=1e-5, hidden_layer_sizes=(5, 3), random_state=1, max_iter=50)
+    mlp.fit(X[:300], y[:300])
+    pred = mlp.predict(X[300:])
+
+    assert all([intercept.dtype == dtype for intercept in mlp.intercepts_])
+
+    assert all([coef.dtype == dtype for coef in mlp.coefs_])
+
+    if Estimator == MLPRegressor:
+        assert pred.dtype == dtype
+
+
+def test_mlp_loading_from_joblib_partial_fit(tmp_path):
+    """Loading from MLP and partial fitting updates weights. Non-regression
+    test for #19626."""
+    pre_trained_estimator = MLPRegressor(
+        hidden_layer_sizes=(42,), random_state=42, learning_rate_init=0.01, max_iter=200
+    )
+    features, target = [[2]], [4]
+
+    # Fit on x=2, y=4
+    pre_trained_estimator.fit(features, target)
+
+    # dump and load model
+    pickled_file = tmp_path / "mlp.pkl"
+    joblib.dump(pre_trained_estimator, pickled_file)
+    load_estimator = joblib.load(pickled_file)
+
+    # Train for more epochs on the point x=2, y=1
+    fine_tune_features, fine_tune_target = [[2]], [1]
+
+    for _ in range(200):
+        load_estimator.partial_fit(fine_tune_features, fine_tune_target)
+
+    # finetuned model learned the new target
+    predicted_value = load_estimator.predict(fine_tune_features)
+    assert_allclose(predicted_value, fine_tune_target, rtol=1e-4)
+
+
+@pytest.mark.parametrize("Estimator", [MLPClassifier, MLPRegressor])
+def test_preserve_feature_names(Estimator):
+    """Check that feature names are preserved when early stopping is enabled.
+
+    Feature names are required for consistency checks during scoring.
+
+    Non-regression test for gh-24846
+    """
+    # The test is skipped when pandas is not installed.
+    pd = pytest.importorskip("pandas")
+    rng = np.random.RandomState(0)
+
+    X = pd.DataFrame(data=rng.randn(10, 2), columns=["colname_a", "colname_b"])
+    y = pd.Series(data=np.full(10, 1), name="colname_y")
+
+    model = Estimator(early_stopping=True, validation_fraction=0.2)
+
+    # Turn any UserWarning raised during fit into an error so that it fails
+    # the test.
+    with warnings.catch_warnings():
+        warnings.simplefilter("error", UserWarning)
+        model.fit(X, y)
+
+
+@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor])
+def test_mlp_warm_start_with_early_stopping(MLPEstimator):
+    """Check that early stopping works with warm start."""
+    mlp = MLPEstimator(
+        max_iter=10, random_state=0, warm_start=True, early_stopping=True
+    )
+    mlp.fit(X_iris, y_iris)
+    n_validation_scores = len(mlp.validation_scores_)
+    mlp.set_params(max_iter=20)
+    mlp.fit(X_iris, y_iris)
+    assert len(mlp.validation_scores_) > n_validation_scores
+
+
+@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor])
+@pytest.mark.parametrize("solver", ["sgd", "adam", "lbfgs"])
+def test_mlp_warm_start_no_convergence(MLPEstimator, solver):
+    """Check that we stop the number of iteration at `max_iter` when warm starting.
+
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/24764
+    """
+    # n_iter_no_change=np.inf is passed together with early_stopping=False so
+    # that only max_iter is expected to end training.
+    model = MLPEstimator(
+        solver=solver,
+        warm_start=True,
+        early_stopping=False,
+        max_iter=10,
+        n_iter_no_change=np.inf,
+        random_state=0,
+    )
+
+    with pytest.warns(ConvergenceWarning):
+        model.fit(X_iris, y_iris)
+    assert model.n_iter_ == 10
+
+    # After raising max_iter to 20 the warm-started fit must end with
+    # n_iter_ == 20, i.e. the counter continues from the first fit.
+    model.set_params(max_iter=20)
+    with pytest.warns(ConvergenceWarning):
+        model.fit(X_iris, y_iris)
+    assert model.n_iter_ == 20
+
+
+@pytest.mark.parametrize("MLPEstimator", [MLPClassifier, MLPRegressor])
+def test_mlp_partial_fit_after_fit(MLPEstimator):
+    """Check partial fit does not fail after fit when early_stopping=True.
+
+    Non-regression test for gh-25693.
+    """
+    mlp = MLPEstimator(early_stopping=True, random_state=0).fit(X_iris, y_iris)
+
+    msg = "partial_fit does not support early_stopping=True"
+    with pytest.raises(ValueError, match=msg):
+        mlp.partial_fit(X_iris, y_iris)
--- a/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_rbm.py
+++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_rbm.py
@@ -0,0 +1,251 @@
+import re
+import sys
+from io import StringIO
+
+import numpy as np
+import pytest
+
+from sklearn.datasets import load_digits
+from sklearn.neural_network import BernoulliRBM
+from sklearn.utils._testing import (
+    assert_allclose,
+    assert_almost_equal,
+    assert_array_equal,
+)
+from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS, LIL_CONTAINERS
+from sklearn.utils.validation import assert_all_finite
+
+# Module-level fixture: the digits feature matrix (targets are discarded),
+# shifted and scaled in place so that its minimum is 0 and its maximum is 1.
+Xdigits, _ = load_digits(return_X_y=True)
+Xdigits -= Xdigits.min()
+Xdigits /= Xdigits.max()
+
+
+def test_fit():
+    X = Xdigits.copy()
+
+    rbm = BernoulliRBM(
+        n_components=64, learning_rate=0.1, batch_size=10, n_iter=7, random_state=9
+    )
+    rbm.fit(X)
+
+    assert_almost_equal(rbm.score_samples(X).mean(), -21.0, decimal=0)
+
+    # in-place tricks shouldn't have modified X
+    assert_array_equal(X, Xdigits)
+
+
+def test_partial_fit():
+    X = Xdigits.copy()
+    rbm = BernoulliRBM(
+        n_components=64, learning_rate=0.1, batch_size=20, random_state=9
+    )
+    n_samples = X.shape[0]
+    n_batches = int(np.ceil(float(n_samples) / rbm.batch_size))
+    batch_slices = np.array_split(X, n_batches)
+
+    for i in range(7):
+        for batch in batch_slices:
+            rbm.partial_fit(batch)
+
+    assert_almost_equal(rbm.score_samples(X).mean(), -21.0, decimal=0)
+    assert_array_equal(X, Xdigits)
+
+
+def test_transform():
+    X = Xdigits[:100]
+    rbm1 = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42)
+    rbm1.fit(X)
+
+    Xt1 = rbm1.transform(X)
+    Xt2 = rbm1._mean_hiddens(X)
+
+    assert_array_equal(Xt1, Xt2)
+
+
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_small_sparse(csr_container):
+    # BernoulliRBM should work on small sparse matrices.
+    X = csr_container(Xdigits[:4])
+    BernoulliRBM().fit(X)  # no exception
+
+
+@pytest.mark.parametrize("sparse_container", CSC_CONTAINERS + CSR_CONTAINERS)
+def test_small_sparse_partial_fit(sparse_container):
+    X_sparse = sparse_container(Xdigits[:100])
+    X = Xdigits[:100].copy()
+
+    rbm1 = BernoulliRBM(
+        n_components=64, learning_rate=0.1, batch_size=10, random_state=9
+    )
+    rbm2 = BernoulliRBM(
+        n_components=64, learning_rate=0.1, batch_size=10, random_state=9
+    )
+
+    rbm1.partial_fit(X_sparse)
+    rbm2.partial_fit(X)
+
+    assert_almost_equal(
+        rbm1.score_samples(X).mean(), rbm2.score_samples(X).mean(), decimal=0
+    )
+
+
+def test_sample_hiddens():
+    rng = np.random.RandomState(0)
+    X = Xdigits[:100]
+    rbm1 = BernoulliRBM(n_components=2, batch_size=5, n_iter=5, random_state=42)
+    rbm1.fit(X)
+
+    h = rbm1._mean_hiddens(X[0])
+    hs = np.mean([rbm1._sample_hiddens(X[0], rng) for i in range(100)], 0)
+
+    assert_almost_equal(h, hs, decimal=1)
+
+
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+def test_fit_gibbs(csc_container):
+    """Check that a Gibbs step reproduces a two-sample input, dense and sparse.
+
+    The same model configuration and seed are used for a dense array and for
+    its sparse counterpart; both fits must yield the hard-coded reference
+    components and both models must end up with equal components.
+    """
+    # XXX: this test is very seed-dependent! It probably needs to be rewritten.
+
+    # Gibbs on the RBM hidden layer should be able to recreate [[0], [1]]
+    # from the same input
+    rng = np.random.RandomState(42)
+    X = np.array([[0.0], [1.0]])
+    rbm1 = BernoulliRBM(n_components=2, batch_size=2, n_iter=42, random_state=rng)
+    # this many iterations are needed for the assertions below to hold
+    rbm1.fit(X)
+    assert_almost_equal(
+        rbm1.components_, np.array([[0.02649814], [0.02009084]]), decimal=4
+    )
+    assert_almost_equal(rbm1.gibbs(X), X)
+
+    # Gibbs on the RBM hidden layer should be able to recreate [[0], [1]] from
+    # the same input even when the input is sparse, and test against non-sparse
+    # (the RNG is re-created with the same seed so both fits start identically)
+    rng = np.random.RandomState(42)
+    X = csc_container([[0.0], [1.0]])
+    rbm2 = BernoulliRBM(n_components=2, batch_size=2, n_iter=42, random_state=rng)
+    rbm2.fit(X)
+    assert_almost_equal(
+        rbm2.components_, np.array([[0.02649814], [0.02009084]]), decimal=4
+    )
+    assert_almost_equal(rbm2.gibbs(X), X.toarray())
+    assert_almost_equal(rbm1.components_, rbm2.components_)
+
+
+def test_gibbs_smoke():
+    # Check if we don't get NaNs sampling the full digits dataset.
+    # Also check that sampling again will yield different results.
+    X = Xdigits
+    rbm1 = BernoulliRBM(n_components=42, batch_size=40, n_iter=20, random_state=42)
+    rbm1.fit(X)
+    X_sampled = rbm1.gibbs(X)
+    assert_all_finite(X_sampled)
+    X_sampled2 = rbm1.gibbs(X)
+    assert np.all((X_sampled != X_sampled2).max(axis=1))
+
+
+@pytest.mark.parametrize("lil_containers", LIL_CONTAINERS)
+def test_score_samples(lil_containers):
+    # Test score_samples (pseudo-likelihood) method.
+    # Assert that pseudo-likelihood is computed without clipping.
+    # See Fabian's blog, http://bit.ly/1iYefRk
+    rng = np.random.RandomState(42)
+    X = np.vstack([np.zeros(1000), np.ones(1000)])
+    rbm1 = BernoulliRBM(n_components=10, batch_size=2, n_iter=10, random_state=rng)
+    rbm1.fit(X)
+    assert (rbm1.score_samples(X) < -300).all()
+
+    # Sparse vs. dense should not affect the output. Also test sparse input
+    # validation.
+    rbm1.random_state = 42
+    d_score = rbm1.score_samples(X)
+    rbm1.random_state = 42
+    s_score = rbm1.score_samples(lil_containers(X))
+    assert_almost_equal(d_score, s_score)
+
+    # Test numerical stability (#2785): would previously generate infinities
+    # and crash with an exception.
+    with np.errstate(under="ignore"):
+        rbm1.score_samples([np.arange(1000) * 100])
+
+
+def test_rbm_verbose():
+    rbm = BernoulliRBM(n_iter=2, verbose=10)
+    old_stdout = sys.stdout
+    sys.stdout = StringIO()
+    try:
+        rbm.fit(Xdigits)
+    finally:
+        sys.stdout = old_stdout
+
+
+@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
+def test_sparse_and_verbose(csc_container):
+    # Make sure RBM works with sparse input when verbose=True
+    old_stdout = sys.stdout
+    sys.stdout = StringIO()
+
+    X = csc_container([[0.0], [1.0]])
+    rbm = BernoulliRBM(
+        n_components=2, batch_size=2, n_iter=1, random_state=42, verbose=True
+    )
+    try:
+        rbm.fit(X)
+        s = sys.stdout.getvalue()
+        # make sure output is sound
+        assert re.match(
+            r"\[BernoulliRBM\] Iteration 1,"
+            r" pseudo-likelihood = -?(\d)+(\.\d+)?,"
+            r" time = (\d|\.)+s",
+            s,
+        )
+    finally:
+        sys.stdout = old_stdout
+
+
+@pytest.mark.parametrize(
+    "dtype_in, dtype_out",
+    [(np.float32, np.float32), (np.float64, np.float64), (int, np.float64)],
+)
+def test_transformer_dtypes_casting(dtype_in, dtype_out):
+    X = Xdigits[:100].astype(dtype_in)
+    rbm = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42)
+    Xt = rbm.fit_transform(X)
+
+    # dtype_in and dtype_out should be consistent
+    assert Xt.dtype == dtype_out, "transform dtype: {} - original dtype: {}".format(
+        Xt.dtype, X.dtype
+    )
+
+
+def test_convergence_dtype_consistency():
+    # float 64 transformer
+    X_64 = Xdigits[:100].astype(np.float64)
+    rbm_64 = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42)
+    Xt_64 = rbm_64.fit_transform(X_64)
+
+    # float 32 transformer
+    X_32 = Xdigits[:100].astype(np.float32)
+    rbm_32 = BernoulliRBM(n_components=16, batch_size=5, n_iter=5, random_state=42)
+    Xt_32 = rbm_32.fit_transform(X_32)
+
+    # results and attributes should be close enough in 32 bit and 64 bit
+    assert_allclose(Xt_64, Xt_32, rtol=1e-06, atol=0)
+    assert_allclose(
+        rbm_64.intercept_hidden_, rbm_32.intercept_hidden_, rtol=1e-06, atol=0
+    )
+    assert_allclose(
+        rbm_64.intercept_visible_, rbm_32.intercept_visible_, rtol=1e-05, atol=0
+    )
+    assert_allclose(rbm_64.components_, rbm_32.components_, rtol=1e-03, atol=0)
+    assert_allclose(rbm_64.h_samples_, rbm_32.h_samples_)
+
+
+@pytest.mark.parametrize("method", ["fit", "partial_fit"])
+def test_feature_names_out(method):
+    """Check `get_feature_names_out` for `BernoulliRBM`."""
+    n_components = 10
+    rbm = BernoulliRBM(n_components=n_components)
+    getattr(rbm, method)(Xdigits)
+
+    names = rbm.get_feature_names_out()
+    expected_names = [f"bernoullirbm{i}" for i in range(n_components)]
+    assert_array_equal(expected_names, names)
--- a/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_stochastic_optimizers.py
+++ b/.venv/lib/python3.12/site-packages/sklearn/neural_network/tests/test_stochastic_optimizers.py
@@ -0,0 +1,112 @@
+import numpy as np
+
+from sklearn.neural_network._stochastic_optimizers import (
+    AdamOptimizer,
+    BaseOptimizer,
+    SGDOptimizer,
+)
+from sklearn.utils._testing import assert_array_equal
+
+# Array shapes used to build parameters and gradients in the optimizer tests
+# of this module: two 2-D shapes and one 3-D shape.
+shapes = [(4, 6), (6, 8), (7, 8, 9)]
+
+
+def test_base_optimizer():
+    for lr in [10**i for i in range(-3, 4)]:
+        optimizer = BaseOptimizer(lr)
+        assert optimizer.trigger_stopping("", False)
+
+
+def test_sgd_optimizer_no_momentum():
+    params = [np.zeros(shape) for shape in shapes]
+    rng = np.random.RandomState(0)
+
+    for lr in [10**i for i in range(-3, 4)]:
+        optimizer = SGDOptimizer(params, lr, momentum=0, nesterov=False)
+        grads = [rng.random_sample(shape) for shape in shapes]
+        expected = [param - lr * grad for param, grad in zip(params, grads)]
+        optimizer.update_params(params, grads)
+
+        for exp, param in zip(expected, params):
+            assert_array_equal(exp, param)
+
+
+def test_sgd_optimizer_momentum():
+    params = [np.zeros(shape) for shape in shapes]
+    lr = 0.1
+    rng = np.random.RandomState(0)
+
+    for momentum in np.arange(0.5, 0.9, 0.1):
+        optimizer = SGDOptimizer(params, lr, momentum=momentum, nesterov=False)
+        velocities = [rng.random_sample(shape) for shape in shapes]
+        optimizer.velocities = velocities
+        grads = [rng.random_sample(shape) for shape in shapes]
+        updates = [
+            momentum * velocity - lr * grad for velocity, grad in zip(velocities, grads)
+        ]
+        expected = [param + update for param, update in zip(params, updates)]
+        optimizer.update_params(params, grads)
+
+        for exp, param in zip(expected, params):
+            assert_array_equal(exp, param)
+
+
+def test_sgd_optimizer_trigger_stopping():
+    params = [np.zeros(shape) for shape in shapes]
+    lr = 2e-6
+    optimizer = SGDOptimizer(params, lr, lr_schedule="adaptive")
+    assert not optimizer.trigger_stopping("", False)
+    assert lr / 5 == optimizer.learning_rate
+    assert optimizer.trigger_stopping("", False)
+
+
+def test_sgd_optimizer_nesterovs_momentum():
+    params = [np.zeros(shape) for shape in shapes]
+    lr = 0.1
+    rng = np.random.RandomState(0)
+
+    for momentum in np.arange(0.5, 0.9, 0.1):
+        optimizer = SGDOptimizer(params, lr, momentum=momentum, nesterov=True)
+        velocities = [rng.random_sample(shape) for shape in shapes]
+        optimizer.velocities = velocities
+        grads = [rng.random_sample(shape) for shape in shapes]
+        updates = [
+            momentum * velocity - lr * grad for velocity, grad in zip(velocities, grads)
+        ]
+        updates = [
+            momentum * update - lr * grad for update, grad in zip(updates, grads)
+        ]
+        expected = [param + update for param, update in zip(params, updates)]
+        optimizer.update_params(params, grads)
+
+        for exp, param in zip(expected, params):
+            assert_array_equal(exp, param)
+
+
+def test_adam_optimizer():
+    params = [np.zeros(shape) for shape in shapes]
+    lr = 0.001
+    epsilon = 1e-8
+    rng = np.random.RandomState(0)
+
+    for beta_1 in np.arange(0.9, 1.0, 0.05):
+        for beta_2 in np.arange(0.995, 1.0, 0.001):
+            optimizer = AdamOptimizer(params, lr, beta_1, beta_2, epsilon)
+            ms = [rng.random_sample(shape) for shape in shapes]
+            vs = [rng.random_sample(shape) for shape in shapes]
+            t = 10
+            optimizer.ms = ms
+            optimizer.vs = vs
+            optimizer.t = t - 1
+            grads = [rng.random_sample(shape) for shape in shapes]
+
+            ms = [beta_1 * m + (1 - beta_1) * grad for m, grad in zip(ms, grads)]
+            vs = [beta_2 * v + (1 - beta_2) * (grad**2) for v, grad in zip(vs, grads)]
+            learning_rate = lr * np.sqrt(1 - beta_2**t) / (1 - beta_1**t)
+            updates = [
+                -learning_rate * m / (np.sqrt(v) + epsilon) for m, v in zip(ms, vs)
+            ]
+            expected = [param + update for param, update in zip(params, updates)]
+
+            optimizer.update_params(params, grads)
+            for exp, param in zip(expected, params):
+                assert_array_equal(exp, param)