library packages

2024-09-28 22:56:00 -07:00
parent 64d9b78b3a
commit 1973934e95
4893 changed files with 1184173 additions and 31 deletions
--- a/.venv/lib/python3.12/site-packages/sklearn/tree/_utils.pyx
+++ b/.venv/lib/python3.12/site-packages/sklearn/tree/_utils.pyx
@@ -0,0 +1,466 @@
+# Authors: Gilles Louppe <g.louppe@gmail.com>
+#          Peter Prettenhofer <peter.prettenhofer@gmail.com>
+#          Arnaud Joly <arnaud.v.joly@gmail.com>
+#          Jacob Schreiber <jmschreiber91@gmail.com>
+#          Nelson Liu <nelson@nelsonliu.me>
+#
+#
+# License: BSD 3 clause
+
+from libc.stdlib cimport free
+from libc.stdlib cimport realloc
+from libc.math cimport log as ln
+from libc.math cimport isnan
+
+import numpy as np
+cimport numpy as cnp
+cnp.import_array()
+
+from ..utils._random cimport our_rand_r
+
+# =============================================================================
+# Helper functions
+# =============================================================================
+
+cdef int safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil:
+    # sizeof(realloc_ptr[0]) would be more like idiomatic C, but causes Cython
+    # 0.20.1 to crash.
+    cdef size_t nbytes = nelems * sizeof(p[0][0])
+    if nbytes / sizeof(p[0][0]) != nelems:
+        # Overflow in the multiplication
+        raise MemoryError(f"could not allocate ({nelems} * {sizeof(p[0][0])}) bytes")
+
+    cdef realloc_ptr tmp = <realloc_ptr>realloc(p[0], nbytes)
+    if tmp == NULL:
+        raise MemoryError(f"could not allocate {nbytes} bytes")
+
+    p[0] = tmp
+    return 0
+
+
+def _realloc_test():
+    # Helper for tests. Tries to allocate <size_t>(-1) / 2 * sizeof(size_t)
+    # bytes, which will always overflow.
+    cdef intp_t* p = NULL
+    safe_realloc(&p, <size_t>(-1) / 2)
+    if p != NULL:
+        free(p)
+        assert False
+
+
+cdef inline cnp.ndarray sizet_ptr_to_ndarray(intp_t* data, intp_t size):
+    """Return copied data as 1D numpy array of intp's."""
+    cdef cnp.npy_intp shape[1]
+    shape[0] = <cnp.npy_intp> size
+    return cnp.PyArray_SimpleNewFromData(1, shape, cnp.NPY_INTP, data).copy()
+
+
+cdef inline intp_t rand_int(intp_t low, intp_t high,
+                            uint32_t* random_state) noexcept nogil:
+    """Generate a random integer in [low; end)."""
+    return low + our_rand_r(random_state) % (high - low)
+
+
+cdef inline float64_t rand_uniform(float64_t low, float64_t high,
+                                   uint32_t* random_state) noexcept nogil:
+    """Generate a random float64_t in [low; high)."""
+    return ((high - low) * <float64_t> our_rand_r(random_state) /
+            <float64_t> RAND_R_MAX) + low
+
+
+cdef inline float64_t log(float64_t x) noexcept nogil:
+    return ln(x) / ln(2.0)
+
+# =============================================================================
+# WeightedPQueue data structure
+# =============================================================================
+
+cdef class WeightedPQueue:
+    """A priority queue class, always sorted in increasing order.
+
+    Attributes
+    ----------
+    capacity : intp_t
+        The capacity of the priority queue.
+
+    array_ptr : intp_t
+        The water mark of the priority queue; the priority queue grows from
+        left to right in the array ``array_``. ``array_ptr`` is always
+        less than ``capacity``.
+
+    array_ : WeightedPQueueRecord*
+        The array of priority queue records. The minimum element is on the
+        left at index 0, and the maximum element is on the right at index
+        ``array_ptr-1``.
+    """
+
+    def __cinit__(self, intp_t capacity):
+        self.capacity = capacity
+        self.array_ptr = 0
+        safe_realloc(&self.array_, capacity)
+
+    def __dealloc__(self):
+        free(self.array_)
+
+    cdef int reset(self) except -1 nogil:
+        """Reset the WeightedPQueue to its state at construction
+
+        Return -1 in case of failure to allocate memory (and raise MemoryError)
+        or 0 otherwise.
+        """
+        self.array_ptr = 0
+        # Since safe_realloc can raise MemoryError, use `except -1`
+        safe_realloc(&self.array_, self.capacity)
+        return 0
+
+    cdef bint is_empty(self) noexcept nogil:
+        return self.array_ptr <= 0
+
+    cdef intp_t size(self) noexcept nogil:
+        return self.array_ptr
+
+    cdef int push(self, float64_t data, float64_t weight) except -1 nogil:
+        """Push record on the array.
+
+        Return -1 in case of failure to allocate memory (and raise MemoryError)
+        or 0 otherwise.
+        """
+        cdef intp_t array_ptr = self.array_ptr
+        cdef WeightedPQueueRecord* array = NULL
+        cdef intp_t i
+
+        # Resize if capacity not sufficient
+        if array_ptr >= self.capacity:
+            self.capacity *= 2
+            # Since safe_realloc can raise MemoryError, use `except -1`
+            safe_realloc(&self.array_, self.capacity)
+
+        # Put element as last element of array
+        array = self.array_
+        array[array_ptr].data = data
+        array[array_ptr].weight = weight
+
+        # bubble last element up according until it is sorted
+        # in ascending order
+        i = array_ptr
+        while(i != 0 and array[i].data < array[i-1].data):
+            array[i], array[i-1] = array[i-1], array[i]
+            i -= 1
+
+        # Increase element count
+        self.array_ptr = array_ptr + 1
+        return 0
+
+    cdef int remove(self, float64_t data, float64_t weight) noexcept nogil:
+        """Remove a specific value/weight record from the array.
+        Returns 0 if successful, -1 if record not found."""
+        cdef intp_t array_ptr = self.array_ptr
+        cdef WeightedPQueueRecord* array = self.array_
+        cdef intp_t idx_to_remove = -1
+        cdef intp_t i
+
+        if array_ptr <= 0:
+            return -1
+
+        # find element to remove
+        for i in range(array_ptr):
+            if array[i].data == data and array[i].weight == weight:
+                idx_to_remove = i
+                break
+
+        if idx_to_remove == -1:
+            return -1
+
+        # shift the elements after the removed element
+        # to the left.
+        for i in range(idx_to_remove, array_ptr-1):
+            array[i] = array[i+1]
+
+        self.array_ptr = array_ptr - 1
+        return 0
+
+    cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil:
+        """Remove the top (minimum) element from array.
+        Returns 0 if successful, -1 if nothing to remove."""
+        cdef intp_t array_ptr = self.array_ptr
+        cdef WeightedPQueueRecord* array = self.array_
+        cdef intp_t i
+
+        if array_ptr <= 0:
+            return -1
+
+        data[0] = array[0].data
+        weight[0] = array[0].weight
+
+        # shift the elements after the removed element
+        # to the left.
+        for i in range(0, array_ptr-1):
+            array[i] = array[i+1]
+
+        self.array_ptr = array_ptr - 1
+        return 0
+
+    cdef int peek(self, float64_t* data, float64_t* weight) noexcept nogil:
+        """Write the top element from array to a pointer.
+        Returns 0 if successful, -1 if nothing to write."""
+        cdef WeightedPQueueRecord* array = self.array_
+        if self.array_ptr <= 0:
+            return -1
+        # Take first value
+        data[0] = array[0].data
+        weight[0] = array[0].weight
+        return 0
+
+    cdef float64_t get_weight_from_index(self, intp_t index) noexcept nogil:
+        """Given an index between [0,self.current_capacity], access
+        the appropriate heap and return the requested weight"""
+        cdef WeightedPQueueRecord* array = self.array_
+
+        # get weight at index
+        return array[index].weight
+
+    cdef float64_t get_value_from_index(self, intp_t index) noexcept nogil:
+        """Given an index between [0,self.current_capacity], access
+        the appropriate heap and return the requested value"""
+        cdef WeightedPQueueRecord* array = self.array_
+
+        # get value at index
+        return array[index].data
+
+# =============================================================================
+# WeightedMedianCalculator data structure
+# =============================================================================
+
+cdef class WeightedMedianCalculator:
+    """A class to handle calculation of the weighted median from streams of
+    data. To do so, it maintains a parameter ``k`` such that the sum of the
+    weights in the range [0,k) is greater than or equal to half of the total
+    weight. By minimizing the value of ``k`` that fulfills this constraint,
+    calculating the median is done by either taking the value of the sample
+    at index ``k-1`` of ``samples`` (samples[k-1].data) or the average of
+    the samples at index ``k-1`` and ``k`` of ``samples``
+    ((samples[k-1] + samples[k]) / 2).
+
+    Attributes
+    ----------
+    initial_capacity : intp_t
+        The initial capacity of the WeightedMedianCalculator.
+
+    samples : WeightedPQueue
+        Holds the samples (consisting of values and their weights) used in the
+        weighted median calculation.
+
+    total_weight : float64_t
+        The sum of the weights of items in ``samples``. Represents the total
+        weight of all samples used in the median calculation.
+
+    k : intp_t
+        Index used to calculate the median.
+
+    sum_w_0_k : float64_t
+        The sum of the weights from samples[0:k]. Used in the weighted
+        median calculation; minimizing the value of ``k`` such that
+        ``sum_w_0_k`` >= ``total_weight / 2`` provides a mechanism for
+        calculating the median in constant time.
+
+    """
+
+    def __cinit__(self, intp_t initial_capacity):
+        self.initial_capacity = initial_capacity
+        self.samples = WeightedPQueue(initial_capacity)
+        self.total_weight = 0
+        self.k = 0
+        self.sum_w_0_k = 0
+
+    cdef intp_t size(self) noexcept nogil:
+        """Return the number of samples in the
+        WeightedMedianCalculator"""
+        return self.samples.size()
+
+    cdef int reset(self) except -1 nogil:
+        """Reset the WeightedMedianCalculator to its state at construction
+
+        Return -1 in case of failure to allocate memory (and raise MemoryError)
+        or 0 otherwise.
+        """
+        # samples.reset (WeightedPQueue.reset) uses safe_realloc, hence
+        # except -1
+        self.samples.reset()
+        self.total_weight = 0
+        self.k = 0
+        self.sum_w_0_k = 0
+        return 0
+
+    cdef int push(self, float64_t data, float64_t weight) except -1 nogil:
+        """Push a value and its associated weight to the WeightedMedianCalculator
+
+        Return -1 in case of failure to allocate memory (and raise MemoryError)
+        or 0 otherwise.
+        """
+        cdef int return_value
+        cdef float64_t original_median = 0.0
+
+        if self.size() != 0:
+            original_median = self.get_median()
+        # samples.push (WeightedPQueue.push) uses safe_realloc, hence except -1
+        return_value = self.samples.push(data, weight)
+        self.update_median_parameters_post_push(data, weight,
+                                                original_median)
+        return return_value
+
+    cdef int update_median_parameters_post_push(
+            self, float64_t data, float64_t weight,
+            float64_t original_median) noexcept nogil:
+        """Update the parameters used in the median calculation,
+        namely `k` and `sum_w_0_k` after an insertion"""
+
+        # trivial case of one element.
+        if self.size() == 1:
+            self.k = 1
+            self.total_weight = weight
+            self.sum_w_0_k = self.total_weight
+            return 0
+
+        # get the original weighted median
+        self.total_weight += weight
+
+        if data < original_median:
+            # inserting below the median, so increment k and
+            # then update self.sum_w_0_k accordingly by adding
+            # the weight that was added.
+            self.k += 1
+            # update sum_w_0_k by adding the weight added
+            self.sum_w_0_k += weight
+
+            # minimize k such that sum(W[0:k]) >= total_weight / 2
+            # minimum value of k is 1
+            while(self.k > 1 and ((self.sum_w_0_k -
+                                   self.samples.get_weight_from_index(self.k-1))
+                                  >= self.total_weight / 2.0)):
+                self.k -= 1
+                self.sum_w_0_k -= self.samples.get_weight_from_index(self.k)
+            return 0
+
+        if data >= original_median:
+            # inserting above or at the median
+            # minimize k such that sum(W[0:k]) >= total_weight / 2
+            while(self.k < self.samples.size() and
+                  (self.sum_w_0_k < self.total_weight / 2.0)):
+                self.k += 1
+                self.sum_w_0_k += self.samples.get_weight_from_index(self.k-1)
+            return 0
+
+    cdef int remove(self, float64_t data, float64_t weight) noexcept nogil:
+        """Remove a value from the MedianHeap, removing it
+        from consideration in the median calculation
+        """
+        cdef int return_value
+        cdef float64_t original_median = 0.0
+
+        if self.size() != 0:
+            original_median = self.get_median()
+
+        return_value = self.samples.remove(data, weight)
+        self.update_median_parameters_post_remove(data, weight,
+                                                  original_median)
+        return return_value
+
+    cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil:
+        """Pop a value from the MedianHeap, starting from the
+        left and moving to the right.
+        """
+        cdef int return_value
+        cdef float64_t original_median = 0.0
+
+        if self.size() != 0:
+            original_median = self.get_median()
+
+        # no elements to pop
+        if self.samples.size() == 0:
+            return -1
+
+        return_value = self.samples.pop(data, weight)
+        self.update_median_parameters_post_remove(data[0],
+                                                  weight[0],
+                                                  original_median)
+        return return_value
+
+    cdef int update_median_parameters_post_remove(
+            self, float64_t data, float64_t weight,
+            float64_t original_median) noexcept nogil:
+        """Update the parameters used in the median calculation,
+        namely `k` and `sum_w_0_k` after a removal"""
+        # reset parameters because it there are no elements
+        if self.samples.size() == 0:
+            self.k = 0
+            self.total_weight = 0
+            self.sum_w_0_k = 0
+            return 0
+
+        # trivial case of one element.
+        if self.samples.size() == 1:
+            self.k = 1
+            self.total_weight -= weight
+            self.sum_w_0_k = self.total_weight
+            return 0
+
+        # get the current weighted median
+        self.total_weight -= weight
+
+        if data < original_median:
+            # removing below the median, so decrement k and
+            # then update self.sum_w_0_k accordingly by subtracting
+            # the removed weight
+
+            self.k -= 1
+            # update sum_w_0_k by removing the weight at index k
+            self.sum_w_0_k -= weight
+
+            # minimize k such that sum(W[0:k]) >= total_weight / 2
+            # by incrementing k and updating sum_w_0_k accordingly
+            # until the condition is met.
+            while(self.k < self.samples.size() and
+                  (self.sum_w_0_k < self.total_weight / 2.0)):
+                self.k += 1
+                self.sum_w_0_k += self.samples.get_weight_from_index(self.k-1)
+            return 0
+
+        if data >= original_median:
+            # removing above the median
+            # minimize k such that sum(W[0:k]) >= total_weight / 2
+            while(self.k > 1 and ((self.sum_w_0_k -
+                                   self.samples.get_weight_from_index(self.k-1))
+                                  >= self.total_weight / 2.0)):
+                self.k -= 1
+                self.sum_w_0_k -= self.samples.get_weight_from_index(self.k)
+            return 0
+
+    cdef float64_t get_median(self) noexcept nogil:
+        """Write the median to a pointer, taking into account
+        sample weights."""
+        if self.sum_w_0_k == (self.total_weight / 2.0):
+            # split median
+            return (self.samples.get_value_from_index(self.k) +
+                    self.samples.get_value_from_index(self.k-1)) / 2.0
+        if self.sum_w_0_k > (self.total_weight / 2.0):
+            # whole median
+            return self.samples.get_value_from_index(self.k-1)
+
+
+def _any_isnan_axis0(const float32_t[:, :] X):
+    """Same as np.any(np.isnan(X), axis=0)"""
+    cdef:
+        intp_t i, j
+        intp_t n_samples = X.shape[0]
+        intp_t n_features = X.shape[1]
+        unsigned char[::1] isnan_out = np.zeros(X.shape[1], dtype=np.bool_)
+
+    with nogil:
+        for i in range(n_samples):
+            for j in range(n_features):
+                if isnan_out[j]:
+                    continue
+                if isnan(X[i, j]):
+                    isnan_out[j] = True
+                    break
+    return np.asarray(isnan_out)