library packages
This commit is contained in:
466
.venv/lib/python3.12/site-packages/sklearn/tree/_utils.pyx
Normal file
466
.venv/lib/python3.12/site-packages/sklearn/tree/_utils.pyx
Normal file
@@ -0,0 +1,466 @@
|
||||
# Authors: Gilles Louppe <g.louppe@gmail.com>
|
||||
# Peter Prettenhofer <peter.prettenhofer@gmail.com>
|
||||
# Arnaud Joly <arnaud.v.joly@gmail.com>
|
||||
# Jacob Schreiber <jmschreiber91@gmail.com>
|
||||
# Nelson Liu <nelson@nelsonliu.me>
|
||||
#
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
from libc.stdlib cimport free
|
||||
from libc.stdlib cimport realloc
|
||||
from libc.math cimport log as ln
|
||||
from libc.math cimport isnan
|
||||
|
||||
import numpy as np
|
||||
cimport numpy as cnp
|
||||
cnp.import_array()
|
||||
|
||||
from ..utils._random cimport our_rand_r
|
||||
|
||||
# =============================================================================
|
||||
# Helper functions
|
||||
# =============================================================================
|
||||
|
||||
cdef int safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil:
|
||||
# sizeof(realloc_ptr[0]) would be more like idiomatic C, but causes Cython
|
||||
# 0.20.1 to crash.
|
||||
cdef size_t nbytes = nelems * sizeof(p[0][0])
|
||||
if nbytes / sizeof(p[0][0]) != nelems:
|
||||
# Overflow in the multiplication
|
||||
raise MemoryError(f"could not allocate ({nelems} * {sizeof(p[0][0])}) bytes")
|
||||
|
||||
cdef realloc_ptr tmp = <realloc_ptr>realloc(p[0], nbytes)
|
||||
if tmp == NULL:
|
||||
raise MemoryError(f"could not allocate {nbytes} bytes")
|
||||
|
||||
p[0] = tmp
|
||||
return 0
|
||||
|
||||
|
||||
def _realloc_test():
|
||||
# Helper for tests. Tries to allocate <size_t>(-1) / 2 * sizeof(size_t)
|
||||
# bytes, which will always overflow.
|
||||
cdef intp_t* p = NULL
|
||||
safe_realloc(&p, <size_t>(-1) / 2)
|
||||
if p != NULL:
|
||||
free(p)
|
||||
assert False
|
||||
|
||||
|
||||
cdef inline cnp.ndarray sizet_ptr_to_ndarray(intp_t* data, intp_t size):
|
||||
"""Return copied data as 1D numpy array of intp's."""
|
||||
cdef cnp.npy_intp shape[1]
|
||||
shape[0] = <cnp.npy_intp> size
|
||||
return cnp.PyArray_SimpleNewFromData(1, shape, cnp.NPY_INTP, data).copy()
|
||||
|
||||
|
||||
cdef inline intp_t rand_int(intp_t low, intp_t high,
|
||||
uint32_t* random_state) noexcept nogil:
|
||||
"""Generate a random integer in [low; end)."""
|
||||
return low + our_rand_r(random_state) % (high - low)
|
||||
|
||||
|
||||
cdef inline float64_t rand_uniform(float64_t low, float64_t high,
|
||||
uint32_t* random_state) noexcept nogil:
|
||||
"""Generate a random float64_t in [low; high)."""
|
||||
return ((high - low) * <float64_t> our_rand_r(random_state) /
|
||||
<float64_t> RAND_R_MAX) + low
|
||||
|
||||
|
||||
cdef inline float64_t log(float64_t x) noexcept nogil:
|
||||
return ln(x) / ln(2.0)
|
||||
|
||||
# =============================================================================
|
||||
# WeightedPQueue data structure
|
||||
# =============================================================================
|
||||
|
||||
cdef class WeightedPQueue:
|
||||
"""A priority queue class, always sorted in increasing order.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
capacity : intp_t
|
||||
The capacity of the priority queue.
|
||||
|
||||
array_ptr : intp_t
|
||||
The water mark of the priority queue; the priority queue grows from
|
||||
left to right in the array ``array_``. ``array_ptr`` is always
|
||||
less than ``capacity``.
|
||||
|
||||
array_ : WeightedPQueueRecord*
|
||||
The array of priority queue records. The minimum element is on the
|
||||
left at index 0, and the maximum element is on the right at index
|
||||
``array_ptr-1``.
|
||||
"""
|
||||
|
||||
def __cinit__(self, intp_t capacity):
|
||||
self.capacity = capacity
|
||||
self.array_ptr = 0
|
||||
safe_realloc(&self.array_, capacity)
|
||||
|
||||
def __dealloc__(self):
|
||||
free(self.array_)
|
||||
|
||||
cdef int reset(self) except -1 nogil:
|
||||
"""Reset the WeightedPQueue to its state at construction
|
||||
|
||||
Return -1 in case of failure to allocate memory (and raise MemoryError)
|
||||
or 0 otherwise.
|
||||
"""
|
||||
self.array_ptr = 0
|
||||
# Since safe_realloc can raise MemoryError, use `except -1`
|
||||
safe_realloc(&self.array_, self.capacity)
|
||||
return 0
|
||||
|
||||
cdef bint is_empty(self) noexcept nogil:
|
||||
return self.array_ptr <= 0
|
||||
|
||||
cdef intp_t size(self) noexcept nogil:
|
||||
return self.array_ptr
|
||||
|
||||
cdef int push(self, float64_t data, float64_t weight) except -1 nogil:
|
||||
"""Push record on the array.
|
||||
|
||||
Return -1 in case of failure to allocate memory (and raise MemoryError)
|
||||
or 0 otherwise.
|
||||
"""
|
||||
cdef intp_t array_ptr = self.array_ptr
|
||||
cdef WeightedPQueueRecord* array = NULL
|
||||
cdef intp_t i
|
||||
|
||||
# Resize if capacity not sufficient
|
||||
if array_ptr >= self.capacity:
|
||||
self.capacity *= 2
|
||||
# Since safe_realloc can raise MemoryError, use `except -1`
|
||||
safe_realloc(&self.array_, self.capacity)
|
||||
|
||||
# Put element as last element of array
|
||||
array = self.array_
|
||||
array[array_ptr].data = data
|
||||
array[array_ptr].weight = weight
|
||||
|
||||
# bubble last element up according until it is sorted
|
||||
# in ascending order
|
||||
i = array_ptr
|
||||
while(i != 0 and array[i].data < array[i-1].data):
|
||||
array[i], array[i-1] = array[i-1], array[i]
|
||||
i -= 1
|
||||
|
||||
# Increase element count
|
||||
self.array_ptr = array_ptr + 1
|
||||
return 0
|
||||
|
||||
cdef int remove(self, float64_t data, float64_t weight) noexcept nogil:
|
||||
"""Remove a specific value/weight record from the array.
|
||||
Returns 0 if successful, -1 if record not found."""
|
||||
cdef intp_t array_ptr = self.array_ptr
|
||||
cdef WeightedPQueueRecord* array = self.array_
|
||||
cdef intp_t idx_to_remove = -1
|
||||
cdef intp_t i
|
||||
|
||||
if array_ptr <= 0:
|
||||
return -1
|
||||
|
||||
# find element to remove
|
||||
for i in range(array_ptr):
|
||||
if array[i].data == data and array[i].weight == weight:
|
||||
idx_to_remove = i
|
||||
break
|
||||
|
||||
if idx_to_remove == -1:
|
||||
return -1
|
||||
|
||||
# shift the elements after the removed element
|
||||
# to the left.
|
||||
for i in range(idx_to_remove, array_ptr-1):
|
||||
array[i] = array[i+1]
|
||||
|
||||
self.array_ptr = array_ptr - 1
|
||||
return 0
|
||||
|
||||
cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil:
|
||||
"""Remove the top (minimum) element from array.
|
||||
Returns 0 if successful, -1 if nothing to remove."""
|
||||
cdef intp_t array_ptr = self.array_ptr
|
||||
cdef WeightedPQueueRecord* array = self.array_
|
||||
cdef intp_t i
|
||||
|
||||
if array_ptr <= 0:
|
||||
return -1
|
||||
|
||||
data[0] = array[0].data
|
||||
weight[0] = array[0].weight
|
||||
|
||||
# shift the elements after the removed element
|
||||
# to the left.
|
||||
for i in range(0, array_ptr-1):
|
||||
array[i] = array[i+1]
|
||||
|
||||
self.array_ptr = array_ptr - 1
|
||||
return 0
|
||||
|
||||
cdef int peek(self, float64_t* data, float64_t* weight) noexcept nogil:
|
||||
"""Write the top element from array to a pointer.
|
||||
Returns 0 if successful, -1 if nothing to write."""
|
||||
cdef WeightedPQueueRecord* array = self.array_
|
||||
if self.array_ptr <= 0:
|
||||
return -1
|
||||
# Take first value
|
||||
data[0] = array[0].data
|
||||
weight[0] = array[0].weight
|
||||
return 0
|
||||
|
||||
cdef float64_t get_weight_from_index(self, intp_t index) noexcept nogil:
|
||||
"""Given an index between [0,self.current_capacity], access
|
||||
the appropriate heap and return the requested weight"""
|
||||
cdef WeightedPQueueRecord* array = self.array_
|
||||
|
||||
# get weight at index
|
||||
return array[index].weight
|
||||
|
||||
cdef float64_t get_value_from_index(self, intp_t index) noexcept nogil:
|
||||
"""Given an index between [0,self.current_capacity], access
|
||||
the appropriate heap and return the requested value"""
|
||||
cdef WeightedPQueueRecord* array = self.array_
|
||||
|
||||
# get value at index
|
||||
return array[index].data
|
||||
|
||||
# =============================================================================
|
||||
# WeightedMedianCalculator data structure
|
||||
# =============================================================================
|
||||
|
||||
cdef class WeightedMedianCalculator:
|
||||
"""A class to handle calculation of the weighted median from streams of
|
||||
data. To do so, it maintains a parameter ``k`` such that the sum of the
|
||||
weights in the range [0,k) is greater than or equal to half of the total
|
||||
weight. By minimizing the value of ``k`` that fulfills this constraint,
|
||||
calculating the median is done by either taking the value of the sample
|
||||
at index ``k-1`` of ``samples`` (samples[k-1].data) or the average of
|
||||
the samples at index ``k-1`` and ``k`` of ``samples``
|
||||
((samples[k-1] + samples[k]) / 2).
|
||||
|
||||
Attributes
|
||||
----------
|
||||
initial_capacity : intp_t
|
||||
The initial capacity of the WeightedMedianCalculator.
|
||||
|
||||
samples : WeightedPQueue
|
||||
Holds the samples (consisting of values and their weights) used in the
|
||||
weighted median calculation.
|
||||
|
||||
total_weight : float64_t
|
||||
The sum of the weights of items in ``samples``. Represents the total
|
||||
weight of all samples used in the median calculation.
|
||||
|
||||
k : intp_t
|
||||
Index used to calculate the median.
|
||||
|
||||
sum_w_0_k : float64_t
|
||||
The sum of the weights from samples[0:k]. Used in the weighted
|
||||
median calculation; minimizing the value of ``k`` such that
|
||||
``sum_w_0_k`` >= ``total_weight / 2`` provides a mechanism for
|
||||
calculating the median in constant time.
|
||||
|
||||
"""
|
||||
|
||||
def __cinit__(self, intp_t initial_capacity):
|
||||
self.initial_capacity = initial_capacity
|
||||
self.samples = WeightedPQueue(initial_capacity)
|
||||
self.total_weight = 0
|
||||
self.k = 0
|
||||
self.sum_w_0_k = 0
|
||||
|
||||
cdef intp_t size(self) noexcept nogil:
|
||||
"""Return the number of samples in the
|
||||
WeightedMedianCalculator"""
|
||||
return self.samples.size()
|
||||
|
||||
cdef int reset(self) except -1 nogil:
|
||||
"""Reset the WeightedMedianCalculator to its state at construction
|
||||
|
||||
Return -1 in case of failure to allocate memory (and raise MemoryError)
|
||||
or 0 otherwise.
|
||||
"""
|
||||
# samples.reset (WeightedPQueue.reset) uses safe_realloc, hence
|
||||
# except -1
|
||||
self.samples.reset()
|
||||
self.total_weight = 0
|
||||
self.k = 0
|
||||
self.sum_w_0_k = 0
|
||||
return 0
|
||||
|
||||
cdef int push(self, float64_t data, float64_t weight) except -1 nogil:
|
||||
"""Push a value and its associated weight to the WeightedMedianCalculator
|
||||
|
||||
Return -1 in case of failure to allocate memory (and raise MemoryError)
|
||||
or 0 otherwise.
|
||||
"""
|
||||
cdef int return_value
|
||||
cdef float64_t original_median = 0.0
|
||||
|
||||
if self.size() != 0:
|
||||
original_median = self.get_median()
|
||||
# samples.push (WeightedPQueue.push) uses safe_realloc, hence except -1
|
||||
return_value = self.samples.push(data, weight)
|
||||
self.update_median_parameters_post_push(data, weight,
|
||||
original_median)
|
||||
return return_value
|
||||
|
||||
cdef int update_median_parameters_post_push(
|
||||
self, float64_t data, float64_t weight,
|
||||
float64_t original_median) noexcept nogil:
|
||||
"""Update the parameters used in the median calculation,
|
||||
namely `k` and `sum_w_0_k` after an insertion"""
|
||||
|
||||
# trivial case of one element.
|
||||
if self.size() == 1:
|
||||
self.k = 1
|
||||
self.total_weight = weight
|
||||
self.sum_w_0_k = self.total_weight
|
||||
return 0
|
||||
|
||||
# get the original weighted median
|
||||
self.total_weight += weight
|
||||
|
||||
if data < original_median:
|
||||
# inserting below the median, so increment k and
|
||||
# then update self.sum_w_0_k accordingly by adding
|
||||
# the weight that was added.
|
||||
self.k += 1
|
||||
# update sum_w_0_k by adding the weight added
|
||||
self.sum_w_0_k += weight
|
||||
|
||||
# minimize k such that sum(W[0:k]) >= total_weight / 2
|
||||
# minimum value of k is 1
|
||||
while(self.k > 1 and ((self.sum_w_0_k -
|
||||
self.samples.get_weight_from_index(self.k-1))
|
||||
>= self.total_weight / 2.0)):
|
||||
self.k -= 1
|
||||
self.sum_w_0_k -= self.samples.get_weight_from_index(self.k)
|
||||
return 0
|
||||
|
||||
if data >= original_median:
|
||||
# inserting above or at the median
|
||||
# minimize k such that sum(W[0:k]) >= total_weight / 2
|
||||
while(self.k < self.samples.size() and
|
||||
(self.sum_w_0_k < self.total_weight / 2.0)):
|
||||
self.k += 1
|
||||
self.sum_w_0_k += self.samples.get_weight_from_index(self.k-1)
|
||||
return 0
|
||||
|
||||
cdef int remove(self, float64_t data, float64_t weight) noexcept nogil:
|
||||
"""Remove a value from the MedianHeap, removing it
|
||||
from consideration in the median calculation
|
||||
"""
|
||||
cdef int return_value
|
||||
cdef float64_t original_median = 0.0
|
||||
|
||||
if self.size() != 0:
|
||||
original_median = self.get_median()
|
||||
|
||||
return_value = self.samples.remove(data, weight)
|
||||
self.update_median_parameters_post_remove(data, weight,
|
||||
original_median)
|
||||
return return_value
|
||||
|
||||
cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil:
|
||||
"""Pop a value from the MedianHeap, starting from the
|
||||
left and moving to the right.
|
||||
"""
|
||||
cdef int return_value
|
||||
cdef float64_t original_median = 0.0
|
||||
|
||||
if self.size() != 0:
|
||||
original_median = self.get_median()
|
||||
|
||||
# no elements to pop
|
||||
if self.samples.size() == 0:
|
||||
return -1
|
||||
|
||||
return_value = self.samples.pop(data, weight)
|
||||
self.update_median_parameters_post_remove(data[0],
|
||||
weight[0],
|
||||
original_median)
|
||||
return return_value
|
||||
|
||||
cdef int update_median_parameters_post_remove(
|
||||
self, float64_t data, float64_t weight,
|
||||
float64_t original_median) noexcept nogil:
|
||||
"""Update the parameters used in the median calculation,
|
||||
namely `k` and `sum_w_0_k` after a removal"""
|
||||
# reset parameters because it there are no elements
|
||||
if self.samples.size() == 0:
|
||||
self.k = 0
|
||||
self.total_weight = 0
|
||||
self.sum_w_0_k = 0
|
||||
return 0
|
||||
|
||||
# trivial case of one element.
|
||||
if self.samples.size() == 1:
|
||||
self.k = 1
|
||||
self.total_weight -= weight
|
||||
self.sum_w_0_k = self.total_weight
|
||||
return 0
|
||||
|
||||
# get the current weighted median
|
||||
self.total_weight -= weight
|
||||
|
||||
if data < original_median:
|
||||
# removing below the median, so decrement k and
|
||||
# then update self.sum_w_0_k accordingly by subtracting
|
||||
# the removed weight
|
||||
|
||||
self.k -= 1
|
||||
# update sum_w_0_k by removing the weight at index k
|
||||
self.sum_w_0_k -= weight
|
||||
|
||||
# minimize k such that sum(W[0:k]) >= total_weight / 2
|
||||
# by incrementing k and updating sum_w_0_k accordingly
|
||||
# until the condition is met.
|
||||
while(self.k < self.samples.size() and
|
||||
(self.sum_w_0_k < self.total_weight / 2.0)):
|
||||
self.k += 1
|
||||
self.sum_w_0_k += self.samples.get_weight_from_index(self.k-1)
|
||||
return 0
|
||||
|
||||
if data >= original_median:
|
||||
# removing above the median
|
||||
# minimize k such that sum(W[0:k]) >= total_weight / 2
|
||||
while(self.k > 1 and ((self.sum_w_0_k -
|
||||
self.samples.get_weight_from_index(self.k-1))
|
||||
>= self.total_weight / 2.0)):
|
||||
self.k -= 1
|
||||
self.sum_w_0_k -= self.samples.get_weight_from_index(self.k)
|
||||
return 0
|
||||
|
||||
cdef float64_t get_median(self) noexcept nogil:
|
||||
"""Write the median to a pointer, taking into account
|
||||
sample weights."""
|
||||
if self.sum_w_0_k == (self.total_weight / 2.0):
|
||||
# split median
|
||||
return (self.samples.get_value_from_index(self.k) +
|
||||
self.samples.get_value_from_index(self.k-1)) / 2.0
|
||||
if self.sum_w_0_k > (self.total_weight / 2.0):
|
||||
# whole median
|
||||
return self.samples.get_value_from_index(self.k-1)
|
||||
|
||||
|
||||
def _any_isnan_axis0(const float32_t[:, :] X):
|
||||
"""Same as np.any(np.isnan(X), axis=0)"""
|
||||
cdef:
|
||||
intp_t i, j
|
||||
intp_t n_samples = X.shape[0]
|
||||
intp_t n_features = X.shape[1]
|
||||
unsigned char[::1] isnan_out = np.zeros(X.shape[1], dtype=np.bool_)
|
||||
|
||||
with nogil:
|
||||
for i in range(n_samples):
|
||||
for j in range(n_features):
|
||||
if isnan_out[j]:
|
||||
continue
|
||||
if isnan(X[i, j]):
|
||||
isnan_out[j] = True
|
||||
break
|
||||
return np.asarray(isnan_out)
|
||||
Reference in New Issue
Block a user