venv
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,748 @@
|
||||
from collections.abc import Generator
|
||||
from contextlib import contextmanager
|
||||
import re
|
||||
import struct
|
||||
import tracemalloc
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs import hashtable as ht
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.core.algorithms import isin
|
||||
|
||||
|
||||
@contextmanager
def activated_tracemalloc() -> Generator[None, None, None]:
    """Enable tracemalloc for the duration of the ``with`` block.

    Tracing is stopped in a ``finally`` so a raising test body cannot leak
    tracing state into subsequent tests.
    """
    tracemalloc.start()
    try:
        yield
    finally:
        tracemalloc.stop()
|
||||
|
||||
|
||||
def get_allocated_khash_memory():
    """Return total bytes currently allocated in the khash trace domain.

    Requires tracemalloc to be running (see ``activated_tracemalloc``);
    only allocations tagged with pandas' hashtable trace domain are summed.
    """
    khash_domain = ht.get_hashtable_trace_domain()
    snap = tracemalloc.take_snapshot().filter_traces(
        (tracemalloc.DomainFilter(True, khash_domain),)
    )
    total = 0
    for trace in snap.traces:
        total += trace.size
    return total
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "table_type, dtype",
    [
        (ht.PyObjectHashTable, np.object_),
        (ht.Complex128HashTable, np.complex128),
        (ht.Int64HashTable, np.int64),
        (ht.UInt64HashTable, np.uint64),
        (ht.Float64HashTable, np.float64),
        (ht.Complex64HashTable, np.complex64),
        (ht.Int32HashTable, np.int32),
        (ht.UInt32HashTable, np.uint32),
        (ht.Float32HashTable, np.float32),
        (ht.Int16HashTable, np.int16),
        (ht.UInt16HashTable, np.uint16),
        (ht.Int8HashTable, np.int8),
        (ht.UInt8HashTable, np.uint8),
        (ht.IntpHashTable, np.intp),
    ],
)
class TestHashTable:
    """Behavior checks run against every typed hashtable class."""

    def test_get_set_contains_len(self, table_type, dtype):
        # set_item/get_item/__contains__/__len__ round-trip, including
        # overwriting an existing key without growing the table.
        index = 5
        table = table_type(55)
        assert len(table) == 0
        assert index not in table

        table.set_item(index, 42)
        assert len(table) == 1
        assert index in table
        assert table.get_item(index) == 42

        table.set_item(index + 1, 41)
        assert index in table
        assert index + 1 in table
        assert len(table) == 2
        assert table.get_item(index) == 42
        assert table.get_item(index + 1) == 41

        # overwriting keeps len unchanged
        table.set_item(index, 21)
        assert index in table
        assert index + 1 in table
        assert len(table) == 2
        assert table.get_item(index) == 21
        assert table.get_item(index + 1) == 41
        assert index + 2 not in table

        table.set_item(index + 1, 21)
        assert index in table
        assert index + 1 in table
        assert len(table) == 2
        assert table.get_item(index) == 21
        assert table.get_item(index + 1) == 21

        # missing keys raise KeyError naming the key
        with pytest.raises(KeyError, match=str(index + 2)):
            table.get_item(index + 2)

    def test_get_set_contains_len_mask(self, table_type, dtype):
        # same as above but with uses_mask=True, where the NA value is a
        # distinct slot accessed via set_na/get_na
        if table_type == ht.PyObjectHashTable:
            pytest.skip("Mask not supported for object")
        index = 5
        table = table_type(55, uses_mask=True)
        assert len(table) == 0
        assert index not in table

        table.set_item(index, 42)
        assert len(table) == 1
        assert index in table
        assert table.get_item(index) == 42
        # the NA slot starts empty
        with pytest.raises(KeyError, match="NA"):
            table.get_na()

        table.set_item(index + 1, 41)
        table.set_na(41)
        assert pd.NA in table
        assert index in table
        assert index + 1 in table
        # the NA slot counts towards len
        assert len(table) == 3
        assert table.get_item(index) == 42
        assert table.get_item(index + 1) == 41
        assert table.get_na() == 41

        # overwriting NA keeps len unchanged
        table.set_na(21)
        assert index in table
        assert index + 1 in table
        assert len(table) == 3
        assert table.get_item(index + 1) == 41
        assert table.get_na() == 21
        assert index + 2 not in table

        with pytest.raises(KeyError, match=str(index + 2)):
            table.get_item(index + 2)

    def test_map_keys_to_values(self, table_type, dtype, writable):
        # only Int64HashTable has this method
        if table_type == ht.Int64HashTable:
            N = 77
            table = table_type()
            keys = np.arange(N).astype(dtype)
            vals = np.arange(N).astype(np.int64) + N
            keys.flags.writeable = writable
            vals.flags.writeable = writable
            table.map_keys_to_values(keys, vals)
            for i in range(N):
                assert table.get_item(keys[i]) == i + N

    def test_map_locations(self, table_type, dtype, writable):
        # map_locations stores each key's position in the input array
        N = 8
        table = table_type()
        keys = (np.arange(N) + N).astype(dtype)
        keys.flags.writeable = writable
        table.map_locations(keys)
        for i in range(N):
            assert table.get_item(keys[i]) == i

    def test_map_locations_mask(self, table_type, dtype, writable):
        # masked-out entries go to the NA slot instead of the regular table
        if table_type == ht.PyObjectHashTable:
            pytest.skip("Mask not supported for object")
        N = 3
        table = table_type(uses_mask=True)
        keys = (np.arange(N) + N).astype(dtype)
        keys.flags.writeable = writable
        table.map_locations(keys, np.array([False, False, True]))
        for i in range(N - 1):
            assert table.get_item(keys[i]) == i

        # the masked key was never inserted as a regular key ...
        with pytest.raises(KeyError, match=re.escape(str(keys[N - 1]))):
            table.get_item(keys[N - 1])

        # ... its position is recorded in the NA slot instead
        assert table.get_na() == 2

    def test_lookup(self, table_type, dtype, writable):
        # lookup returns each key's stored position
        N = 3
        table = table_type()
        keys = (np.arange(N) + N).astype(dtype)
        keys.flags.writeable = writable
        table.map_locations(keys)
        result = table.lookup(keys)
        expected = np.arange(N)
        tm.assert_numpy_array_equal(result.astype(np.int64), expected.astype(np.int64))

    def test_lookup_wrong(self, table_type, dtype):
        # keys absent from the table all map to -1
        if dtype in (np.int8, np.uint8):
            N = 100
        else:
            N = 512
        table = table_type()
        keys = (np.arange(N) + N).astype(dtype)
        table.map_locations(keys)
        wrong_keys = np.arange(N).astype(dtype)
        result = table.lookup(wrong_keys)
        assert np.all(result == -1)

    def test_lookup_mask(self, table_type, dtype, writable):
        # lookup with an accompanying mask
        if table_type == ht.PyObjectHashTable:
            pytest.skip("Mask not supported for object")
        N = 3
        table = table_type(uses_mask=True)
        keys = (np.arange(N) + N).astype(dtype)
        mask = np.array([False, True, False])
        keys.flags.writeable = writable
        table.map_locations(keys, mask)
        result = table.lookup(keys, mask)
        expected = np.arange(N)
        tm.assert_numpy_array_equal(result.astype(np.int64), expected.astype(np.int64))

        # an unmasked probe for a key that was only inserted as NA yields -1
        result = table.lookup(np.array([1 + N]).astype(dtype), np.array([False]))
        tm.assert_numpy_array_equal(
            result.astype(np.int64), np.array([-1], dtype=np.int64)
        )

    def test_unique(self, table_type, dtype, writable):
        # unique drops repeats while keeping first-seen order
        if dtype in (np.int8, np.uint8):
            N = 88
        else:
            N = 1000
        table = table_type()
        expected = (np.arange(N) + N).astype(dtype)
        keys = np.repeat(expected, 5)
        keys.flags.writeable = writable
        unique = table.unique(keys)
        tm.assert_numpy_array_equal(unique, expected)

    def test_tracemalloc_works(self, table_type, dtype):
        # sizeof() must agree with what tracemalloc sees in the khash
        # domain, and deleting the table must free everything
        if dtype in (np.int8, np.uint8):
            N = 256
        else:
            N = 30000
        keys = np.arange(N).astype(dtype)
        with activated_tracemalloc():
            table = table_type()
            table.map_locations(keys)
            used = get_allocated_khash_memory()
            my_size = table.sizeof()
            assert used == my_size
            del table
            assert get_allocated_khash_memory() == 0

    def test_tracemalloc_for_empty(self, table_type, dtype):
        # even an empty table's allocation is tracked and released
        with activated_tracemalloc():
            table = table_type()
            used = get_allocated_khash_memory()
            my_size = table.sizeof()
            assert used == my_size
            del table
            assert get_allocated_khash_memory() == 0

    def test_get_state(self, table_type, dtype):
        # get_state exposes khash internals; a fresh table holds no items
        table = table_type(1000)
        state = table.get_state()
        assert state["size"] == 0
        assert state["n_occupied"] == 0
        assert "n_buckets" in state
        assert "upper_bound" in state

    @pytest.mark.parametrize("N", range(1, 110))
    def test_no_reallocation(self, table_type, dtype, N):
        # preallocating for N keys means inserting N keys never resizes
        keys = np.arange(N).astype(dtype)
        preallocated_table = table_type(N)
        n_buckets_start = preallocated_table.get_state()["n_buckets"]
        preallocated_table.map_locations(keys)
        n_buckets_end = preallocated_table.get_state()["n_buckets"]
        # original number of buckets was enough:
        assert n_buckets_start == n_buckets_end
        # check with clean table (not too much preallocated)
        clean_table = table_type()
        clean_table.map_locations(keys)
        assert n_buckets_start == clean_table.get_state()["n_buckets"]
|
||||
|
||||
|
||||
class TestHashTableUnsorted:
    # TODO: moved from test_algos; may be redundancies with other tests
    def test_string_hashtable_set_item_signature(self):
        # GH#30419 fix typing in StringHashTable.set_item to prevent segfault
        tbl = ht.StringHashTable()

        tbl.set_item("key", 1)
        assert tbl.get_item("key") == 1

        with pytest.raises(TypeError, match="'key' has incorrect type"):
            # key arg typed as string, not object
            tbl.set_item(4, 6)
        with pytest.raises(TypeError, match="'val' has incorrect type"):
            tbl.get_item(4)

    def test_lookup_nan(self, writable):
        # GH#21688 ensure we can deal with readonly memory views
        xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
        xs.setflags(write=writable)
        m = ht.Float64HashTable()
        m.map_locations(xs)
        tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp))

    def test_add_signed_zeros(self):
        # GH#21866 inconsistent hash-function for float64
        # default hash-function would lead to different hash-buckets
        # for 0.0 and -0.0 if there are more than 2^30 hash-buckets
        # but this would mean 16GB
        N = 4  # 12 * 10**8 would trigger the error, if you have enough memory
        m = ht.Float64HashTable(N)
        m.set_item(0.0, 0)
        m.set_item(-0.0, 0)
        assert len(m) == 1  # 0.0 and -0.0 are equivalent

    def test_add_different_nans(self):
        # GH#21866 inconsistent hash-function for float64
        # create different nans from bit-patterns:
        NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
        NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
        # nan != nan confirms both payloads decode to genuine NaNs
        assert NAN1 != NAN1
        assert NAN2 != NAN2
        # default hash function would lead to different hash-buckets
        # for NAN1 and NAN2 even if there are only 4 buckets:
        m = ht.Float64HashTable()
        m.set_item(NAN1, 0)
        m.set_item(NAN2, 0)
        assert len(m) == 1  # NAN1 and NAN2 are equivalent

    def test_lookup_overflow(self, writable):
        # 2**63 does not fit in int64, so the uint64 path is exercised
        xs = np.array([1, 2, 2**63], dtype=np.uint64)
        # GH 21688 ensure we can deal with readonly memory views
        xs.setflags(write=writable)
        m = ht.UInt64HashTable()
        m.map_locations(xs)
        tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp))

    @pytest.mark.parametrize("nvals", [0, 10])  # resizing to 0 is special case
    @pytest.mark.parametrize(
        "htable, uniques, dtype, safely_resizes",
        [
            (ht.PyObjectHashTable, ht.ObjectVector, "object", False),
            (ht.StringHashTable, ht.ObjectVector, "object", True),
            (ht.Float64HashTable, ht.Float64Vector, "float64", False),
            (ht.Int64HashTable, ht.Int64Vector, "int64", False),
            (ht.Int32HashTable, ht.Int32Vector, "int32", False),
            (ht.UInt64HashTable, ht.UInt64Vector, "uint64", False),
        ],
    )
    def test_vector_resize(
        self, writable, htable, uniques, dtype, safely_resizes, nvals
    ):
        # Test for memory errors after internal vector
        # reallocations (GH 7157)
        # Changed from using np.random.default_rng(2).rand to range
        # which could cause flaky CI failures when safely_resizes=False
        vals = np.array(range(1000), dtype=dtype)

        # GH 21688 ensures we can deal with read-only memory views
        vals.setflags(write=writable)

        # initialise instances; cannot initialise in parametrization,
        # as otherwise external views would be held on the array (which is
        # one of the things this test is checking)
        htable = htable()
        uniques = uniques()

        # get_labels may append to uniques
        htable.get_labels(vals[:nvals], uniques, 0, -1)
        # to_array() sets an external_view_exists flag on uniques.
        tmp = uniques.to_array()
        oldshape = tmp.shape

        # subsequent get_labels() calls can no longer append to it
        # (except for StringHashTables + ObjectVector)
        if safely_resizes:
            htable.get_labels(vals, uniques, 0, -1)
        else:
            with pytest.raises(ValueError, match="external reference.*"):
                htable.get_labels(vals, uniques, 0, -1)

        uniques.to_array()  # should not raise here
        assert tmp.shape == oldshape

    @pytest.mark.parametrize(
        "hashtable",
        [
            ht.PyObjectHashTable,
            ht.StringHashTable,
            ht.Float64HashTable,
            ht.Int64HashTable,
            ht.Int32HashTable,
            ht.UInt64HashTable,
        ],
    )
    def test_hashtable_large_sizehint(self, hashtable):
        # GH#22729 smoketest for not raising when passing a large size_hint
        size_hint = np.iinfo(np.uint32).max + 1
        hashtable(size_hint=size_hint)
|
||||
|
||||
|
||||
class TestPyObjectHashTableWithNans:
    """Distinct nan-bearing objects of equal structure must share one slot."""

    def test_nan_float(self):
        # two distinct float('nan') objects act as the same key
        nan1 = float("nan")
        nan2 = float("nan")
        assert nan1 is not nan2
        table = ht.PyObjectHashTable()
        table.set_item(nan1, 42)
        assert table.get_item(nan2) == 42

    def test_nan_complex_both(self):
        # nan in both real and imaginary parts
        nan1 = complex(float("nan"), float("nan"))
        nan2 = complex(float("nan"), float("nan"))
        assert nan1 is not nan2
        table = ht.PyObjectHashTable()
        table.set_item(nan1, 42)
        assert table.get_item(nan2) == 42

    def test_nan_complex_real(self):
        # nan real part: imaginary parts still distinguish keys
        nan1 = complex(float("nan"), 1)
        nan2 = complex(float("nan"), 1)
        other = complex(float("nan"), 2)
        assert nan1 is not nan2
        table = ht.PyObjectHashTable()
        table.set_item(nan1, 42)
        assert table.get_item(nan2) == 42
        with pytest.raises(KeyError, match=None) as error:
            table.get_item(other)
        assert str(error.value) == str(other)

    def test_nan_complex_imag(self):
        # nan imaginary part: real parts still distinguish keys
        nan1 = complex(1, float("nan"))
        nan2 = complex(1, float("nan"))
        other = complex(2, float("nan"))
        assert nan1 is not nan2
        table = ht.PyObjectHashTable()
        table.set_item(nan1, 42)
        assert table.get_item(nan2) == 42
        with pytest.raises(KeyError, match=None) as error:
            table.get_item(other)
        assert str(error.value) == str(other)

    def test_nan_in_tuple(self):
        # nan equivalence also applies inside tuple keys
        nan1 = (float("nan"),)
        nan2 = (float("nan"),)
        assert nan1[0] is not nan2[0]
        table = ht.PyObjectHashTable()
        table.set_item(nan1, 42)
        assert table.get_item(nan2) == 42

    def test_nan_in_nested_tuple(self):
        # ... and at arbitrary nesting depth
        nan1 = (1, (2, (float("nan"),)))
        nan2 = (1, (2, (float("nan"),)))
        other = (1, 2)
        table = ht.PyObjectHashTable()
        table.set_item(nan1, 42)
        assert table.get_item(nan2) == 42
        with pytest.raises(KeyError, match=None) as error:
            table.get_item(other)
        assert str(error.value) == str(other)
|
||||
|
||||
|
||||
def test_hash_equal_tuple_with_nans():
    """Nested tuples built from distinct nan objects hash and compare equal."""
    left = (float("nan"), (float("nan"), float("nan")))
    right = (float("nan"), (float("nan"), float("nan")))
    assert ht.object_hash(left) == ht.object_hash(right)
    assert ht.objects_are_equal(left, right)
|
||||
|
||||
|
||||
def test_get_labels_groupby_for_Int64(writable):
|
||||
table = ht.Int64HashTable()
|
||||
vals = np.array([1, 2, -1, 2, 1, -1], dtype=np.int64)
|
||||
vals.flags.writeable = writable
|
||||
arr, unique = table.get_labels_groupby(vals)
|
||||
expected_arr = np.array([0, 1, -1, 1, 0, -1], dtype=np.intp)
|
||||
expected_unique = np.array([1, 2], dtype=np.int64)
|
||||
tm.assert_numpy_array_equal(arr, expected_arr)
|
||||
tm.assert_numpy_array_equal(unique, expected_unique)
|
||||
|
||||
|
||||
def test_tracemalloc_works_for_StringHashTable():
    """StringHashTable allocations show up in tracemalloc and vanish on delete."""
    n_keys = 1000
    str_keys = np.arange(n_keys).astype(np.str_).astype(np.object_)
    with activated_tracemalloc():
        table = ht.StringHashTable()
        table.map_locations(str_keys)
        # everything the table allocated is attributed to the khash domain
        assert get_allocated_khash_memory() == table.sizeof()
        del table
        # dropping the table must release all khash memory
        assert get_allocated_khash_memory() == 0
|
||||
|
||||
|
||||
def test_tracemalloc_for_empty_StringHashTable():
    """Even an empty StringHashTable's footprint is tracked and freed."""
    with activated_tracemalloc():
        table = ht.StringHashTable()
        # an empty table still owns khash memory matching sizeof()
        assert get_allocated_khash_memory() == table.sizeof()
        del table
        assert get_allocated_khash_memory() == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("N", range(1, 110))
def test_no_reallocation_StringHashTable(N):
    """Preallocating for N keys means map_locations never grows the table."""
    str_keys = np.arange(N).astype(np.str_).astype(np.object_)
    table = ht.StringHashTable(N)
    buckets_before = table.get_state()["n_buckets"]
    table.map_locations(str_keys)
    # original number of buckets was enough:
    assert table.get_state()["n_buckets"] == buckets_before
    # check with clean table (not too much preallocated)
    fresh = ht.StringHashTable()
    fresh.map_locations(str_keys)
    assert fresh.get_state()["n_buckets"] == buckets_before
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "table_type, dtype",
    [
        (ht.Float64HashTable, np.float64),
        (ht.Float32HashTable, np.float32),
        (ht.Complex128HashTable, np.complex128),
        (ht.Complex64HashTable, np.complex64),
    ],
)
class TestHashTableWithNans:
    """For float/complex tables all nans must collapse onto a single key."""

    def test_get_set_contains_len(self, table_type, dtype):
        index = float("nan")
        table = table_type()
        assert index not in table

        table.set_item(index, 42)
        assert len(table) == 1
        assert index in table
        assert table.get_item(index) == 42

        # setting nan again overwrites rather than adding a second key
        table.set_item(index, 41)
        assert len(table) == 1
        assert index in table
        assert table.get_item(index) == 41

    def test_map_locations(self, table_type, dtype):
        # an all-nan input occupies a single slot; the last position wins
        N = 10
        table = table_type()
        keys = np.full(N, np.nan, dtype=dtype)
        table.map_locations(keys)
        assert len(table) == 1
        assert table.get_item(np.nan) == N - 1

    def test_unique(self, table_type, dtype):
        # more nans than the default capacity still dedupe to one value
        N = 1020
        table = table_type()
        keys = np.full(N, np.nan, dtype=dtype)
        unique = table.unique(keys)
        assert np.all(np.isnan(unique)) and len(unique) == 1
|
||||
|
||||
|
||||
def test_unique_for_nan_objects_floats():
    """Fifty distinct float('nan') objects collapse to a single unique value."""
    table = ht.PyObjectHashTable()
    # the comprehension matters: each element is a *different* nan object
    nans = np.array([float("nan") for _ in range(50)], dtype=np.object_)
    assert len(table.unique(nans)) == 1
|
||||
|
||||
|
||||
def test_unique_for_nan_objects_complex():
    """Distinct complex objects with a nan real part dedupe to one value."""
    table = ht.PyObjectHashTable()
    # each element is a different object with the same (nan, 1.0) content
    nans = np.array(
        [complex(float("nan"), 1.0) for _ in range(50)], dtype=np.object_
    )
    assert len(table.unique(nans)) == 1
|
||||
|
||||
|
||||
def test_unique_for_nan_objects_tuple():
    """Tuples with nested nans dedupe together; the leading int stays distinct."""
    table = ht.PyObjectHashTable()
    data = [1] + [(1.0, (float("nan"), 1.0)) for _ in range(50)]
    uniques = table.unique(np.array(data, dtype=np.object_))
    # the 50 structurally-equal tuples count as one value, plus the int 1
    assert len(uniques) == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dtype",
    [
        np.object_,
        np.complex128,
        np.int64,
        np.uint64,
        np.float64,
        np.complex64,
        np.int32,
        np.uint32,
        np.float32,
        np.int16,
        np.uint16,
        np.int8,
        np.uint8,
        np.intp,
    ],
)
class TestHelpFunctions:
    """Dtype-dispatched helper functions in pandas._libs.hashtable."""

    def test_value_count(self, dtype, writable):
        N = 43
        expected = (np.arange(N) + N).astype(dtype)
        values = np.repeat(expected, 5)
        values.flags.writeable = writable
        keys, counts, _ = ht.value_count(values, False)
        tm.assert_numpy_array_equal(np.sort(keys), expected)
        assert np.all(counts == 5)

    def test_value_count_mask(self, dtype):
        if dtype == np.object_:
            pytest.skip("mask not implemented for object dtype")
        values = np.array([1] * 5, dtype=dtype)
        mask = np.zeros((5,), dtype=np.bool_)
        mask[1] = True
        mask[4] = True
        keys, counts, na_counter = ht.value_count(values, False, mask=mask)
        # the two masked entries are reported via na_counter, not counts
        assert len(keys) == 2
        assert na_counter == 2

    def test_value_count_stable(self, dtype, writable):
        # GH12679: keys come back in first-seen (input) order
        values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype)
        values.flags.writeable = writable
        keys, counts, _ = ht.value_count(values, False)
        tm.assert_numpy_array_equal(keys, values)
        assert np.all(counts == 1)

    def test_duplicated_first(self, dtype, writable):
        # only the first of each run of 5 repeats is unmarked
        N = 100
        values = np.repeat(np.arange(N).astype(dtype), 5)
        values.flags.writeable = writable
        result = ht.duplicated(values)
        expected = np.ones_like(values, dtype=np.bool_)
        expected[::5] = False
        tm.assert_numpy_array_equal(result, expected)

    def test_ismember_yes(self, dtype, writable):
        N = 127
        arr = np.arange(N).astype(dtype)
        values = np.arange(N).astype(dtype)
        arr.flags.writeable = writable
        values.flags.writeable = writable
        result = ht.ismember(arr, values)
        expected = np.ones_like(values, dtype=np.bool_)
        tm.assert_numpy_array_equal(result, expected)

    def test_ismember_no(self, dtype):
        # disjoint ranges: no element of arr occurs in values
        N = 17
        arr = np.arange(N).astype(dtype)
        values = (np.arange(N) + N).astype(dtype)
        result = ht.ismember(arr, values)
        expected = np.zeros_like(values, dtype=np.bool_)
        tm.assert_numpy_array_equal(result, expected)

    def test_mode(self, dtype, writable):
        if dtype in (np.int8, np.uint8):
            N = 53
        else:
            N = 11111
        values = np.repeat(np.arange(N).astype(dtype), 5)
        # 42 now occurs 6 times, one more than every other repeated value
        values[0] = 42
        values.flags.writeable = writable
        result = ht.mode(values, False)[0]
        assert result == 42

    def test_mode_stable(self, dtype, writable):
        # all values tie at count 1; modes come back in input order
        values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype)
        values.flags.writeable = writable
        keys = ht.mode(values, False)[0]
        tm.assert_numpy_array_equal(keys, values)
|
||||
|
||||
|
||||
def test_modes_with_nans():
|
||||
# GH42688, nans aren't mangled
|
||||
nulls = [pd.NA, np.nan, pd.NaT, None]
|
||||
values = np.array([True] + nulls * 2, dtype=np.object_)
|
||||
modes = ht.mode(values, False)[0]
|
||||
assert modes.size == len(nulls)
|
||||
|
||||
|
||||
def test_unique_label_indices_intp(writable):
|
||||
keys = np.array([1, 2, 2, 2, 1, 3], dtype=np.intp)
|
||||
keys.flags.writeable = writable
|
||||
result = ht.unique_label_indices(keys)
|
||||
expected = np.array([0, 1, 5], dtype=np.intp)
|
||||
tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
def test_unique_label_indices():
    """Cross-check unique_label_indices against np.unique(return_index=True)."""
    labels = np.random.default_rng(2).integers(1, 1 << 10, 1 << 15).astype(np.intp)

    result = ht.unique_label_indices(labels)
    expected = np.unique(labels, return_index=True)[1]
    tm.assert_numpy_array_equal(result, expected, check_dtype=False)

    # -1 labels are excluded, hence the [1:] on the np.unique result
    labels[np.random.default_rng(2).choice(len(labels), 10)] = -1
    result = ht.unique_label_indices(labels)
    expected = np.unique(labels, return_index=True)[1][1:]
    tm.assert_numpy_array_equal(result, expected, check_dtype=False)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dtype",
    [
        np.float64,
        np.float32,
        np.complex128,
        np.complex64,
    ],
)
class TestHelpFunctionsWithNans:
    """The helper functions must treat all nans as one value."""

    def test_value_count(self, dtype):
        values = np.array([np.nan, np.nan, np.nan], dtype=dtype)
        # with the flag set, nan keys are dropped entirely
        keys, counts, _ = ht.value_count(values, True)
        assert len(keys) == 0
        # otherwise the three nans collapse into one key with count 3
        keys, counts, _ = ht.value_count(values, False)
        assert len(keys) == 1 and np.all(np.isnan(keys))
        assert counts[0] == 3

    def test_duplicated_first(self, dtype):
        # only the first nan is considered "new"
        values = np.array([np.nan, np.nan, np.nan], dtype=dtype)
        result = ht.duplicated(values)
        expected = np.array([False, True, True])
        tm.assert_numpy_array_equal(result, expected)

    def test_ismember_yes(self, dtype):
        # nan in the lookup values matches nan in arr
        arr = np.array([np.nan, np.nan, np.nan], dtype=dtype)
        values = np.array([np.nan, np.nan], dtype=dtype)
        result = ht.ismember(arr, values)
        expected = np.array([True, True, True], dtype=np.bool_)
        tm.assert_numpy_array_equal(result, expected)

    def test_ismember_no(self, dtype):
        # nan does not match a non-nan value
        arr = np.array([np.nan, np.nan, np.nan], dtype=dtype)
        values = np.array([1], dtype=dtype)
        result = ht.ismember(arr, values)
        expected = np.array([False, False, False], dtype=np.bool_)
        tm.assert_numpy_array_equal(result, expected)

    def test_mode(self, dtype):
        values = np.array([42, np.nan, np.nan, np.nan], dtype=dtype)
        # dropping nans leaves 42 as the mode; keeping them makes nan win
        assert ht.mode(values, True)[0] == 42
        assert np.isnan(ht.mode(values, False)[0])
|
||||
|
||||
|
||||
def test_ismember_tuple_with_nans():
    # GH-41836: tuples containing nan must match structurally-equal tuples
    values = [("a", float("nan")), ("b", 1)]
    comps = [("a", float("nan"))]

    # isin on list input currently warns; the match string reproduces
    # pandas' message verbatim (including its doubled "not not")
    msg = "isin with argument that is not not a Series"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = isin(values, comps)
    expected = np.array([True, False], dtype=np.bool_)
    tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
def test_float_complex_int_are_equal_as_objects():
    """In object dtype, 5, 5.0 and 5+0j all match the integer 5 in comps."""
    values = ["a", 5, 5.0, 5.0 + 0j]
    comps = list(range(129))
    result = isin(np.array(values, dtype=object), np.asarray(comps))
    tm.assert_numpy_array_equal(
        result, np.array([False, True, True, True], dtype=np.bool_)
    )
|
390
venv/lib/python3.12/site-packages/pandas/tests/libs/test_join.py
Normal file
390
venv/lib/python3.12/site-packages/pandas/tests/libs/test_join.py
Normal file
@ -0,0 +1,390 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs import join as libjoin
|
||||
from pandas._libs.join import (
|
||||
inner_join,
|
||||
left_outer_join,
|
||||
)
|
||||
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestIndexer:
    """Tests for the low-level cython join indexer routines."""

    @pytest.mark.parametrize(
        "dtype", ["int32", "int64", "float32", "float64", "object"]
    )
    def test_outer_join_indexer(self, dtype):
        # outer join of [0,1,2] and [2,3,4]: the union, with -1 indexers
        # marking positions missing on one side
        indexer = libjoin.outer_join_indexer

        left = np.arange(3, dtype=dtype)
        right = np.arange(2, 5, dtype=dtype)
        empty = np.array([], dtype=dtype)

        result, lindexer, rindexer = indexer(left, right)
        assert isinstance(result, np.ndarray)
        assert isinstance(lindexer, np.ndarray)
        assert isinstance(rindexer, np.ndarray)
        tm.assert_numpy_array_equal(result, np.arange(5, dtype=dtype))
        exp = np.array([0, 1, 2, -1, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(lindexer, exp)
        exp = np.array([-1, -1, 0, 1, 2], dtype=np.intp)
        tm.assert_numpy_array_equal(rindexer, exp)

        # empty left side: everything comes from the right
        result, lindexer, rindexer = indexer(empty, right)
        tm.assert_numpy_array_equal(result, right)
        exp = np.array([-1, -1, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(lindexer, exp)
        exp = np.array([0, 1, 2], dtype=np.intp)
        tm.assert_numpy_array_equal(rindexer, exp)

        # empty right side: everything comes from the left
        result, lindexer, rindexer = indexer(left, empty)
        tm.assert_numpy_array_equal(result, left)
        exp = np.array([0, 1, 2], dtype=np.intp)
        tm.assert_numpy_array_equal(lindexer, exp)
        exp = np.array([-1, -1, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(rindexer, exp)

    def test_cython_left_outer_join(self):
        left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp)
        right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.intp)
        max_group = 5

        ls, rs = left_outer_join(left, right, max_group)

        # expected output expressed against mergesort-stable orderings
        exp_ls = left.argsort(kind="mergesort")
        exp_rs = right.argsort(kind="mergesort")

        exp_li = np.array(
            [0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10]
        )
        exp_ri = np.array(
            [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1]
        )

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_right_outer_join(self):
        # a right outer join is a left outer join with the sides swapped
        left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp)
        right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.intp)
        max_group = 5

        rs, ls = left_outer_join(right, left, max_group)

        exp_ls = left.argsort(kind="mergesort")
        exp_rs = right.argsort(kind="mergesort")

        # 0 1 1 1
        exp_li = np.array(
            [0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5,
             # 2 2 4
             6, 7, 8, 6, 7, 8, -1]
        )
        exp_ri = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls)
        tm.assert_numpy_array_equal(rs, exp_rs)

    def test_cython_inner_join(self):
        # the right-only key 4 is dropped by the inner join
        left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp)
        right = np.array([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.intp)
        max_group = 5

        ls, rs = inner_join(left, right, max_group)

        exp_ls = left.argsort(kind="mergesort")
        exp_rs = right.argsort(kind="mergesort")

        exp_li = np.array([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8])
        exp_ri = np.array([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls)
        tm.assert_numpy_array_equal(rs, exp_rs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("readonly", [True, False])
def test_left_join_indexer_unique(readonly):
    """left_join_indexer_unique maps each left key to its right-side position."""
    unique_right = np.array([1, 2, 3, 4, 5], dtype=np.int64)
    left_keys = np.array([2, 2, 3, 4, 4], dtype=np.int64)
    if readonly:
        # GH#37312, GH#37264: must also accept readonly input arrays
        unique_right.setflags(write=False)
        left_keys.setflags(write=False)

    indexer = libjoin.left_join_indexer_unique(left_keys, unique_right)
    tm.assert_numpy_array_equal(
        indexer, np.array([1, 1, 2, 3, 3], dtype=np.intp)
    )
|
||||
|
||||
|
||||
def test_left_outer_join_bug():
    """Regression test: ``left_outer_join`` with ``sort=False`` must keep the
    left rows in their original order and emit -1 for unmatched rows."""
    left = np.array(
        [
            0, 1, 0, 1, 1, 2, 3, 1, 0, 2, 1, 2, 0, 1, 1, 2, 3, 2, 3, 2,
            1, 1, 3, 0, 3, 2, 3, 0, 0, 2, 3, 2, 0, 3, 1, 3, 0, 1, 3, 0,
            0, 1, 0, 3, 1, 0, 1, 0, 1, 1, 0, 2, 2, 2, 2, 2, 0, 3, 1, 2,
            0, 0, 3, 1, 3, 2, 2, 0, 1, 3, 0, 2, 3, 2, 3, 3, 2, 3, 3, 1,
            3, 2, 0, 0, 3, 1, 1, 1, 0, 2, 3, 3, 1, 2, 0, 3, 1, 2, 0, 2,
        ],
        dtype=np.intp,
    )

    right = np.array([3, 1], dtype=np.intp)
    max_groups = 4

    lidx, ridx = libjoin.left_outer_join(left, right, max_groups, sort=False)

    # Unsorted left-outer join: every left row maps to itself, and rows
    # with no right-side partner get -1.
    exp_lidx = np.arange(len(left), dtype=np.intp)
    exp_ridx = -np.ones(len(left), dtype=np.intp)

    # Only keys 1 and 3 have a partner on the right (positions 1 and 0).
    exp_ridx[left == 1] = 1
    exp_ridx[left == 3] = 0

    tm.assert_numpy_array_equal(lidx, exp_lidx)
    tm.assert_numpy_array_equal(ridx, exp_ridx)
|
||||
|
||||
|
||||
def test_inner_join_indexer():
    """inner_join_indexer on sorted int64 keys: intersection plus takers."""
    a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
    b = np.array([0, 3, 5, 7, 9], dtype=np.int64)

    index, ares, bres = libjoin.inner_join_indexer(a, b)

    # Only 3 and 5 occur on both sides.
    tm.assert_almost_equal(index, np.array([3, 5], dtype=np.int64))
    tm.assert_almost_equal(ares, np.array([2, 4], dtype=np.intp))
    tm.assert_almost_equal(bres, np.array([1, 2], dtype=np.intp))

    # Degenerate single-element inputs.
    a = np.array([5], dtype=np.int64)
    b = np.array([5], dtype=np.int64)

    index, ares, bres = libjoin.inner_join_indexer(a, b)
    tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64))
    tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.intp))
    tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.intp))
|
||||
|
||||
|
||||
def test_outer_join_indexer():
    """outer_join_indexer on sorted int64 keys: union plus takers with -1."""
    a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
    b = np.array([0, 3, 5, 7, 9], dtype=np.int64)

    index, ares, bres = libjoin.outer_join_indexer(a, b)

    # Sorted union of both key sets.
    tm.assert_almost_equal(
        index, np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64)
    )
    # -1 marks positions where a side does not supply the key.
    tm.assert_almost_equal(ares, np.array([-1, 0, 1, 2, 3, 4, -1, -1], dtype=np.intp))
    tm.assert_almost_equal(bres, np.array([0, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp))

    # Degenerate single-element inputs.
    a = np.array([5], dtype=np.int64)
    b = np.array([5], dtype=np.int64)

    index, ares, bres = libjoin.outer_join_indexer(a, b)
    tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64))
    tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.intp))
    tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.intp))
|
||||
|
||||
|
||||
def test_left_join_indexer():
    """left_join_indexer on sorted int64 keys: result index equals the left keys."""
    a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
    b = np.array([0, 3, 5, 7, 9], dtype=np.int64)

    index, ares, bres = libjoin.left_join_indexer(a, b)

    # Left join keeps every left key, in order.
    tm.assert_almost_equal(index, a)

    # Right-side taker is -1 where b lacks the key (only 3 and 5 match).
    tm.assert_almost_equal(ares, np.array([0, 1, 2, 3, 4], dtype=np.intp))
    tm.assert_almost_equal(bres, np.array([-1, -1, 1, -1, 2], dtype=np.intp))

    # Degenerate single-element inputs.
    a = np.array([5], dtype=np.int64)
    b = np.array([5], dtype=np.int64)

    index, ares, bres = libjoin.left_join_indexer(a, b)
    tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64))
    tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.intp))
    tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.intp))
|
||||
|
||||
|
||||
def test_left_join_indexer2():
    """left_join_indexer with duplicate keys on the right-hand input."""
    idx = np.array([1, 1, 2, 5], dtype=np.int64)
    idx2 = np.array([1, 2, 5, 7, 9], dtype=np.int64)

    res, lidx, ridx = libjoin.left_join_indexer(idx2, idx)

    # The duplicated key 1 fans out; 7 and 9 have no partner.
    tm.assert_almost_equal(res, np.array([1, 1, 2, 5, 7, 9], dtype=np.int64))
    tm.assert_almost_equal(lidx, np.array([0, 0, 1, 2, 3, 4], dtype=np.intp))
    tm.assert_almost_equal(ridx, np.array([0, 1, 2, 3, -1, -1], dtype=np.intp))
|
||||
|
||||
|
||||
def test_outer_join_indexer2():
    """outer_join_indexer with duplicate keys on the right-hand input."""
    idx = np.array([1, 1, 2, 5], dtype=np.int64)
    idx2 = np.array([1, 2, 5, 7, 9], dtype=np.int64)

    res, lidx, ridx = libjoin.outer_join_indexer(idx2, idx)

    # Since idx2 is a superset of idx's distinct keys, the outer result
    # matches the left-join result for the same inputs.
    tm.assert_almost_equal(res, np.array([1, 1, 2, 5, 7, 9], dtype=np.int64))
    tm.assert_almost_equal(lidx, np.array([0, 0, 1, 2, 3, 4], dtype=np.intp))
    tm.assert_almost_equal(ridx, np.array([0, 1, 2, 3, -1, -1], dtype=np.intp))
|
||||
|
||||
|
||||
def test_inner_join_indexer2():
    """inner_join_indexer with duplicate keys on the right-hand input."""
    idx = np.array([1, 1, 2, 5], dtype=np.int64)
    idx2 = np.array([1, 2, 5, 7, 9], dtype=np.int64)

    res, lidx, ridx = libjoin.inner_join_indexer(idx2, idx)

    # Keys 7 and 9 are dropped; the duplicated key 1 fans out.
    tm.assert_almost_equal(res, np.array([1, 1, 2, 5], dtype=np.int64))
    tm.assert_almost_equal(lidx, np.array([0, 0, 1, 2], dtype=np.intp))
    tm.assert_almost_equal(ridx, np.array([0, 1, 2, 3], dtype=np.intp))
|
# ---- diff-viewer residue: start of new 285-line file
#      venv/lib/python3.12/site-packages/pandas/tests/libs/test_lib.py ----
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs import (
|
||||
Timedelta,
|
||||
lib,
|
||||
writers as libwriters,
|
||||
)
|
||||
from pandas.compat import IS64
|
||||
|
||||
from pandas import Index
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestMisc:
    """Assorted tests for pandas._libs helpers (writers, lib)."""

    def test_max_len_string_array(self):
        # Length of the longest string in an object array; the NaN entry
        # does not contribute to the maximum.
        arr = a = np.array(["foo", "b", np.nan], dtype="object")
        assert libwriters.max_len_string_array(arr) == 3

        # unicode
        arr = a.astype("U").astype(object)
        assert libwriters.max_len_string_array(arr) == 3

        # bytes for python3
        arr = a.astype("S").astype(object)
        assert libwriters.max_len_string_array(arr) == 3

        # raises: a non-object dtype ("U") has no matching fused signature
        msg = "No matching signature found"
        with pytest.raises(TypeError, match=msg):
            libwriters.max_len_string_array(arr.astype("U"))

    def test_fast_unique_multiple_list_gen_sort(self):
        keys = [["p", "a"], ["n", "d"], ["a", "s"]]

        # sort=True: deduplicated union in sorted order
        gen = (key for key in keys)
        expected = np.array(["a", "d", "n", "p", "s"])
        out = lib.fast_unique_multiple_list_gen(gen, sort=True)
        tm.assert_numpy_array_equal(np.array(out), expected)

        # sort=False: deduplicated union in first-seen order
        gen = (key for key in keys)
        expected = np.array(["p", "a", "n", "d", "s"])
        out = lib.fast_unique_multiple_list_gen(gen, sort=False)
        tm.assert_numpy_array_equal(np.array(out), expected)

    def test_fast_multiget_timedelta_resos(self):
        # This will become relevant for test_constructor_dict_timedelta64_index
        # once Timedelta constructor preserves reso when passed a
        # np.timedelta64 object
        td = Timedelta(days=1)

        # Equal Timedeltas stored at different resolutions must produce
        # the same fast_multiget lookups (hash/eq consistency).
        mapping1 = {td: 1}
        mapping2 = {td.as_unit("s"): 1}

        oindex = Index([td * n for n in range(3)])._values.astype(object)

        expected = lib.fast_multiget(mapping1, oindex)
        result = lib.fast_multiget(mapping2, oindex)
        tm.assert_numpy_array_equal(result, expected)

        # case that can't be cast to td64ns
        td = Timedelta(np.timedelta64(146000, "D"))
        assert hash(td) == hash(td.as_unit("ms"))
        assert hash(td) == hash(td.as_unit("us"))
        mapping1 = {td: 1}
        mapping2 = {td.as_unit("ms"): 1}

        oindex = Index([td * n for n in range(3)])._values.astype(object)

        expected = lib.fast_multiget(mapping1, oindex)
        result = lib.fast_multiget(mapping2, oindex)
        tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
class TestIndexing:
    """Tests for lib.maybe_indices_to_slice / maybe_booleans_to_slice /
    get_reverse_indexer / is_range_indexer.

    maybe_indices_to_slice returns a ``slice`` when the indices form an
    arithmetic progression fitting within the target's length, otherwise
    it returns the indices unchanged; either form must index ``target``
    identically.
    """

    def test_maybe_indices_to_slice_left_edge(self):
        target = np.arange(100)

        # slice: an empty indexer converts to a slice
        indices = np.array([], dtype=np.intp)
        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))

        assert isinstance(maybe_slice, slice)
        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])

    @pytest.mark.parametrize("end", [1, 2, 5, 20, 99])
    @pytest.mark.parametrize("step", [1, 2, 4])
    def test_maybe_indices_to_slice_left_edge_not_slice_end_steps(self, end, step):
        # Evenly stepped indices anchored at 0 convert to a slice.
        target = np.arange(100)
        indices = np.arange(0, end, step, dtype=np.intp)
        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))

        assert isinstance(maybe_slice, slice)
        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])

        # reverse: the same progression descending also converts
        indices = indices[::-1]
        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))

        assert isinstance(maybe_slice, slice)
        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])

    @pytest.mark.parametrize(
        "case", [[2, 1, 2, 0], [2, 2, 1, 0], [0, 1, 2, 1], [-2, 0, 2], [2, 0, -2]]
    )
    def test_maybe_indices_to_slice_left_edge_not_slice(self, case):
        # not slice: repeated / non-monotone / sign-changing indices must
        # be returned unchanged
        target = np.arange(100)
        indices = np.array(case, dtype=np.intp)
        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))

        assert not isinstance(maybe_slice, slice)
        tm.assert_numpy_array_equal(maybe_slice, indices)
        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])

    @pytest.mark.parametrize("start", [0, 2, 5, 20, 97, 98])
    @pytest.mark.parametrize("step", [1, 2, 4])
    def test_maybe_indices_to_slice_right_edge(self, start, step):
        target = np.arange(100)

        # slice: stepped indices ending near the right edge
        indices = np.arange(start, 99, step, dtype=np.intp)
        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))

        assert isinstance(maybe_slice, slice)
        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])

        # reverse
        indices = indices[::-1]
        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))

        assert isinstance(maybe_slice, slice)
        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])

    def test_maybe_indices_to_slice_right_edge_not_slice(self):
        # not slice: index 100 is out of bounds for len-100 target, so no
        # slice conversion happens and indexing raises either way
        target = np.arange(100)
        indices = np.array([97, 98, 99, 100], dtype=np.intp)
        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))

        assert not isinstance(maybe_slice, slice)
        tm.assert_numpy_array_equal(maybe_slice, indices)

        msg = "index 100 is out of bounds for axis (0|1) with size 100"

        with pytest.raises(IndexError, match=msg):
            target[indices]
        with pytest.raises(IndexError, match=msg):
            target[maybe_slice]

        # same out-of-bounds value, descending order
        indices = np.array([100, 99, 98, 97], dtype=np.intp)
        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))

        assert not isinstance(maybe_slice, slice)
        tm.assert_numpy_array_equal(maybe_slice, indices)

        with pytest.raises(IndexError, match=msg):
            target[indices]
        with pytest.raises(IndexError, match=msg):
            target[maybe_slice]

    @pytest.mark.parametrize(
        "case", [[99, 97, 99, 96], [99, 99, 98, 97], [98, 98, 97, 96]]
    )
    def test_maybe_indices_to_slice_right_edge_cases(self, case):
        # non-monotone / repeated indices near the right edge: no slice
        target = np.arange(100)
        indices = np.array(case, dtype=np.intp)
        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))

        assert not isinstance(maybe_slice, slice)
        tm.assert_numpy_array_equal(maybe_slice, indices)
        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])

    @pytest.mark.parametrize("step", [1, 2, 4, 5, 8, 9])
    def test_maybe_indices_to_slice_both_edges(self, step):
        target = np.arange(10)

        # slice: progression spanning the whole (small) target
        indices = np.arange(0, 9, step, dtype=np.intp)
        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
        assert isinstance(maybe_slice, slice)
        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])

        # reverse
        indices = indices[::-1]
        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
        assert isinstance(maybe_slice, slice)
        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])

    @pytest.mark.parametrize("case", [[4, 2, 0, -2], [2, 2, 1, 0], [0, 1, 2, 1]])
    def test_maybe_indices_to_slice_both_edges_not_slice(self, case):
        # not slice: negative entries / repeats / non-monotone order
        target = np.arange(10)
        indices = np.array(case, dtype=np.intp)
        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
        assert not isinstance(maybe_slice, slice)
        tm.assert_numpy_array_equal(maybe_slice, indices)
        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])

    @pytest.mark.parametrize("start, end", [(2, 10), (5, 25), (65, 97)])
    @pytest.mark.parametrize("step", [1, 2, 4, 20])
    def test_maybe_indices_to_slice_middle(self, start, end, step):
        target = np.arange(100)

        # slice: progression strictly inside the target
        indices = np.arange(start, end, step, dtype=np.intp)
        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))

        assert isinstance(maybe_slice, slice)
        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])

        # reverse
        indices = indices[::-1]
        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))

        assert isinstance(maybe_slice, slice)
        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])

    @pytest.mark.parametrize(
        "case", [[14, 12, 10, 12], [12, 12, 11, 10], [10, 11, 12, 11]]
    )
    def test_maybe_indices_to_slice_middle_not_slice(self, case):
        # not slice: non-monotone / repeated interior indices
        target = np.arange(100)
        indices = np.array(case, dtype=np.intp)
        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))

        assert not isinstance(maybe_slice, slice)
        tm.assert_numpy_array_equal(maybe_slice, indices)
        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])

    def test_maybe_booleans_to_slice(self):
        # A mixed 0/1 mask stays a boolean array...
        arr = np.array([0, 0, 1, 1, 1, 0, 1], dtype=np.uint8)
        result = lib.maybe_booleans_to_slice(arr)
        assert result.dtype == np.bool_

        # ...but an empty mask collapses to the empty slice.
        result = lib.maybe_booleans_to_slice(arr[:0])
        assert result == slice(0, 0)

    def test_get_reverse_indexer(self):
        # For each label 0..4, the position of its LAST occurrence in
        # `indexer` (-1 entries are skipped).
        indexer = np.array([-1, -1, 1, 2, 0, -1, 3, 4], dtype=np.intp)
        result = lib.get_reverse_indexer(indexer, 5)
        expected = np.array([4, 2, 3, 6, 7], dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)

    @pytest.mark.parametrize("dtype", ["int64", "int32"])
    def test_is_range_indexer(self, dtype):
        # GH#50592: arange(n) is the identity indexer for length n
        left = np.arange(0, 100, dtype=dtype)
        assert lib.is_range_indexer(left, 100)

    @pytest.mark.skipif(
        not IS64,
        reason="2**31 is too big for Py_ssize_t on 32-bit. "
        "It doesn't matter though since you cannot create an array that long on 32-bit",
    )
    @pytest.mark.parametrize("dtype", ["int64", "int32"])
    def test_is_range_indexer_big_n(self, dtype):
        # GH53616: n larger than the array length must not overflow
        left = np.arange(0, 100, dtype=dtype)

        assert not lib.is_range_indexer(left, 2**31)

    @pytest.mark.parametrize("dtype", ["int64", "int32"])
    def test_is_range_indexer_not_equal(self, dtype):
        # GH#50592: values differ from range(2)
        left = np.array([1, 2], dtype=dtype)
        assert not lib.is_range_indexer(left, 2)

    @pytest.mark.parametrize("dtype", ["int64", "int32"])
    def test_is_range_indexer_not_equal_shape(self, dtype):
        # GH#50592: length differs from n even though values are a prefix
        left = np.array([0, 1, 2], dtype=dtype)
        assert not lib.is_range_indexer(left, 2)
|
||||
|
||||
|
||||
def test_cache_readonly_preserve_docstrings():
    """GH18197: the cache_readonly descriptor must keep the wrapped
    property's docstring instead of discarding it."""
    assert Index.hasnans.__doc__ is not None
|
||||
|
||||
|
||||
def test_no_default_pickle():
    """GH#40397: pickling the no_default sentinel must round-trip to the
    very same singleton, so identity (`is`) checks keep working."""
    unpickled = tm.round_trip_pickle(lib.no_default)
    assert unpickled is lib.no_default
|
# ---- diff-viewer residue: start of next new 162-line file hunk ----
|
||||
from datetime import datetime
|
||||
from itertools import permutations
|
||||
|
||||
import numpy as np
|
||||
|
||||
from pandas._libs import algos as libalgos
|
||||
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_ensure_platform_int():
    """An intp array is already platform-int, so ensure_platform_int must
    return the same object rather than a copy."""
    arr = np.arange(100, dtype=np.intp)

    out = libalgos.ensure_platform_int(arr)
    assert out is arr
|
||||
|
||||
|
||||
def test_is_lexsorted():
    """A key pair whose primary key is descending is not lexsorted."""
    keys = [
        np.array(
            ([3] * 32) + ([2] * 32) + ([1] * 32) + ([0] * 32),
            dtype="int64",
        ),
        np.array(
            list(range(31))[::-1] * 4,
            dtype="int64",
        ),
    ]

    assert not libalgos.is_lexsorted(keys)
|
||||
|
||||
|
||||
def test_groupsort_indexer():
    """groupsort_indexer must agree with numpy's stable sorts.

    Note: np.argsort/np.lexsort may return a platform int dtype while
    groupsort_indexer always returns intp, hence the astype calls.
    """
    a = np.random.default_rng(2).integers(0, 1000, 100).astype(np.intp)
    b = np.random.default_rng(2).integers(0, 1000, 100).astype(np.intp)

    # Single key: must match a stable (mergesort) argsort.
    got = libalgos.groupsort_indexer(a, 1000)[0]
    want = np.argsort(a, kind="mergesort").astype(np.intp)
    tm.assert_numpy_array_equal(got, want)

    # Composite key: must match np.lexsort over (b, a).
    composite = a * 1000 + b
    got = libalgos.groupsort_indexer(composite, 1000000)[0]
    want = np.lexsort((b, a)).astype(np.intp)
    tm.assert_numpy_array_equal(got, want)
|
||||
|
||||
|
||||
class TestPadBackfill:
    """Tests for the fill-indexer kernels pad (forward fill) and
    backfill (backward fill)."""

    def test_backfill(self):
        old = np.array([1, 5, 10], dtype=np.int64)
        new = np.array(list(range(12)), dtype=np.int64)

        filler = libalgos.backfill["int64_t"](old, new)

        # Each new label maps to the position of the next old label >= it;
        # labels past the last old value (11 > 10) get -1.
        expect_filler = np.array([0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(filler, expect_filler)

        # corner case: every new label lies past the old range -> all -1
        old = np.array([1, 4], dtype=np.int64)
        new = np.array(list(range(5, 10)), dtype=np.int64)
        filler = libalgos.backfill["int64_t"](old, new)

        expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(filler, expect_filler)

    def test_pad(self):
        old = np.array([1, 5, 10], dtype=np.int64)
        new = np.array(list(range(12)), dtype=np.int64)

        filler = libalgos.pad["int64_t"](old, new)

        # Each new label maps to the position of the last old label <= it;
        # labels before the first old value (0 < 1) get -1.
        expect_filler = np.array([-1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2], dtype=np.intp)
        tm.assert_numpy_array_equal(filler, expect_filler)

        # corner case: every new label precedes the old range -> all -1
        old = np.array([5, 10], dtype=np.int64)
        new = np.arange(5, dtype=np.int64)
        filler = libalgos.pad["int64_t"](old, new)
        expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(filler, expect_filler)

    def test_pad_backfill_object_segfault(self):
        # Regression test: empty object-dtype inputs must not crash the
        # object specialization (historic segfault), in either argument
        # position and for both kernels.
        old = np.array([], dtype="O")
        new = np.array([datetime(2010, 12, 31)], dtype="O")

        result = libalgos.pad["object"](old, new)
        expected = np.array([-1], dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)

        result = libalgos.pad["object"](new, old)
        expected = np.array([], dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)

        result = libalgos.backfill["object"](old, new)
        expected = np.array([-1], dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)

        result = libalgos.backfill["object"](new, old)
        expected = np.array([], dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
class TestInfinity:
    """Tests for the Infinity / NegInfinity sentinel comparison objects."""

    def test_infinity_sort(self):
        # GH#13445
        # numpy's argsort can be unhappy if something is less than
        # itself. Instead, let's give our infinities a self-consistent
        # ordering, but outside the float extended real line.

        Inf = libalgos.Infinity()
        NegInf = libalgos.NegInfinity()

        ref_nums = [NegInf, float("-inf"), -1e100, 0, 1e100, float("inf"), Inf]

        # Infinity compares >= everything, strictly > everything but itself.
        assert all(Inf >= x for x in ref_nums)
        assert all(Inf > x or x is Inf for x in ref_nums)
        assert Inf >= Inf and Inf == Inf
        assert not Inf < Inf and not Inf > Inf
        # Distinct instances still compare equal.
        assert libalgos.Infinity() == libalgos.Infinity()
        assert not libalgos.Infinity() != libalgos.Infinity()

        # NegInfinity compares <= everything, strictly < everything but itself.
        assert all(NegInf <= x for x in ref_nums)
        assert all(NegInf < x or x is NegInf for x in ref_nums)
        assert NegInf <= NegInf and NegInf == NegInf
        assert not NegInf < NegInf and not NegInf > NegInf
        assert libalgos.NegInfinity() == libalgos.NegInfinity()
        assert not libalgos.NegInfinity() != libalgos.NegInfinity()

        # Every permutation must sort back to the reference ordering.
        for perm in permutations(ref_nums):
            assert sorted(perm) == ref_nums

        # smoke tests: argsort over all-equal sentinels must not blow up
        np.array([libalgos.Infinity()] * 32).argsort()
        np.array([libalgos.NegInfinity()] * 32).argsort()

    def test_infinity_against_nan(self):
        # Mirroring float NaN semantics: every ordering/equality comparison
        # against NaN is False; only != is True.
        Inf = libalgos.Infinity()
        NegInf = libalgos.NegInfinity()

        assert not Inf > np.nan
        assert not Inf >= np.nan
        assert not Inf < np.nan
        assert not Inf <= np.nan
        assert not Inf == np.nan
        assert Inf != np.nan

        assert not NegInf > np.nan
        assert not NegInf >= np.nan
        assert not NegInf < np.nan
        assert not NegInf <= np.nan
        assert not NegInf == np.nan
        assert NegInf != np.nan
|
# ---- diff-viewer residue: end of scraped commit page ----