This commit is contained in:
2024-12-04 13:35:57 +05:00
parent d346bf4b2a
commit 73ce681a55
7059 changed files with 1196501 additions and 0 deletions

View File

@@ -0,0 +1,253 @@
import string
import numpy as np
import pytest
import pandas as pd
from pandas import SparseDtype
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray
class TestSeriesAccessor:
    """Tests for the ``.sparse`` accessor exposed on ``pd.Series``."""

    def test_to_dense(self):
        """``to_dense`` on a Sparse[int64] Series equals the plain Series."""
        raw = [0, 1, 0, 10]
        sparse_ser = pd.Series(raw, dtype="Sparse[int64]")
        dense = sparse_ser.sparse.to_dense()
        tm.assert_series_equal(dense, pd.Series(raw))

    @pytest.mark.parametrize("attr", ["npoints", "density", "fill_value", "sp_values"])
    def test_get_attributes(self, attr):
        """Each accessor attribute equals the same attribute on the array."""
        sparse_values = SparseArray([0, 1])
        accessor = pd.Series(sparse_values).sparse
        assert getattr(accessor, attr) == getattr(sparse_values, attr)

    def test_from_coo(self):
        """``Series.sparse.from_coo`` builds a MultiIndexed sparse Series."""
        scipy_sparse = pytest.importorskip("scipy.sparse")

        rows = [0, 3, 1, 0]
        cols = [0, 3, 1, 2]
        entries = [4, 5, 7, 9]
        coo = scipy_sparse.coo_matrix((entries, (rows, cols)))
        result = pd.Series.sparse.from_coo(coo)

        # Expected entries are listed in (row, col) order.
        row_level = np.array([0, 0, 1, 3], dtype=np.int32)
        col_level = np.array([0, 2, 1, 3], dtype=np.int32)
        expected = pd.Series(
            [4, 9, 7, 5],
            index=pd.MultiIndex.from_arrays([row_level, col_level]),
            dtype="Sparse[int]",
        )
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "sort_labels, expected_rows, expected_cols, expected_values_pos",
        [
            (
                False,
                [("b", 2), ("a", 2), ("b", 1), ("a", 1)],
                [("z", 1), ("z", 2), ("x", 2), ("z", 0)],
                {1: (1, 0), 3: (3, 3)},
            ),
            (
                True,
                [("a", 1), ("a", 2), ("b", 1), ("b", 2)],
                [("x", 2), ("z", 0), ("z", 1), ("z", 2)],
                {1: (1, 2), 3: (0, 1)},
            ),
        ],
    )
    def test_to_coo(
        self, sort_labels, expected_rows, expected_cols, expected_values_pos
    ):
        """``to_coo`` returns the matrix together with row and column labels.

        ``expected_values_pos`` maps each non-zero value to the ``(row, col)``
        cell it should occupy in the 4x4 result.
        """
        sp_sparse = pytest.importorskip("scipy.sparse")

        labels = [
            ("b", 2, "z", 1),
            ("a", 2, "z", 2),
            ("a", 2, "z", 1),
            ("a", 2, "x", 2),
            ("b", 1, "z", 1),
            ("a", 1, "z", 0),
        ]
        series = pd.Series(
            SparseArray([0, np.nan, 1, 0, None, 3], fill_value=0),
            index=pd.MultiIndex.from_tuples(labels),
        )

        # Dense reference matrix filled in from the value -> position map.
        dense_expected = np.zeros((4, 4))
        for value, position in expected_values_pos.items():
            dense_expected[position] = value

        matrix, row_labels, col_labels = series.sparse.to_coo(
            row_levels=(0, 1), column_levels=(2, 3), sort_labels=sort_labels
        )

        assert isinstance(matrix, sp_sparse.coo_matrix)
        tm.assert_numpy_array_equal(matrix.toarray(), dense_expected)
        assert row_labels == expected_rows
        assert col_labels == expected_cols

    def test_non_sparse_raises(self):
        """Using ``.sparse`` on a non-sparse Series raises AttributeError."""
        dense_ser = pd.Series([1, 2, 3])
        with pytest.raises(AttributeError, match=".sparse"):
            dense_ser.sparse.density
class TestFrameAccessor:
    """Tests for the ``.sparse`` accessor exposed on ``pd.DataFrame``."""

    def test_accessor_raises(self):
        """A frame built from plain ints raises on ``.sparse`` access."""
        dense_frame = pd.DataFrame({"A": [0, 1]})
        with pytest.raises(AttributeError, match="sparse"):
            dense_frame.sparse

    @pytest.mark.parametrize("format", ["csc", "csr", "coo"])
    @pytest.mark.parametrize("labels", [None, list(string.ascii_letters[:10])])
    @pytest.mark.parametrize("dtype", ["float64", "int64"])
    def test_from_spmatrix(self, format, labels, dtype):
        """``from_spmatrix`` of a 10x10 identity matches the dense frame."""
        sp_sparse = pytest.importorskip("scipy.sparse")

        # Fill value is the zero of the requested dtype, as a Python scalar.
        zero = np.array(0, dtype=dtype).item()
        sparse_dtype = SparseDtype(dtype, zero)

        identity = sp_sparse.eye(10, format=format, dtype=dtype)
        result = pd.DataFrame.sparse.from_spmatrix(
            identity, index=labels, columns=labels
        )

        dense_frame = pd.DataFrame(
            np.eye(10, dtype=dtype), index=labels, columns=labels
        )
        tm.assert_frame_equal(result, dense_frame.astype(sparse_dtype))

    @pytest.mark.parametrize("format", ["csc", "csr", "coo"])
    def test_from_spmatrix_including_explicit_zero(self, format):
        """A stored entry overwritten with 0 still round-trips correctly."""
        sp_sparse = pytest.importorskip("scipy.sparse")

        matrix = sp_sparse.random(10, 2, density=0.5, format=format)
        matrix.data[0] = 0  # stored entry whose value is zero
        result = pd.DataFrame.sparse.from_spmatrix(matrix)

        sparse_dtype = SparseDtype("float64", 0.0)
        expected = pd.DataFrame(matrix.todense()).astype(sparse_dtype)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "columns",
        [["a", "b"], pd.MultiIndex.from_product([["A"], ["a", "b"]]), ["a", "a"]],
    )
    def test_from_spmatrix_columns(self, columns):
        """``from_spmatrix`` applies the supplied column labels."""
        sp_sparse = pytest.importorskip("scipy.sparse")

        sparse_dtype = SparseDtype("float64", 0.0)
        matrix = sp_sparse.random(10, 2, density=0.5)
        result = pd.DataFrame.sparse.from_spmatrix(matrix, columns=columns)

        dense_frame = pd.DataFrame(matrix.toarray(), columns=columns)
        tm.assert_frame_equal(result, dense_frame.astype(sparse_dtype))

    @pytest.mark.parametrize(
        "colnames", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)]
    )
    def test_to_coo(self, colnames):
        """``DataFrame.sparse.to_coo`` matches a COO matrix of the values."""
        sp_sparse = pytest.importorskip("scipy.sparse")

        first, second = colnames
        frame = pd.DataFrame(
            {first: [0, 1, 0], second: [1, 0, 0]}, dtype="Sparse[int64, 0]"
        )
        result = frame.sparse.to_coo()
        expected = sp_sparse.coo_matrix(np.asarray(frame))

        # No differing entries means the two matrices agree everywhere.
        assert (result != expected).nnz == 0

    @pytest.mark.parametrize("fill_value", [1, np.nan])
    def test_to_coo_nonzero_fill_val_raises(self, fill_value):
        """``to_coo`` raises ValueError when the fill value is not 0."""
        pytest.importorskip("scipy")

        column_a = SparseArray(
            [fill_value, fill_value, fill_value, 2], fill_value=fill_value
        )
        column_b = SparseArray(
            [fill_value, 2, fill_value, fill_value], fill_value=fill_value
        )
        frame = pd.DataFrame({"A": column_a, "B": column_b})

        with pytest.raises(ValueError, match="fill value must be 0"):
            frame.sparse.to_coo()

    def test_to_coo_midx_categorical(self):
        """``Series.sparse.to_coo`` with categorical MultiIndex levels."""
        # GH#50996
        sp_sparse = pytest.importorskip("scipy.sparse")

        x_level = pd.CategoricalIndex(list("ab"), name="x")
        y_level = pd.CategoricalIndex([0, 1], name="y")
        midx = pd.MultiIndex.from_arrays([x_level, y_level])

        series = pd.Series(1, index=midx, dtype="Sparse[int]")
        result = series.sparse.to_coo(row_levels=["x"], column_levels=["y"])[0]

        data = np.array([1, 1])
        coords = (np.array([0, 1]), np.array([0, 1]))
        expected = sp_sparse.coo_matrix((data, coords), shape=(2, 2))
        assert (result != expected).nnz == 0

    def test_to_dense(self):
        """``to_dense`` converts every sparse column to its dense form."""
        labels = ["b", "a"]
        sparse_columns = {
            "A": SparseArray([1, 0], dtype=SparseDtype("int64", 0)),
            "B": SparseArray([1, 0], dtype=SparseDtype("int64", 1)),
            "C": SparseArray([1.0, 0.0], dtype=SparseDtype("float64", 0.0)),
        }
        frame = pd.DataFrame(sparse_columns, index=labels)

        result = frame.sparse.to_dense()
        expected = pd.DataFrame(
            {"A": [1, 0], "B": [1, 0], "C": [1.0, 0.0]}, index=labels
        )
        tm.assert_frame_equal(result, expected)

    def test_density(self):
        """Frame density for two columns with three non-fill values each."""
        frame = pd.DataFrame(
            {
                "A": SparseArray([1, 0, 2, 1], fill_value=0),
                "B": SparseArray([0, 1, 1, 1], fill_value=0),
            }
        )
        assert frame.sparse.density == 0.75

    @pytest.mark.parametrize("dtype", ["int64", "float64"])
    @pytest.mark.parametrize("dense_index", [True, False])
    def test_series_from_coo(self, dtype, dense_index):
        """``Series.sparse.from_coo`` of a 3x3 identity, with/without dense index."""
        sp_sparse = pytest.importorskip("scipy.sparse")

        identity = sp_sparse.eye(3, format="coo", dtype=dtype)
        result = pd.Series.sparse.from_coo(identity, dense_index=dense_index)

        # Diagonal positions (0, 0), (1, 1), (2, 2) as int32 pairs.
        diagonal = [np.array([i, i], dtype=np.int32) for i in range(3)]
        index = pd.MultiIndex.from_tuples(diagonal)
        expected = pd.Series(
            SparseArray(np.array([1, 1, 1], dtype=dtype)), index=index
        )
        if dense_index:
            full_index = pd.MultiIndex.from_product(index.levels)
            expected = expected.reindex(full_index)

        tm.assert_series_equal(result, expected)

    def test_series_from_coo_incorrect_format_raises(self):
        """Passing a CSR matrix to ``from_coo`` raises TypeError."""
        # gh-26554
        sp_sparse = pytest.importorskip("scipy.sparse")

        csr = sp_sparse.csr_matrix(np.array([[0, 1], [0, 0]]))
        with pytest.raises(
            TypeError, match="Expected coo_matrix. Got csr_matrix instead."
        ):
            pd.Series.sparse.from_coo(csr)

    def test_with_column_named_sparse(self):
        """A column named "sparse" does not shadow the accessor."""
        # https://github.com/pandas-dev/pandas/issues/30758
        frame = pd.DataFrame({"sparse": pd.arrays.SparseArray([1, 2])})
        accessor_type = pd.core.arrays.sparse.accessor.SparseFrameAccessor
        assert isinstance(frame.sparse, accessor_type)

View File

@@ -0,0 +1,514 @@
import operator
import numpy as np
import pytest
import pandas as pd
from pandas import SparseDtype
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray
@pytest.fixture(params=["integer", "block"])
def kind(request):
    """Fixture yielding each value used for SparseArray's ``kind`` kwarg."""
    selected = request.param
    return selected
@pytest.fixture(params=[True, False])
def mix(request):
    """
    Fixture returning True or False.

    True selects op(sparse, dense); False selects op(sparse, sparse).
    """
    use_dense_operand = request.param
    return use_dense_operand
class TestSparseArrayArithmetics:
def _assert(self, a, b):
# We have to use tm.assert_sp_array_equal. See GH #45126
tm.assert_numpy_array_equal(a, b)
def _check_numeric_ops(self, a, b, a_dense, b_dense, mix: bool, op):
# Check that arithmetic behavior matches non-Sparse Series arithmetic
if isinstance(a_dense, np.ndarray):
expected = op(pd.Series(a_dense), b_dense).values
elif isinstance(b_dense, np.ndarray):
expected = op(a_dense, pd.Series(b_dense)).values
else:
raise NotImplementedError
with np.errstate(invalid="ignore", divide="ignore"):
if mix:
result = op(a, b_dense).to_dense()
else:
result = op(a, b).to_dense()
self._assert(result, expected)
def _check_bool_result(self, res):
assert isinstance(res, SparseArray)
assert isinstance(res.dtype, SparseDtype)
assert res.dtype.subtype == np.bool_
assert isinstance(res.fill_value, bool)
def _check_comparison_ops(self, a, b, a_dense, b_dense):
with np.errstate(invalid="ignore"):
# Unfortunately, trying to wrap the computation of each expected
# value is with np.errstate() is too tedious.
#
# sparse & sparse
self._check_bool_result(a == b)
self._assert((a == b).to_dense(), a_dense == b_dense)
self._check_bool_result(a != b)
self._assert((a != b).to_dense(), a_dense != b_dense)
self._check_bool_result(a >= b)
self._assert((a >= b).to_dense(), a_dense >= b_dense)
self._check_bool_result(a <= b)
self._assert((a <= b).to_dense(), a_dense <= b_dense)
self._check_bool_result(a > b)
self._assert((a > b).to_dense(), a_dense > b_dense)
self._check_bool_result(a < b)
self._assert((a < b).to_dense(), a_dense < b_dense)
# sparse & dense
self._check_bool_result(a == b_dense)
self._assert((a == b_dense).to_dense(), a_dense == b_dense)
self._check_bool_result(a != b_dense)
self._assert((a != b_dense).to_dense(), a_dense != b_dense)
self._check_bool_result(a >= b_dense)
self._assert((a >= b_dense).to_dense(), a_dense >= b_dense)
self._check_bool_result(a <= b_dense)
self._assert((a <= b_dense).to_dense(), a_dense <= b_dense)
self._check_bool_result(a > b_dense)
self._assert((a > b_dense).to_dense(), a_dense > b_dense)
self._check_bool_result(a < b_dense)
self._assert((a < b_dense).to_dense(), a_dense < b_dense)
def _check_logical_ops(self, a, b, a_dense, b_dense):
# sparse & sparse
self._check_bool_result(a & b)
self._assert((a & b).to_dense(), a_dense & b_dense)
self._check_bool_result(a | b)
self._assert((a | b).to_dense(), a_dense | b_dense)
# sparse & dense
self._check_bool_result(a & b_dense)
self._assert((a & b_dense).to_dense(), a_dense & b_dense)
self._check_bool_result(a | b_dense)
self._assert((a | b_dense).to_dense(), a_dense | b_dense)
@pytest.mark.parametrize("scalar", [0, 1, 3])
@pytest.mark.parametrize("fill_value", [None, 0, 2])
def test_float_scalar(
self, kind, mix, all_arithmetic_functions, fill_value, scalar, request
):
op = all_arithmetic_functions
values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
a = SparseArray(values, kind=kind, fill_value=fill_value)
self._check_numeric_ops(a, scalar, values, scalar, mix, op)
def test_float_scalar_comparison(self, kind):
values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
a = SparseArray(values, kind=kind)
self._check_comparison_ops(a, 1, values, 1)
self._check_comparison_ops(a, 0, values, 0)
self._check_comparison_ops(a, 3, values, 3)
a = SparseArray(values, kind=kind, fill_value=0)
self._check_comparison_ops(a, 1, values, 1)
self._check_comparison_ops(a, 0, values, 0)
self._check_comparison_ops(a, 3, values, 3)
a = SparseArray(values, kind=kind, fill_value=2)
self._check_comparison_ops(a, 1, values, 1)
self._check_comparison_ops(a, 0, values, 0)
self._check_comparison_ops(a, 3, values, 3)
def test_float_same_index_without_nans(self, kind, mix, all_arithmetic_functions):
# when sp_index are the same
op = all_arithmetic_functions
values = np.array([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0])
rvalues = np.array([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0])
a = SparseArray(values, kind=kind, fill_value=0)
b = SparseArray(rvalues, kind=kind, fill_value=0)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_float_same_index_with_nans(
self, kind, mix, all_arithmetic_functions, request
):
# when sp_index are the same
op = all_arithmetic_functions
values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = np.array([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan])
a = SparseArray(values, kind=kind)
b = SparseArray(rvalues, kind=kind)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_float_same_index_comparison(self, kind):
# when sp_index are the same
values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = np.array([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan])
a = SparseArray(values, kind=kind)
b = SparseArray(rvalues, kind=kind)
self._check_comparison_ops(a, b, values, rvalues)
values = np.array([0.0, 1.0, 2.0, 6.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0])
rvalues = np.array([0.0, 2.0, 3.0, 4.0, 0.0, 0.0, 1.0, 3.0, 2.0, 0.0])
a = SparseArray(values, kind=kind, fill_value=0)
b = SparseArray(rvalues, kind=kind, fill_value=0)
self._check_comparison_ops(a, b, values, rvalues)
def test_float_array(self, kind, mix, all_arithmetic_functions):
op = all_arithmetic_functions
values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])
a = SparseArray(values, kind=kind)
b = SparseArray(rvalues, kind=kind)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op)
a = SparseArray(values, kind=kind, fill_value=0)
b = SparseArray(rvalues, kind=kind)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = SparseArray(values, kind=kind, fill_value=0)
b = SparseArray(rvalues, kind=kind, fill_value=0)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = SparseArray(values, kind=kind, fill_value=1)
b = SparseArray(rvalues, kind=kind, fill_value=2)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_float_array_different_kind(self, mix, all_arithmetic_functions):
op = all_arithmetic_functions
values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])
a = SparseArray(values, kind="integer")
b = SparseArray(rvalues, kind="block")
self._check_numeric_ops(a, b, values, rvalues, mix, op)
self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op)
a = SparseArray(values, kind="integer", fill_value=0)
b = SparseArray(rvalues, kind="block")
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = SparseArray(values, kind="integer", fill_value=0)
b = SparseArray(rvalues, kind="block", fill_value=0)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = SparseArray(values, kind="integer", fill_value=1)
b = SparseArray(rvalues, kind="block", fill_value=2)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_float_array_comparison(self, kind):
values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan])
a = SparseArray(values, kind=kind)
b = SparseArray(rvalues, kind=kind)
self._check_comparison_ops(a, b, values, rvalues)
self._check_comparison_ops(a, b * 0, values, rvalues * 0)
a = SparseArray(values, kind=kind, fill_value=0)
b = SparseArray(rvalues, kind=kind)
self._check_comparison_ops(a, b, values, rvalues)
a = SparseArray(values, kind=kind, fill_value=0)
b = SparseArray(rvalues, kind=kind, fill_value=0)
self._check_comparison_ops(a, b, values, rvalues)
a = SparseArray(values, kind=kind, fill_value=1)
b = SparseArray(rvalues, kind=kind, fill_value=2)
self._check_comparison_ops(a, b, values, rvalues)
def test_int_array(self, kind, mix, all_arithmetic_functions):
op = all_arithmetic_functions
# have to specify dtype explicitly until fixing GH 667
dtype = np.int64
values = np.array([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype)
rvalues = np.array([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype)
a = SparseArray(values, dtype=dtype, kind=kind)
assert a.dtype == SparseDtype(dtype)
b = SparseArray(rvalues, dtype=dtype, kind=kind)
assert b.dtype == SparseDtype(dtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op)
a = SparseArray(values, fill_value=0, dtype=dtype, kind=kind)
assert a.dtype == SparseDtype(dtype)
b = SparseArray(rvalues, dtype=dtype, kind=kind)
assert b.dtype == SparseDtype(dtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = SparseArray(values, fill_value=0, dtype=dtype, kind=kind)
assert a.dtype == SparseDtype(dtype)
b = SparseArray(rvalues, fill_value=0, dtype=dtype, kind=kind)
assert b.dtype == SparseDtype(dtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = SparseArray(values, fill_value=1, dtype=dtype, kind=kind)
assert a.dtype == SparseDtype(dtype, fill_value=1)
b = SparseArray(rvalues, fill_value=2, dtype=dtype, kind=kind)
assert b.dtype == SparseDtype(dtype, fill_value=2)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_int_array_comparison(self, kind):
dtype = "int64"
# int32 NI ATM
values = np.array([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype)
rvalues = np.array([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype)
a = SparseArray(values, dtype=dtype, kind=kind)
b = SparseArray(rvalues, dtype=dtype, kind=kind)
self._check_comparison_ops(a, b, values, rvalues)
self._check_comparison_ops(a, b * 0, values, rvalues * 0)
a = SparseArray(values, dtype=dtype, kind=kind, fill_value=0)
b = SparseArray(rvalues, dtype=dtype, kind=kind)
self._check_comparison_ops(a, b, values, rvalues)
a = SparseArray(values, dtype=dtype, kind=kind, fill_value=0)
b = SparseArray(rvalues, dtype=dtype, kind=kind, fill_value=0)
self._check_comparison_ops(a, b, values, rvalues)
a = SparseArray(values, dtype=dtype, kind=kind, fill_value=1)
b = SparseArray(rvalues, dtype=dtype, kind=kind, fill_value=2)
self._check_comparison_ops(a, b, values, rvalues)
@pytest.mark.parametrize("fill_value", [True, False, np.nan])
def test_bool_same_index(self, kind, fill_value):
# GH 14000
# when sp_index are the same
values = np.array([True, False, True, True], dtype=np.bool_)
rvalues = np.array([True, False, True, True], dtype=np.bool_)
a = SparseArray(values, kind=kind, dtype=np.bool_, fill_value=fill_value)
b = SparseArray(rvalues, kind=kind, dtype=np.bool_, fill_value=fill_value)
self._check_logical_ops(a, b, values, rvalues)
@pytest.mark.parametrize("fill_value", [True, False, np.nan])
def test_bool_array_logical(self, kind, fill_value):
# GH 14000
# when sp_index are the same
values = np.array([True, False, True, False, True, True], dtype=np.bool_)
rvalues = np.array([True, False, False, True, False, True], dtype=np.bool_)
a = SparseArray(values, kind=kind, dtype=np.bool_, fill_value=fill_value)
b = SparseArray(rvalues, kind=kind, dtype=np.bool_, fill_value=fill_value)
self._check_logical_ops(a, b, values, rvalues)
def test_mixed_array_float_int(self, kind, mix, all_arithmetic_functions, request):
op = all_arithmetic_functions
rdtype = "int64"
values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = np.array([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype)
a = SparseArray(values, kind=kind)
b = SparseArray(rvalues, kind=kind)
assert b.dtype == SparseDtype(rdtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
self._check_numeric_ops(a, b * 0, values, rvalues * 0, mix, op)
a = SparseArray(values, kind=kind, fill_value=0)
b = SparseArray(rvalues, kind=kind)
assert b.dtype == SparseDtype(rdtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = SparseArray(values, kind=kind, fill_value=0)
b = SparseArray(rvalues, kind=kind, fill_value=0)
assert b.dtype == SparseDtype(rdtype)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
a = SparseArray(values, kind=kind, fill_value=1)
b = SparseArray(rvalues, kind=kind, fill_value=2)
assert b.dtype == SparseDtype(rdtype, fill_value=2)
self._check_numeric_ops(a, b, values, rvalues, mix, op)
def test_mixed_array_comparison(self, kind):
rdtype = "int64"
# int32 NI ATM
values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan])
rvalues = np.array([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype)
a = SparseArray(values, kind=kind)
b = SparseArray(rvalues, kind=kind)
assert b.dtype == SparseDtype(rdtype)
self._check_comparison_ops(a, b, values, rvalues)
self._check_comparison_ops(a, b * 0, values, rvalues * 0)
a = SparseArray(values, kind=kind, fill_value=0)
b = SparseArray(rvalues, kind=kind)
assert b.dtype == SparseDtype(rdtype)
self._check_comparison_ops(a, b, values, rvalues)
a = SparseArray(values, kind=kind, fill_value=0)
b = SparseArray(rvalues, kind=kind, fill_value=0)
assert b.dtype == SparseDtype(rdtype)
self._check_comparison_ops(a, b, values, rvalues)
a = SparseArray(values, kind=kind, fill_value=1)
b = SparseArray(rvalues, kind=kind, fill_value=2)
assert b.dtype == SparseDtype(rdtype, fill_value=2)
self._check_comparison_ops(a, b, values, rvalues)
def test_xor(self):
s = SparseArray([True, True, False, False])
t = SparseArray([True, False, True, False])
result = s ^ t
sp_index = pd.core.arrays.sparse.IntIndex(4, np.array([0, 1, 2], dtype="int32"))
expected = SparseArray([False, True, True], sparse_index=sp_index)
tm.assert_sp_array_equal(result, expected)
@pytest.mark.parametrize("op", [operator.eq, operator.add])
def test_with_list(op):
    """A list operand gives the same result as the equivalent SparseArray."""
    sparse = SparseArray([0, 1], fill_value=0)
    from_list = op(sparse, [0, 1])
    from_sparse = op(sparse, SparseArray([0, 1]))
    tm.assert_sp_array_equal(from_list, from_sparse)
def test_with_dataframe():
    """``SparseArray.__add__`` returns NotImplemented for a DataFrame."""
    # GH#27910
    sparse = SparseArray([0, 1], fill_value=0)
    frame = pd.DataFrame([[1, 2], [3, 4]])
    assert sparse.__add__(frame) is NotImplemented
def test_with_zerodim_ndarray():
    """Multiplying by a zero-dim ndarray matches multiplying by the scalar."""
    # GH#27910
    sparse = SparseArray([0, 1], fill_value=0)
    by_zerodim = sparse * np.array(2)
    by_scalar = sparse * 2
    tm.assert_sp_array_equal(by_zerodim, by_scalar)
@pytest.mark.parametrize("ufunc", [np.abs, np.exp])
@pytest.mark.parametrize(
    "arr", [SparseArray([0, 0, -1, 1]), SparseArray([None, None, -1, 1])]
)
def test_ufuncs(ufunc, arr):
    """A unary ufunc is applied to both the values and the fill value."""
    actual = ufunc(arr)
    transformed_fill = ufunc(arr.fill_value)
    dense_result = ufunc(np.asarray(arr))
    expected = SparseArray(dense_result, fill_value=transformed_fill)
    tm.assert_sp_array_equal(actual, expected)
@pytest.mark.parametrize(
    "a, b",
    [
        # Only the two distinct operand pairs are listed; the original table
        # repeated the fill_value=1 pair four times, re-running one test.
        (SparseArray([0, 0, 0]), np.array([0, 1, 2])),
        (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])),
    ],
)
@pytest.mark.parametrize("ufunc", [np.add, np.greater])
def test_binary_ufuncs(ufunc, a, b):
    """A binary ufunc on (SparseArray, ndarray) matches the dense result.

    Parameters
    ----------
    ufunc : numpy binary ufunc under test.
    a : SparseArray used as the left operand.
    b : ndarray used as the right operand.
    """
    # can't say anything about fill value here.
    result = ufunc(a, b)
    expected = ufunc(np.asarray(a), np.asarray(b))
    assert isinstance(result, SparseArray)
    tm.assert_numpy_array_equal(np.asarray(result), expected)
def test_ndarray_inplace():
    """``ndarray += SparseArray`` updates the ndarray in place."""
    sparse = SparseArray([0, 2, 0, 0])
    target = np.array([0, 1, 2, 3])
    target += sparse
    tm.assert_numpy_array_equal(target, np.array([0, 3, 2, 3]))
def test_sparray_inplace():
    """``SparseArray += ndarray`` yields the summed SparseArray."""
    target = SparseArray([0, 2, 0, 0])
    addend = np.array([0, 1, 2, 3])
    target += addend
    tm.assert_sp_array_equal(target, SparseArray([0, 3, 2, 3], fill_value=0))
@pytest.mark.parametrize("cons", [list, np.array, SparseArray])
def test_mismatched_length_cmp_op(cons):
    """``&`` between operands of different length raises ValueError."""
    two_items = SparseArray([True, True])
    three_items = cons([True, True, True])
    expected_msg = "operands have mismatched length"
    with pytest.raises(ValueError, match=expected_msg):
        two_items & three_items
@pytest.mark.parametrize("op", ["add", "sub", "mul", "truediv", "floordiv", "pow"])
@pytest.mark.parametrize("fill_value", [np.nan, 3])
def test_binary_operators(op, fill_value):
    """Binary operators agree across sparse/sparse, sparse/dense and scalars.

    ``op`` is the name of a function in the ``operator`` module and
    ``fill_value`` is used both as the arrays' fill value and as the value
    written into every 2nd / 3rd slot of the inputs.
    """
    func = getattr(operator, op)

    # NOTE(review): both generators are seeded with 2, so the two arrays start
    # out identical and only differ in where fill_value is written below.
    left_data = np.random.default_rng(2).standard_normal(20)
    right_data = np.random.default_rng(2).standard_normal(20)
    left_data[::2] = fill_value
    right_data[::3] = fill_value

    first = SparseArray(left_data, fill_value=fill_value)
    second = SparseArray(right_data, fill_value=fill_value)

    with np.errstate(all="ignore"):
        # sparse (op) sparse, checked against the dense computation
        sparse_sparse = func(first, second)
        reference = SparseArray(
            func(first.to_dense(), second.to_dense()), fill_value=first.fill_value
        )
        assert isinstance(sparse_sparse, SparseArray)
        tm.assert_almost_equal(sparse_sparse.to_dense(), reference.to_dense())

        # sparse (op) dense
        sparse_dense = func(first, second.to_dense())
        assert isinstance(sparse_dense, SparseArray)
        tm.assert_sp_array_equal(sparse_sparse, sparse_dense)

        # dense (op) sparse
        dense_sparse = func(first.to_dense(), second)
        assert isinstance(dense_sparse, SparseArray)
        tm.assert_sp_array_equal(sparse_sparse, dense_sparse)

        # sparse (op) scalar
        with_scalar = func(first, 4)
        assert isinstance(with_scalar, SparseArray)

        # Ignore this if the actual op raises (e.g. pow).
        try:
            dense_scalar = func(first.to_dense(), 4)
            scalar_fill = func(first.fill_value, 4)
        except ValueError:
            pass
        else:
            tm.assert_almost_equal(with_scalar.fill_value, scalar_fill)
            tm.assert_almost_equal(with_scalar.to_dense(), dense_scalar)

View File

@@ -0,0 +1,480 @@
import re
import numpy as np
import pytest
from pandas._libs.sparse import IntIndex
import pandas as pd
from pandas import (
SparseDtype,
isna,
)
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray
@pytest.fixture
def arr_data():
    """Fixture returning numpy array with valid and missing entries"""
    nan = np.nan
    entries = [nan, nan, 1, 2, 3, nan, 4, 5, nan, 6]
    return np.array(entries)
@pytest.fixture
def arr(arr_data):
    """Fixture returning SparseArray from 'arr_data'"""
    sparse = SparseArray(arr_data)
    return sparse
@pytest.fixture
def zarr():
    """Fixture returning SparseArray with integer entries and 'fill_value=0'"""
    entries = [0, 0, 1, 2, 3, 0, 4, 5, 0, 6]
    return SparseArray(entries, fill_value=0)
class TestSparseArray:
@pytest.mark.parametrize("fill_value", [0, None, np.nan])
def test_shift_fill_value(self, fill_value):
# GH #24128
sparse = SparseArray(np.array([1, 0, 0, 3, 0]), fill_value=8.0)
res = sparse.shift(1, fill_value=fill_value)
if isna(fill_value):
fill_value = res.dtype.na_value
exp = SparseArray(np.array([fill_value, 1, 0, 0, 3]), fill_value=8.0)
tm.assert_sp_array_equal(res, exp)
def test_set_fill_value(self):
arr = SparseArray([1.0, np.nan, 2.0], fill_value=np.nan)
arr.fill_value = 2
assert arr.fill_value == 2
arr = SparseArray([1, 0, 2], fill_value=0, dtype=np.int64)
arr.fill_value = 2
assert arr.fill_value == 2
msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
arr.fill_value = 3.1
assert arr.fill_value == 3.1
arr.fill_value = np.nan
assert np.isnan(arr.fill_value)
arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool_)
arr.fill_value = True
assert arr.fill_value is True
with tm.assert_produces_warning(FutureWarning, match=msg):
arr.fill_value = 0
arr.fill_value = np.nan
assert np.isnan(arr.fill_value)
@pytest.mark.parametrize("val", [[1, 2, 3], np.array([1, 2]), (1, 2, 3)])
def test_set_fill_invalid_non_scalar(self, val):
arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool_)
msg = "fill_value must be a scalar"
with pytest.raises(ValueError, match=msg):
arr.fill_value = val
def test_copy(self, arr):
arr2 = arr.copy()
assert arr2.sp_values is not arr.sp_values
assert arr2.sp_index is arr.sp_index
def test_values_asarray(self, arr_data, arr):
tm.assert_almost_equal(arr.to_dense(), arr_data)
@pytest.mark.parametrize(
"data,shape,dtype",
[
([0, 0, 0, 0, 0], (5,), None),
([], (0,), None),
([0], (1,), None),
(["A", "A", np.nan, "B"], (4,), object),
],
)
def test_shape(self, data, shape, dtype):
# GH 21126
out = SparseArray(data, dtype=dtype)
assert out.shape == shape
@pytest.mark.parametrize(
"vals",
[
[np.nan, np.nan, np.nan, np.nan, np.nan],
[1, np.nan, np.nan, 3, np.nan],
[1, np.nan, 0, 3, 0],
],
)
@pytest.mark.parametrize("fill_value", [None, 0])
def test_dense_repr(self, vals, fill_value):
vals = np.array(vals)
arr = SparseArray(vals, fill_value=fill_value)
res = arr.to_dense()
tm.assert_numpy_array_equal(res, vals)
@pytest.mark.parametrize("fix", ["arr", "zarr"])
def test_pickle(self, fix, request):
obj = request.getfixturevalue(fix)
unpickled = tm.round_trip_pickle(obj)
tm.assert_sp_array_equal(unpickled, obj)
def test_generator_warnings(self):
sp_arr = SparseArray([1, 2, 3])
with tm.assert_produces_warning(None):
for _ in sp_arr:
pass
def test_where_retain_fill_value(self):
# GH#45691 don't lose fill_value on _where
arr = SparseArray([np.nan, 1.0], fill_value=0)
mask = np.array([True, False])
res = arr._where(~mask, 1)
exp = SparseArray([1, 1.0], fill_value=0)
tm.assert_sp_array_equal(res, exp)
ser = pd.Series(arr)
res = ser.where(~mask, 1)
tm.assert_series_equal(res, pd.Series(exp))
def test_fillna(self):
s = SparseArray([1, np.nan, np.nan, 3, np.nan])
res = s.fillna(-1)
exp = SparseArray([1, -1, -1, 3, -1], fill_value=-1, dtype=np.float64)
tm.assert_sp_array_equal(res, exp)
s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0)
res = s.fillna(-1)
exp = SparseArray([1, -1, -1, 3, -1], fill_value=0, dtype=np.float64)
tm.assert_sp_array_equal(res, exp)
s = SparseArray([1, np.nan, 0, 3, 0])
res = s.fillna(-1)
exp = SparseArray([1, -1, 0, 3, 0], fill_value=-1, dtype=np.float64)
tm.assert_sp_array_equal(res, exp)
s = SparseArray([1, np.nan, 0, 3, 0], fill_value=0)
res = s.fillna(-1)
exp = SparseArray([1, -1, 0, 3, 0], fill_value=0, dtype=np.float64)
tm.assert_sp_array_equal(res, exp)
s = SparseArray([np.nan, np.nan, np.nan, np.nan])
res = s.fillna(-1)
exp = SparseArray([-1, -1, -1, -1], fill_value=-1, dtype=np.float64)
tm.assert_sp_array_equal(res, exp)
s = SparseArray([np.nan, np.nan, np.nan, np.nan], fill_value=0)
res = s.fillna(-1)
exp = SparseArray([-1, -1, -1, -1], fill_value=0, dtype=np.float64)
tm.assert_sp_array_equal(res, exp)
# float dtype's fill_value is np.nan, replaced by -1
s = SparseArray([0.0, 0.0, 0.0, 0.0])
res = s.fillna(-1)
exp = SparseArray([0.0, 0.0, 0.0, 0.0], fill_value=-1)
tm.assert_sp_array_equal(res, exp)
# int dtype shouldn't have missing. No changes.
s = SparseArray([0, 0, 0, 0])
assert s.dtype == SparseDtype(np.int64)
assert s.fill_value == 0
res = s.fillna(-1)
tm.assert_sp_array_equal(res, s)
s = SparseArray([0, 0, 0, 0], fill_value=0)
assert s.dtype == SparseDtype(np.int64)
assert s.fill_value == 0
res = s.fillna(-1)
exp = SparseArray([0, 0, 0, 0], fill_value=0)
tm.assert_sp_array_equal(res, exp)
# fill_value can be nan if there is no missing hole.
# only fill_value will be changed
s = SparseArray([0, 0, 0, 0], fill_value=np.nan)
assert s.dtype == SparseDtype(np.int64, fill_value=np.nan)
assert np.isnan(s.fill_value)
res = s.fillna(-1)
exp = SparseArray([0, 0, 0, 0], fill_value=-1)
tm.assert_sp_array_equal(res, exp)
def test_fillna_overlap(self):
s = SparseArray([1, np.nan, np.nan, 3, np.nan])
# filling with existing value doesn't replace existing value with
# fill_value, i.e. existing 3 remains in sp_values
res = s.fillna(3)
exp = np.array([1, 3, 3, 3, 3], dtype=np.float64)
tm.assert_numpy_array_equal(res.to_dense(), exp)
s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0)
res = s.fillna(3)
exp = SparseArray([1, 3, 3, 3, 3], fill_value=0, dtype=np.float64)
tm.assert_sp_array_equal(res, exp)
def test_nonzero(self):
# Tests regression #21172.
sa = SparseArray([float("nan"), float("nan"), 1, 0, 0, 2, 0, 0, 0, 3, 0, 0])
expected = np.array([2, 5, 9], dtype=np.int32)
(result,) = sa.nonzero()
tm.assert_numpy_array_equal(expected, result)
sa = SparseArray([0, 0, 1, 0, 0, 2, 0, 0, 0, 3, 0, 0])
(result,) = sa.nonzero()
tm.assert_numpy_array_equal(expected, result)
class TestSparseArrayAnalytics:
    """Tests for cumsum, ufunc dispatch and size/density properties of SparseArray."""

    @pytest.mark.parametrize(
        "data,expected",
        [
            (
                np.array([1, 2, 3, 4, 5], dtype=float),  # non-null data
                SparseArray(np.array([1.0, 3.0, 6.0, 10.0, 15.0])),
            ),
            (
                np.array([1, 2, np.nan, 4, 5], dtype=float),  # null data
                SparseArray(np.array([1.0, 3.0, np.nan, 7.0, 12.0])),
            ),
        ],
    )
    @pytest.mark.parametrize("numpy", [True, False])
    def test_cumsum(self, data, expected, numpy):
        """Cumulative sum equals ``expected`` for each fill_value tried.

        ``numpy`` chooses between ``np.cumsum(arr)`` and ``arr.cumsum()``;
        each path then checks its own unsupported-argument errors.
        """
        if numpy:
            run_cumsum = np.cumsum
        else:

            def run_cumsum(sparse):
                return sparse.cumsum()

        # Same expectation for the default fill_value, NaN, and 2.
        for ctor_kwargs in ({}, {"fill_value": np.nan}, {"fill_value": 2}):
            out = run_cumsum(SparseArray(data, **ctor_kwargs))
            tm.assert_sp_array_equal(out, expected)

        if not numpy:
            axis = 1  # SparseArray currently 1-D, so only axis = 0 is valid.
            msg = re.escape(f"axis(={axis}) out of bounds")
            with pytest.raises(ValueError, match=msg):
                SparseArray(data).cumsum(axis=axis)
            return

        # numpy compatibility checks.
        msg = "the 'dtype' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.cumsum(SparseArray(data), dtype=np.int64)

        msg = "the 'out' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            # ``out`` is the last result computed in the loop above.
            np.cumsum(SparseArray(data), out=out)

    def test_ufunc(self):
        """``abs``/``np.abs`` and ``np.sin`` results match hand-built sparse arrays."""
        # GH 13853 make sure ufunc is applied to fill_value
        nan_input = SparseArray([1, np.nan, 2, np.nan, -2])
        fill_one = SparseArray([1, -1, 2, -2], fill_value=1)
        fill_neg_one = SparseArray([1, -1, 2, -2], fill_value=-1)
        abs_cases = [
            (nan_input, SparseArray([1, np.nan, 2, np.nan, 2])),
            (
                fill_one,
                # Reuse the operand's own sparse index for the expectation.
                SparseArray([1, 2, 2], sparse_index=fill_one.sp_index, fill_value=1),
            ),
            (fill_neg_one, SparseArray([1, 1, 2, 2], fill_value=1)),
        ]
        for operand, want in abs_cases:
            tm.assert_sp_array_equal(abs(operand), want)
            tm.assert_sp_array_equal(np.abs(operand), want)

        sin_cases = [
            (
                SparseArray([1, np.nan, 2, np.nan, -2]),
                SparseArray(np.sin([1, np.nan, 2, np.nan, -2])),
            ),
            (
                SparseArray([1, -1, 2, -2], fill_value=1),
                SparseArray(np.sin([1, -1, 2, -2]), fill_value=np.sin(1)),
            ),
            (
                SparseArray([1, -1, 0, -2], fill_value=0),
                SparseArray(np.sin([1, -1, 0, -2]), fill_value=np.sin(0)),
            ),
        ]
        for operand, want in sin_cases:
            tm.assert_sp_array_equal(np.sin(operand), want)

    def test_ufunc_args(self):
        """``np.add(sparse, 1)`` shifts both the values and the fill_value."""
        # GH 13853 make sure ufunc is applied to fill_value, including its arg
        cases = [
            (
                SparseArray([1, np.nan, 2, np.nan, -2]),
                SparseArray([2, np.nan, 3, np.nan, -1]),
            ),
            (
                SparseArray([1, -1, 2, -2], fill_value=1),
                SparseArray([2, 0, 3, -1], fill_value=2),
            ),
            (
                SparseArray([1, -1, 0, -2], fill_value=0),
                SparseArray([2, 0, 1, -1], fill_value=1),
            ),
        ]
        for operand, want in cases:
            tm.assert_sp_array_equal(np.add(operand, 1), want)

    @pytest.mark.parametrize("fill_value", [0.0, np.nan])
    def test_modf(self, fill_value):
        """Both outputs of ``np.modf`` on a sparse array match the dense result."""
        # https://github.com/pandas-dev/pandas/issues/26946
        sparse = SparseArray([fill_value] * 10 + [1.1, 2.2], fill_value=fill_value)
        sparse_parts = np.modf(sparse)
        dense_parts = np.modf(np.asarray(sparse))
        for got, dense in zip(sparse_parts, dense_parts):
            tm.assert_sp_array_equal(got, SparseArray(dense, fill_value=fill_value))

    def test_nbytes_integer(self):
        """``nbytes`` is 24 for two stored values with ``kind="integer"``."""
        sparse = SparseArray([1, 0, 0, 0, 2], kind="integer")
        # (2 * 8) + 2 * 4
        assert sparse.nbytes == 24

    def test_nbytes_block(self):
        """``nbytes`` is 24 for one two-element block with ``kind="block"``."""
        sparse = SparseArray([1, 2, 0, 0, 0], kind="block")
        # (2 * 8) + 4 + 4
        # sp_values, blocs, blengths
        assert sparse.nbytes == 24

    def test_asarray_datetime64(self):
        """Smoke test: ``np.asarray`` on a datetime64 sparse array does not raise."""
        stamps = pd.to_datetime(["2012", None, None, "2013"])
        np.asarray(SparseArray(stamps))

    def test_density(self):
        """One stored point out of two gives a density of 0.5."""
        density = SparseArray([0, 1]).density
        assert density == 0.5

    def test_npoints(self):
        """``[0, 1]`` stores exactly one point."""
        npoints = SparseArray([0, 1]).npoints
        assert npoints == 1
def test_setting_fill_value_fillna_still_works():
    """``isna`` flags only the NaN entry after ``fill_value`` is reassigned to NaN."""
    # This is why letting users update fill_value / dtype is bad
    # astype has the same problem.
    sparse = SparseArray([1.0, np.nan, 1.0], fill_value=0.0)
    sparse.fill_value = np.nan

    # Can't do direct comparison, since the sp_index will be different
    # So let's convert to ndarray and check there.
    na_mask = np.asarray(sparse.isna())
    tm.assert_numpy_array_equal(na_mask, np.array([False, True, False]))
def test_setting_fill_value_updates():
    """After setting ``fill_value`` to NaN the array equals a hand-built NaN-filled one."""
    sparse = SparseArray([0.0, np.nan], fill_value=0)
    sparse.fill_value = np.nan

    # use private constructor to get the index right
    # otherwise both nans would be un-stored.
    stored_values = np.array([np.nan])
    stored_index = IntIndex(2, [1])
    want = SparseArray._simple_new(
        sparse_array=stored_values,
        sparse_index=stored_index,
        dtype=SparseDtype(float, np.nan),
    )
    tm.assert_sp_array_equal(sparse, want)
@pytest.mark.parametrize(
    "arr,fill_value,loc",
    [
        ([None, 1, 2], None, 0),
        ([0, None, 2], None, 1),
        ([0, 1, None], None, 2),
        ([0, 1, 1, None, None], None, 3),
        ([1, 1, 1, 2], None, -1),
        ([], None, -1),
        ([None, 1, 0, 0, None, 2], None, 0),
        ([None, 1, 0, 0, None, 2], 1, 1),
        ([None, 1, 0, 0, None, 2], 2, 5),
        ([None, 1, 0, 0, None, 2], 3, -1),
        ([None, 0, 0, 1, 2, 1], 0, 1),
        ([None, 0, 0, 1, 2, 1], 1, 3),
    ],
)
def test_first_fill_value_loc(arr, fill_value, loc):
    """``_first_fill_value_loc`` returns ``loc`` (``-1`` in the no-match cases)."""
    sparse = SparseArray(arr, fill_value=fill_value)
    found = sparse._first_fill_value_loc()
    assert found == loc
@pytest.mark.parametrize(
    "arr",
    [
        [1, 2, np.nan, np.nan],
        [1, np.nan, 2, np.nan],
        [1, 2, np.nan],
        [np.nan, 1, 0, 0, np.nan, 2],
        [np.nan, 0, 0, 1, 2, 1],
    ],
)
@pytest.mark.parametrize("fill_value", [np.nan, 0, 1])
def test_unique_na_fill(arr, fill_value):
    """``SparseArray.unique`` stays sparse and agrees with ``Series.unique``."""
    sparse_uniques = SparseArray(arr, fill_value=fill_value).unique()
    series_uniques = pd.Series(arr).unique()

    assert isinstance(sparse_uniques, SparseArray)
    # Densify before comparing with the ndarray from Series.unique().
    tm.assert_numpy_array_equal(np.asarray(sparse_uniques), series_uniques)
def test_unique_all_sparse():
    """``unique`` of ``[0, 0]`` yields a single-element sparse ``[0]``."""
    # https://github.com/pandas-dev/pandas/issues/23168
    uniques = SparseArray([0, 0]).unique()
    tm.assert_sp_array_equal(uniques, SparseArray([0]))
def test_map():
    """``SparseArray.map`` gives the same result for dict, Series and callable mappers.

    Each mapper sends 0 -> 10, 1 -> 11 and 2 -> 12, so the expected result
    is ``[10, 11, 12]`` with ``fill_value=10``.
    """
    arr = SparseArray([0, 1, 2])
    expected = SparseArray([10, 11, 12], fill_value=10)

    # dict
    result = arr.map({0: 10, 1: 11, 2: 12})
    tm.assert_sp_array_equal(result, expected)

    # series
    result = arr.map(pd.Series({0: 10, 1: 11, 2: 12}))
    tm.assert_sp_array_equal(result, expected)

    # function
    # Previously this section passed a Series again, so the callable
    # branch of ``map`` was never exercised.
    result = arr.map(lambda x: x + 10)
    tm.assert_sp_array_equal(result, expected)
def test_map_missing():
    """A key absent from the mapping dict maps to a missing value."""
    source = SparseArray([0, 1, 2])
    mapped = source.map({0: 10, 1: 11})
    tm.assert_sp_array_equal(mapped, SparseArray([10, 11, None], fill_value=10))
@pytest.mark.parametrize("fill_value", [np.nan, 1])
def test_dropna(fill_value):
    """``dropna`` removes the NaN entry on the array and on a frame holding it."""
    # GH-28287
    sparse = SparseArray([np.nan, 1], fill_value=fill_value)
    kept = SparseArray([1.0], fill_value=fill_value)
    tm.assert_sp_array_equal(sparse.dropna(), kept)

    frame = pd.DataFrame({"a": [0, 1], "b": sparse})
    # Only the row labelled 1 survives, so the expected frame keeps that label.
    want_frame = pd.DataFrame({"a": [1], "b": kept}, index=pd.Index([1]))
    tm.assert_equal(frame.dropna(), want_frame)
def test_drop_duplicates_fill_value():
    """``drop_duplicates`` on an all-zero sparse frame leaves a single row."""
    # GH 11726
    def to_sparse(column):
        return SparseArray(column, fill_value=0)

    frame = pd.DataFrame(np.zeros((5, 5))).apply(to_sparse)
    deduped = frame.drop_duplicates()
    want = pd.DataFrame({i: SparseArray([0.0], fill_value=0) for i in range(5)})
    tm.assert_frame_equal(deduped, want)
def test_zero_sparse_column():
    """Row-filtering a frame with an all-zero sparse column gives the expected rows."""
    # GH 27781
    dense_b = [1, 2, 3]
    all_zero = pd.DataFrame({"A": SparseArray([0, 0, 0]), "B": dense_b})
    one_nonzero = pd.DataFrame({"A": SparseArray([0, 1, 0]), "B": dense_b})

    filtered = all_zero.loc[all_zero["B"] != 2]
    # Dropping the B == 2 row removes the only non-zero entry of the second frame.
    tm.assert_frame_equal(filtered, one_nonzero.loc[one_nonzero["B"] != 2])

    explicit = pd.DataFrame({"A": SparseArray([0, 0]), "B": [1, 3]}, index=[0, 2])
    tm.assert_frame_equal(filtered, explicit)

View File

@ -0,0 +1,133 @@
import numpy as np
import pytest
from pandas._libs.sparse import IntIndex
from pandas import (
SparseDtype,
Timestamp,
)
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray
class TestAstype:
def test_astype(self):
# float -> float
arr = SparseArray([None, None, 0, 2])
result = arr.astype("Sparse[float32]")
expected = SparseArray([None, None, 0, 2], dtype=np.dtype("float32"))
tm.assert_sp_array_equal(result, expected)
dtype = SparseDtype("float64", fill_value=0)
result = arr.astype(dtype)
expected = SparseArray._simple_new(
np.array([0.0, 2.0], dtype=dtype.subtype), IntIndex(4, [2, 3]), dtype
)
tm.assert_sp_array_equal(result, expected)
dtype = SparseDtype("int64", 0)
result = arr.astype(dtype)
expected = SparseArray._simple_new(
np.array([0, 2], dtype=np.int64), IntIndex(4, [2, 3]), dtype
)
tm.assert_sp_array_equal(result, expected)
arr = SparseArray([0, np.nan, 0, 1], fill_value=0)
with pytest.raises(ValueError, match="NA"):
arr.astype("Sparse[i8]")
def test_astype_bool(self):
a = SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0))
result = a.astype(bool)
expected = np.array([1, 0, 0, 1], dtype=bool)
tm.assert_numpy_array_equal(result, expected)
# update fill value
result = a.astype(SparseDtype(bool, False))
expected = SparseArray(
[True, False, False, True], dtype=SparseDtype(bool, False)
)
tm.assert_sp_array_equal(result, expected)
def test_astype_all(self, any_real_numpy_dtype):
vals = np.array([1, 2, 3])
arr = SparseArray(vals, fill_value=1)
typ = np.dtype(any_real_numpy_dtype)
res = arr.astype(typ)
tm.assert_numpy_array_equal(res, vals.astype(any_real_numpy_dtype))
@pytest.mark.parametrize(
"arr, dtype, expected",
[
(
SparseArray([0, 1]),
"float",
SparseArray([0.0, 1.0], dtype=SparseDtype(float, 0.0)),
),
(SparseArray([0, 1]), bool, SparseArray([False, True])),
(
SparseArray([0, 1], fill_value=1),
bool,
SparseArray([False, True], dtype=SparseDtype(bool, True)),
),
pytest.param(
SparseArray([0, 1]),
"datetime64[ns]",
SparseArray(
np.array([0, 1], dtype="datetime64[ns]"),
dtype=SparseDtype("datetime64[ns]", Timestamp("1970")),
),
),
(
SparseArray([0, 1, 10]),
str,
SparseArray(["0", "1", "10"], dtype=SparseDtype(str, "0")),
),
(SparseArray(["10", "20"]), float, SparseArray([10.0, 20.0])),
(
SparseArray([0, 1, 0]),
object,
SparseArray([0, 1, 0], dtype=SparseDtype(object, 0)),
),
],
)
def test_astype_more(self, arr, dtype, expected):
result = arr.astype(arr.dtype.update_dtype(dtype))
tm.assert_sp_array_equal(result, expected)
def test_astype_nan_raises(self):
arr = SparseArray([1.0, np.nan])
with pytest.raises(ValueError, match="Cannot convert non-finite"):
arr.astype(int)
def test_astype_copy_false(self):
# GH#34456 bug caused by using .view instead of .astype in astype_nansafe
arr = SparseArray([1, 2, 3])
dtype = SparseDtype(float, 0)
result = arr.astype(dtype, copy=False)
expected = SparseArray([1.0, 2.0, 3.0], fill_value=0.0)
tm.assert_sp_array_equal(result, expected)
def test_astype_dt64_to_int64(self):
# GH#49631 match non-sparse behavior
values = np.array(["NaT", "2016-01-02", "2016-01-03"], dtype="M8[ns]")
arr = SparseArray(values)
result = arr.astype("int64")
expected = values.astype("int64")
tm.assert_numpy_array_equal(result, expected)
# we should also be able to cast to equivalent Sparse[int64]
dtype_int64 = SparseDtype("int64", np.iinfo(np.int64).min)
result2 = arr.astype(dtype_int64)
tm.assert_numpy_array_equal(result2.to_numpy(), expected)
# GH#50087 we should match the non-sparse behavior regardless of
# if we have a fill_value other than NaT
dtype = SparseDtype("datetime64[ns]", values[1])
arr3 = SparseArray(values, dtype=dtype)
result3 = arr3.astype("int64")
tm.assert_numpy_array_equal(result3, expected)

View File

@ -0,0 +1,62 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray
class TestSparseArrayConcat:
    """Tests for ``SparseArray._concat_same_type``."""

    @pytest.mark.parametrize("kind", ["integer", "block"])
    def test_basic(self, kind):
        """Concatenating two arrays of one kind keeps stored values and the kind."""
        left = SparseArray([1, 0, 0, 2], kind=kind)
        right = SparseArray([1, 0, 2, 2], kind=kind)
        combined = SparseArray._concat_same_type([left, right])

        # Can't make any assertions about the sparse index itself
        # since we don't merge sparse blocs across arrays
        # in to_concat
        stored = np.array([1, 2, 1, 2, 2], dtype="int64")
        tm.assert_numpy_array_equal(combined.sp_values, stored)
        assert combined.kind == kind

    @pytest.mark.parametrize("kind", ["integer", "block"])
    def test_uses_first_kind(self, kind):
        """With mixed kinds the result takes the kind of the first array."""
        if kind == "block":
            other = "integer"
        else:
            other = "block"
        first = SparseArray([1, 0, 0, 2], kind=kind)
        second = SparseArray([1, 0, 2, 2], kind=other)
        combined = SparseArray._concat_same_type([first, second])

        stored = np.array([1, 2, 1, 2, 2], dtype="int64")
        tm.assert_numpy_array_equal(combined.sp_values, stored)
        assert combined.kind == kind
@pytest.mark.parametrize(
    "other, expected_dtype",
    [
        # compatible dtype -> preserve sparse
        (pd.Series([3, 4, 5], dtype="int64"), pd.SparseDtype("int64", 0)),
        # (pd.Series([3, 4, 5], dtype="Int64"), pd.SparseDtype("int64", 0)),
        # incompatible dtype -> Sparse[common dtype]
        (pd.Series([1.5, 2.5, 3.5], dtype="float64"), pd.SparseDtype("float64", 0)),
        # incompatible dtype -> Sparse[object] dtype
        (pd.Series(["a", "b", "c"], dtype=object), pd.SparseDtype(object, 0)),
        # categorical with compatible categories -> dtype of the categories
        (pd.Series([3, 4, 5], dtype="category"), np.dtype("int64")),
        (pd.Series([1.5, 2.5, 3.5], dtype="category"), np.dtype("float64")),
        # categorical with incompatible categories -> object dtype
        (pd.Series(["a", "b", "c"], dtype="category"), np.dtype(object)),
    ],
)
def test_concat_with_non_sparse(other, expected_dtype):
    """``pd.concat`` of a sparse and a non-sparse Series yields ``expected_dtype``.

    Both concatenation orders are checked.
    """
    # https://github.com/pandas-dev/pandas/issues/34336
    s_sparse = pd.Series([1, 0, 2], dtype=pd.SparseDtype("int64", 0))

    for pieces in ([s_sparse, other], [other, s_sparse]):
        combined = pd.concat(pieces, ignore_index=True)
        first, second = pieces
        want = pd.Series(list(first) + list(second)).astype(expected_dtype)
        tm.assert_series_equal(combined, want)

View File

@ -0,0 +1,285 @@
import numpy as np
import pytest
from pandas._libs.sparse import IntIndex
import pandas as pd
from pandas import (
SparseDtype,
isna,
)
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray
class TestConstructors:
def test_constructor_dtype(self):
arr = SparseArray([np.nan, 1, 2, np.nan])
assert arr.dtype == SparseDtype(np.float64, np.nan)
assert arr.dtype.subtype == np.float64
assert np.isnan(arr.fill_value)
arr = SparseArray([np.nan, 1, 2, np.nan], fill_value=0)
assert arr.dtype == SparseDtype(np.float64, 0)
assert arr.fill_value == 0
arr = SparseArray([0, 1, 2, 4], dtype=np.float64)
assert arr.dtype == SparseDtype(np.float64, np.nan)
assert np.isnan(arr.fill_value)
arr = SparseArray([0, 1, 2, 4], dtype=np.int64)
assert arr.dtype == SparseDtype(np.int64, 0)
assert arr.fill_value == 0
arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=np.int64)
assert arr.dtype == SparseDtype(np.int64, 0)
assert arr.fill_value == 0
arr = SparseArray([0, 1, 2, 4], dtype=None)
assert arr.dtype == SparseDtype(np.int64, 0)
assert arr.fill_value == 0
arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=None)
assert arr.dtype == SparseDtype(np.int64, 0)
assert arr.fill_value == 0
def test_constructor_dtype_str(self):
result = SparseArray([1, 2, 3], dtype="int")
expected = SparseArray([1, 2, 3], dtype=int)
tm.assert_sp_array_equal(result, expected)
def test_constructor_sparse_dtype(self):
result = SparseArray([1, 0, 0, 1], dtype=SparseDtype("int64", -1))
expected = SparseArray([1, 0, 0, 1], fill_value=-1, dtype=np.int64)
tm.assert_sp_array_equal(result, expected)
assert result.sp_values.dtype == np.dtype("int64")
def test_constructor_sparse_dtype_str(self):
result = SparseArray([1, 0, 0, 1], dtype="Sparse[int32]")
expected = SparseArray([1, 0, 0, 1], dtype=np.int32)
tm.assert_sp_array_equal(result, expected)
assert result.sp_values.dtype == np.dtype("int32")
def test_constructor_object_dtype(self):
# GH#11856
arr = SparseArray(["A", "A", np.nan, "B"], dtype=object)
assert arr.dtype == SparseDtype(object)
assert np.isnan(arr.fill_value)
arr = SparseArray(["A", "A", np.nan, "B"], dtype=object, fill_value="A")
assert arr.dtype == SparseDtype(object, "A")
assert arr.fill_value == "A"
def test_constructor_object_dtype_bool_fill(self):
# GH#17574
data = [False, 0, 100.0, 0.0]
arr = SparseArray(data, dtype=object, fill_value=False)
assert arr.dtype == SparseDtype(object, False)
assert arr.fill_value is False
arr_expected = np.array(data, dtype=object)
it = (type(x) == type(y) and x == y for x, y in zip(arr, arr_expected))
assert np.fromiter(it, dtype=np.bool_).all()
@pytest.mark.parametrize("dtype", [SparseDtype(int, 0), int])
def test_constructor_na_dtype(self, dtype):
with pytest.raises(ValueError, match="Cannot convert"):
SparseArray([0, 1, np.nan], dtype=dtype)
def test_constructor_warns_when_losing_timezone(self):
# GH#32501 warn when losing timezone information
dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")
expected = SparseArray(np.asarray(dti, dtype="datetime64[ns]"))
with tm.assert_produces_warning(UserWarning):
result = SparseArray(dti)
tm.assert_sp_array_equal(result, expected)
with tm.assert_produces_warning(UserWarning):
result = SparseArray(pd.Series(dti))
tm.assert_sp_array_equal(result, expected)
def test_constructor_spindex_dtype(self):
arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]))
# TODO: actionable?
# XXX: Behavior change: specifying SparseIndex no longer changes the
# fill_value
expected = SparseArray([0, 1, 2, 0], kind="integer")
tm.assert_sp_array_equal(arr, expected)
assert arr.dtype == SparseDtype(np.int64)
assert arr.fill_value == 0
arr = SparseArray(
data=[1, 2, 3],
sparse_index=IntIndex(4, [1, 2, 3]),
dtype=np.int64,
fill_value=0,
)
exp = SparseArray([0, 1, 2, 3], dtype=np.int64, fill_value=0)
tm.assert_sp_array_equal(arr, exp)
assert arr.dtype == SparseDtype(np.int64)
assert arr.fill_value == 0
arr = SparseArray(
data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=np.int64
)
exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=np.int64)
tm.assert_sp_array_equal(arr, exp)
assert arr.dtype == SparseDtype(np.int64)
assert arr.fill_value == 0
arr = SparseArray(
data=[1, 2, 3],
sparse_index=IntIndex(4, [1, 2, 3]),
dtype=None,
fill_value=0,
)
exp = SparseArray([0, 1, 2, 3], dtype=None)
tm.assert_sp_array_equal(arr, exp)
assert arr.dtype == SparseDtype(np.int64)
assert arr.fill_value == 0
@pytest.mark.parametrize("sparse_index", [None, IntIndex(1, [0])])
def test_constructor_spindex_dtype_scalar(self, sparse_index):
# scalar input
msg = "Constructing SparseArray with scalar data is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
arr = SparseArray(data=1, sparse_index=sparse_index, dtype=None)
exp = SparseArray([1], dtype=None)
tm.assert_sp_array_equal(arr, exp)
assert arr.dtype == SparseDtype(np.int64)
assert arr.fill_value == 0
with tm.assert_produces_warning(FutureWarning, match=msg):
arr = SparseArray(data=1, sparse_index=IntIndex(1, [0]), dtype=None)
exp = SparseArray([1], dtype=None)
tm.assert_sp_array_equal(arr, exp)
assert arr.dtype == SparseDtype(np.int64)
assert arr.fill_value == 0
def test_constructor_spindex_dtype_scalar_broadcasts(self):
arr = SparseArray(
data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=None
)
exp = SparseArray([0, 1, 2, 0], fill_value=0, dtype=None)
tm.assert_sp_array_equal(arr, exp)
assert arr.dtype == SparseDtype(np.int64)
assert arr.fill_value == 0
@pytest.mark.parametrize(
"data, fill_value",
[
(np.array([1, 2]), 0),
(np.array([1.0, 2.0]), np.nan),
([True, False], False),
([pd.Timestamp("2017-01-01")], pd.NaT),
],
)
def test_constructor_inferred_fill_value(self, data, fill_value):
result = SparseArray(data).fill_value
if isna(fill_value):
assert isna(result)
else:
assert result == fill_value
@pytest.mark.parametrize("format", ["coo", "csc", "csr"])
@pytest.mark.parametrize("size", [0, 10])
def test_from_spmatrix(self, size, format):
sp_sparse = pytest.importorskip("scipy.sparse")
mat = sp_sparse.random(size, 1, density=0.5, format=format)
result = SparseArray.from_spmatrix(mat)
result = np.asarray(result)
expected = mat.toarray().ravel()
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("format", ["coo", "csc", "csr"])
def test_from_spmatrix_including_explicit_zero(self, format):
sp_sparse = pytest.importorskip("scipy.sparse")
mat = sp_sparse.random(10, 1, density=0.5, format=format)
mat.data[0] = 0
result = SparseArray.from_spmatrix(mat)
result = np.asarray(result)
expected = mat.toarray().ravel()
tm.assert_numpy_array_equal(result, expected)
def test_from_spmatrix_raises(self):
sp_sparse = pytest.importorskip("scipy.sparse")
mat = sp_sparse.eye(5, 4, format="csc")
with pytest.raises(ValueError, match="not '4'"):
SparseArray.from_spmatrix(mat)
def test_constructor_from_too_large_array(self):
with pytest.raises(TypeError, match="expected dimension <= 1 data"):
SparseArray(np.arange(10).reshape((2, 5)))
def test_constructor_from_sparse(self):
zarr = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0)
res = SparseArray(zarr)
assert res.fill_value == 0
tm.assert_almost_equal(res.sp_values, zarr.sp_values)
def test_constructor_copy(self):
arr_data = np.array([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6])
arr = SparseArray(arr_data)
cp = SparseArray(arr, copy=True)
cp.sp_values[:3] = 0
assert not (arr.sp_values[:3] == 0).any()
not_copy = SparseArray(arr)
not_copy.sp_values[:3] = 0
assert (arr.sp_values[:3] == 0).all()
def test_constructor_bool(self):
# GH#10648
data = np.array([False, False, True, True, False, False])
arr = SparseArray(data, fill_value=False, dtype=bool)
assert arr.dtype == SparseDtype(bool)
tm.assert_numpy_array_equal(arr.sp_values, np.array([True, True]))
# Behavior change: np.asarray densifies.
# tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr))
tm.assert_numpy_array_equal(arr.sp_index.indices, np.array([2, 3], np.int32))
dense = arr.to_dense()
assert dense.dtype == bool
tm.assert_numpy_array_equal(dense, data)
def test_constructor_bool_fill_value(self):
arr = SparseArray([True, False, True], dtype=None)
assert arr.dtype == SparseDtype(np.bool_)
assert not arr.fill_value
arr = SparseArray([True, False, True], dtype=np.bool_)
assert arr.dtype == SparseDtype(np.bool_)
assert not arr.fill_value
arr = SparseArray([True, False, True], dtype=np.bool_, fill_value=True)
assert arr.dtype == SparseDtype(np.bool_, True)
assert arr.fill_value
def test_constructor_float32(self):
# GH#10648
data = np.array([1.0, np.nan, 3], dtype=np.float32)
arr = SparseArray(data, dtype=np.float32)
assert arr.dtype == SparseDtype(np.float32)
tm.assert_numpy_array_equal(arr.sp_values, np.array([1, 3], dtype=np.float32))
# Behavior change: np.asarray densifies.
# tm.assert_numpy_array_equal(arr.sp_values, np.asarray(arr))
tm.assert_numpy_array_equal(
arr.sp_index.indices, np.array([0, 2], dtype=np.int32)
)
dense = arr.to_dense()
assert dense.dtype == np.float32
tm.assert_numpy_array_equal(dense, data)

View File

@ -0,0 +1,224 @@
import re
import warnings
import numpy as np
import pytest
import pandas as pd
from pandas import SparseDtype
@pytest.mark.parametrize(
    "dtype, fill_value",
    [
        ("int", 0),
        ("float", np.nan),
        ("bool", False),
        ("object", np.nan),
        ("datetime64[ns]", np.datetime64("NaT", "ns")),
        ("timedelta64[ns]", np.timedelta64("NaT", "ns")),
    ],
)
def test_inferred_dtype(dtype, fill_value):
    """``SparseDtype(dtype)`` infers the listed default fill_value."""
    inferred = SparseDtype(dtype).fill_value

    if not pd.isna(fill_value):
        assert inferred == fill_value
        return
    # NA defaults must be NA and of exactly the expected type.
    assert pd.isna(inferred) and type(inferred) == type(fill_value)
def test_from_sparse_dtype():
    """Wrapping an existing ``SparseDtype`` keeps its fill_value."""
    original = SparseDtype("float", 0)
    rewrapped = SparseDtype(original)
    assert rewrapped.fill_value == 0
def test_from_sparse_dtype_fill_value():
    """An explicit ``fill_value`` overrides that of the wrapped ``SparseDtype``."""
    original = SparseDtype("int", 1)
    overridden = SparseDtype(original, fill_value=2)
    assert overridden == SparseDtype("int", 2)
@pytest.mark.parametrize(
    "dtype, fill_value",
    [
        ("int", None),
        ("float", None),
        ("bool", None),
        ("object", None),
        ("datetime64[ns]", None),
        ("timedelta64[ns]", None),
        ("int", np.nan),
        ("float", 0),
    ],
)
def test_equal(dtype, fill_value):
    """Two dtypes built from the same arguments compare equal in both directions."""
    left = SparseDtype(dtype, fill_value)
    right = SparseDtype(dtype, fill_value)
    assert left == right
    assert right == left
def test_nans_equal():
    """Dtypes filled with ``float("nan")`` and ``np.nan`` compare equal."""
    python_nan = SparseDtype(float, float("nan"))
    numpy_nan = SparseDtype(float, np.nan)
    assert python_nan == numpy_nan
    assert numpy_nan == python_nan
# Pairs of dtypes that must compare unequal; consumed by test_not_equal.
# FutureWarnings matching ``msg`` are ignored while the pairs are constructed.
with warnings.catch_warnings():
    msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated"
    warnings.filterwarnings(action="ignore", message=msg, category=FutureWarning)

    tups = [
        # different subtype
        (SparseDtype("float64"), SparseDtype("float32")),
        # same subtype, different fill_value
        (SparseDtype("float64"), SparseDtype("float64", 0)),
        (SparseDtype("float64"), SparseDtype("datetime64[ns]", np.nan)),
        (SparseDtype(int, pd.NaT), SparseDtype(float, pd.NaT)),
        # sparse dtype versus plain numpy dtype
        (SparseDtype("float64"), np.dtype("float64")),
    ]
@pytest.mark.parametrize(
    "a, b",
    tups,
)
def test_not_equal(a, b):
    """Each pair from ``tups`` compares unequal."""
    differs = a != b
    assert differs
def test_construct_from_string_raises():
    """An unparseable string makes ``construct_from_string`` raise TypeError."""
    expected_message = "Cannot construct a 'SparseDtype' from 'not a dtype'"
    with pytest.raises(TypeError, match=expected_message):
        SparseDtype.construct_from_string("not a dtype")
@pytest.mark.parametrize(
    "dtype, expected",
    [
        (SparseDtype(int), True),
        (SparseDtype(float), True),
        (SparseDtype(bool), True),
        (SparseDtype(object), False),
        (SparseDtype(str), False),
    ],
)
def test_is_numeric(dtype, expected):
    """``_is_numeric`` is exactly ``True`` or ``False`` as listed per dtype."""
    flag = dtype._is_numeric
    assert flag is expected
def test_str_uses_object():
    """``SparseDtype(str)`` has the numpy object dtype as its subtype."""
    subtype = SparseDtype(str).subtype
    assert subtype == np.dtype("object")
@pytest.mark.parametrize(
    "string, expected",
    [
        ("Sparse[float64]", SparseDtype(np.dtype("float64"))),
        ("Sparse[float32]", SparseDtype(np.dtype("float32"))),
        ("Sparse[int]", SparseDtype(np.dtype("int"))),
        ("Sparse[str]", SparseDtype(np.dtype("str"))),
        ("Sparse[datetime64[ns]]", SparseDtype(np.dtype("datetime64[ns]"))),
        ("Sparse", SparseDtype(np.dtype("float"), np.nan)),
    ],
)
def test_construct_from_string(string, expected):
    """``construct_from_string`` parses each spelling into the expected dtype."""
    parsed = SparseDtype.construct_from_string(string)
    assert parsed == expected
@pytest.mark.parametrize(
    "a, b, expected",
    [
        (SparseDtype(float, 0.0), SparseDtype(np.dtype("float"), 0.0), True),
        (SparseDtype(int, 0), SparseDtype(int, 0), True),
        (SparseDtype(float, float("nan")), SparseDtype(float, np.nan), True),
        (SparseDtype(float, 0), SparseDtype(float, np.nan), False),
        (SparseDtype(int, 0.0), SparseDtype(float, 0.0), False),
    ],
)
def test_hash_equal(a, b, expected):
    """Equality and hash-equality of two dtypes both match ``expected``."""
    same_value = a == b
    same_hash = hash(a) == hash(b)
    assert same_value is expected
    assert same_hash is expected
@pytest.mark.parametrize(
    "string, expected",
    [
        ("Sparse[int]", "int"),
        ("Sparse[int, 0]", "int"),
        ("Sparse[int64]", "int64"),
        ("Sparse[int64, 0]", "int64"),
        ("Sparse[datetime64[ns], 0]", "datetime64[ns]"),
    ],
)
def test_parse_subtype(string, expected):
    """The first item returned by ``_parse_subtype`` is the subtype string."""
    parsed = SparseDtype._parse_subtype(string)
    # Second item of the pair is not checked here.
    subtype, _ = parsed
    assert subtype == expected
@pytest.mark.parametrize(
    "string", ["Sparse[int, 1]", "Sparse[float, 0.0]", "Sparse[bool, True]"]
)
def test_construct_from_string_fill_value_raises(string):
    """A fill_value embedded in the dtype string is rejected with TypeError."""
    expected_message = "fill_value in the string is not"
    with pytest.raises(TypeError, match=expected_message):
        SparseDtype.construct_from_string(string)
@pytest.mark.parametrize(
    "original, dtype, expected",
    [
        (SparseDtype(int, 0), float, SparseDtype(float, 0.0)),
        (SparseDtype(int, 1), float, SparseDtype(float, 1.0)),
        (SparseDtype(int, 1), str, SparseDtype(object, "1")),
        (SparseDtype(float, 1.5), int, SparseDtype(int, 1)),
    ],
)
def test_update_dtype(original, dtype, expected):
    """``update_dtype`` returns the expected dtype, fill_value included."""
    updated = original.update_dtype(dtype)
    assert updated == expected
@pytest.mark.parametrize(
    "original, dtype, expected_error_msg",
    [
        (
            SparseDtype(float, np.nan),
            int,
            re.escape("Cannot convert non-finite values (NA or inf) to integer"),
        ),
        (
            SparseDtype(str, "abc"),
            int,
            r"invalid literal for int\(\) with base 10: ('abc'|np\.str_\('abc'\))",
        ),
    ],
)
def test_update_dtype_raises(original, dtype, expected_error_msg):
    """``update_dtype`` raises ValueError when the fill_value cannot be converted."""
    raises_value_error = pytest.raises(ValueError, match=expected_error_msg)
    with raises_value_error:
        original.update_dtype(dtype)
def test_repr():
    """``str()`` of a SparseDtype shows the subtype and the repr of the fill_value."""
    # GH-34352
    cases = [
        (SparseDtype("int64", fill_value=0), "Sparse[int64, 0]"),
        (SparseDtype(object, fill_value="0"), "Sparse[object, '0']"),
    ]
    for dtype, rendered in cases:
        assert str(dtype) == rendered
def test_sparse_dtype_subtype_must_be_numpy_dtype():
# GH#53160
msg = "SparseDtype subtype must be a numpy dtype"
with pytest.raises(TypeError, match=msg):
SparseDtype("category", fill_value="c")

View File

@ -0,0 +1,302 @@
import numpy as np
import pytest
import pandas as pd
from pandas import SparseDtype
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray
@pytest.fixture
def arr_data():
    """Ten-element float ndarray with NaN at positions 0, 1, 5 and 8."""
    nan = np.nan
    return np.array([nan, nan, 1, 2, 3, nan, 4, 5, nan, 6])
@pytest.fixture
def arr(arr_data):
    """``SparseArray`` built from the ``arr_data`` fixture."""
    sparse = SparseArray(arr_data)
    return sparse
class TestGetitem:
def test_getitem(self, arr):
dense = arr.to_dense()
for i, value in enumerate(arr):
tm.assert_almost_equal(value, dense[i])
tm.assert_almost_equal(arr[-i], dense[-i])
def test_getitem_arraylike_mask(self, arr):
arr = SparseArray([0, 1, 2])
result = arr[[True, False, True]]
expected = SparseArray([0, 2])
tm.assert_sp_array_equal(result, expected)
@pytest.mark.parametrize(
"slc",
[
np.s_[:],
np.s_[1:10],
np.s_[1:100],
np.s_[10:1],
np.s_[:-3],
np.s_[-5:-4],
np.s_[:-12],
np.s_[-12:],
np.s_[2:],
np.s_[2::3],
np.s_[::2],
np.s_[::-1],
np.s_[::-2],
np.s_[1:6:2],
np.s_[:-6:-2],
],
)
@pytest.mark.parametrize(
"as_dense", [[np.nan] * 10, [1] * 10, [np.nan] * 5 + [1] * 5, []]
)
def test_getslice(self, slc, as_dense):
as_dense = np.array(as_dense)
arr = SparseArray(as_dense)
result = arr[slc]
expected = SparseArray(as_dense[slc])
tm.assert_sp_array_equal(result, expected)
def test_getslice_tuple(self):
dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0])
sparse = SparseArray(dense)
res = sparse[(slice(4, None),)]
exp = SparseArray(dense[4:])
tm.assert_sp_array_equal(res, exp)
sparse = SparseArray(dense, fill_value=0)
res = sparse[(slice(4, None),)]
exp = SparseArray(dense[4:], fill_value=0)
tm.assert_sp_array_equal(res, exp)
msg = "too many indices for array"
with pytest.raises(IndexError, match=msg):
sparse[4:, :]
with pytest.raises(IndexError, match=msg):
# check numpy compat
dense[4:, :]
def test_boolean_slice_empty(self):
arr = SparseArray([0, 1, 2])
res = arr[[False, False, False]]
assert res.dtype == arr.dtype
def test_getitem_bool_sparse_array(self, arr):
# GH 23122
spar_bool = SparseArray([False, True] * 5, dtype=np.bool_, fill_value=True)
exp = SparseArray([np.nan, 2, np.nan, 5, 6])
tm.assert_sp_array_equal(arr[spar_bool], exp)
spar_bool = ~spar_bool
res = arr[spar_bool]
exp = SparseArray([np.nan, 1, 3, 4, np.nan])
tm.assert_sp_array_equal(res, exp)
spar_bool = SparseArray(
[False, True, np.nan] * 3, dtype=np.bool_, fill_value=np.nan
)
res = arr[spar_bool]
exp = SparseArray([np.nan, 3, 5])
tm.assert_sp_array_equal(res, exp)
def test_getitem_bool_sparse_array_as_comparison(self):
# GH 45110
arr = SparseArray([1, 2, 3, 4, np.nan, np.nan], fill_value=np.nan)
res = arr[arr > 2]
exp = SparseArray([3.0, 4.0], fill_value=np.nan)
tm.assert_sp_array_equal(res, exp)
def test_get_item(self, arr):
zarr = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0)
assert np.isnan(arr[1])
assert arr[2] == 1
assert arr[7] == 5
assert zarr[0] == 0
assert zarr[2] == 1
assert zarr[7] == 5
errmsg = "must be an integer between -10 and 10"
with pytest.raises(IndexError, match=errmsg):
arr[11]
with pytest.raises(IndexError, match=errmsg):
arr[-11]
assert arr[-1] == arr[len(arr) - 1]
class TestSetitem:
def test_set_item(self, arr_data):
arr = SparseArray(arr_data).copy()
def setitem():
arr[5] = 3
def setslice():
arr[1:5] = 2
with pytest.raises(TypeError, match="assignment via setitem"):
setitem()
with pytest.raises(TypeError, match="assignment via setitem"):
setslice()
class TestTake:
    """Exercise ``SparseArray.take`` with and without ``allow_fill``."""

    def test_take_scalar_raises(self, arr):
        """A scalar passed as ``indices`` is rejected with ``ValueError``."""
        expected_msg = "'indices' must be an array, not a scalar '2'."
        with pytest.raises(ValueError, match=expected_msg):
            arr.take(2)

    def test_take(self, arr_data, arr):
        """Non-negative positions give the same values as ``np.take``."""
        for positions in ([2, 3], [0, 1, 2]):
            wanted = SparseArray(np.take(arr_data, positions))
            tm.assert_sp_array_equal(arr.take(positions), wanted)

    def test_take_all_empty(self):
        """Taking every position of an all-fill array reproduces the array."""
        all_fill = pd.array([0, 0], dtype=SparseDtype("int64"))
        taken = all_fill.take([0, 1], allow_fill=True, fill_value=np.nan)
        tm.assert_sp_array_equal(all_fill, taken)

    def test_take_different_fill_value(self):
        """A ``fill_value`` given to ``take`` only applies to the ``-1`` slots."""
        # Take with a different fill value shouldn't overwrite the original
        source = pd.array([0.0], dtype=SparseDtype("float64", fill_value=0.0))
        taken = source.take([0, -1], allow_fill=True, fill_value=np.nan)
        wanted = pd.array([0, np.nan], dtype=source.dtype)
        tm.assert_sp_array_equal(wanted, taken)

    def test_take_fill_value(self):
        """``take`` on an array with ``fill_value=0`` matches ``np.take``."""
        dense = np.array([1, np.nan, 0, 3, 0])
        sparse = SparseArray(dense, fill_value=0)
        for positions in ([0], [1, 3, 4]):
            wanted = SparseArray(np.take(dense, positions), fill_value=0)
            tm.assert_sp_array_equal(sparse.take(positions), wanted)

    def test_take_negative(self, arr_data, arr):
        """Negative positions count from the end, as in ``np.take``."""
        for positions in ([-1], [-4, -3, -2]):
            wanted = SparseArray(np.take(arr_data, positions))
            tm.assert_sp_array_equal(arr.take(positions), wanted)

    def test_bad_take(self, arr):
        """An out-of-range position raises ``IndexError``."""
        with pytest.raises(IndexError, match="bounds"):
            arr.take([11])

    def test_take_filling(self):
        """``allow_fill`` controls whether ``-1`` means "last" or "missing"."""
        # similar tests as GH 12631
        sparse = SparseArray([np.nan, np.nan, 1, np.nan, 4])
        positions = np.array([1, 0, -1])

        # Default: -1 addresses the last element.
        taken = sparse.take(positions)
        wanted = SparseArray([np.nan, np.nan, 4])
        tm.assert_sp_array_equal(taken, wanted)

        # TODO: actionable?
        # XXX: test change: fill_value=True -> allow_fill=True
        # With allow_fill=True the -1 slot is expected to come back as NaN.
        taken = sparse.take(np.array([1, 0, -1]), allow_fill=True)
        wanted = SparseArray([np.nan, np.nan, np.nan])
        tm.assert_sp_array_equal(taken, wanted)

        # allow_fill=False: fill_value is passed but -1 still means "last".
        taken = sparse.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True)
        wanted = SparseArray([np.nan, np.nan, 4])
        tm.assert_sp_array_equal(taken, wanted)

        # Under allow_fill=True, negatives other than -1 are invalid.
        msg = "Invalid value in 'indices'"
        for negative in (-2, -5):
            with pytest.raises(ValueError, match=msg):
                sparse.take(np.array([1, 0, negative]), allow_fill=True)

        msg = "out of bounds value in 'indices'"
        out_of_bounds = (
            ([1, -6], {}),
            ([1, 5], {}),
            ([1, 5], {"allow_fill": True}),
        )
        for bad_positions, kwargs in out_of_bounds:
            with pytest.raises(IndexError, match=msg):
                sparse.take(np.array(bad_positions), **kwargs)

    def test_take_filling_fill_value(self):
        """Same scenarios as ``test_take_filling`` with ``fill_value=0``."""
        # same tests as GH#12631
        sparse = SparseArray([np.nan, 0, 1, 0, 4], fill_value=0)

        taken = sparse.take(np.array([1, 0, -1]))
        wanted = SparseArray([0, np.nan, 4], fill_value=0)
        tm.assert_sp_array_equal(taken, wanted)

        # fill_value
        taken = sparse.take(np.array([1, 0, -1]), allow_fill=True)
        # TODO: actionable?
        # XXX: behavior change.
        # the old way of filling self.fill_value doesn't follow EA rules.
        # It's supposed to be self.dtype.na_value (nan in this case)
        wanted = SparseArray([0, np.nan, np.nan], fill_value=0)
        tm.assert_sp_array_equal(taken, wanted)

        # allow_fill=False
        taken = sparse.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True)
        wanted = SparseArray([0, np.nan, 4], fill_value=0)
        tm.assert_sp_array_equal(taken, wanted)

        msg = "Invalid value in 'indices'."
        for negative in (-2, -5):
            with pytest.raises(ValueError, match=msg):
                sparse.take(np.array([1, 0, negative]), allow_fill=True)

        msg = "out of bounds value in 'indices'"
        out_of_bounds = (
            ([1, -6], {}),
            ([1, 5], {}),
            ([1, 5], {"fill_value": True}),
        )
        for bad_positions, kwargs in out_of_bounds:
            with pytest.raises(IndexError, match=msg):
                sparse.take(np.array(bad_positions), **kwargs)

    @pytest.mark.parametrize("kind", ["block", "integer"])
    def test_take_filling_all_nan(self, kind):
        """An all-NaN array yields all-NaN results for either index ``kind``."""
        sparse = SparseArray([np.nan, np.nan, np.nan, np.nan, np.nan], kind=kind)

        for kwargs in ({}, {"fill_value": True}):
            taken = sparse.take(np.array([1, 0, -1]), **kwargs)
            wanted = SparseArray([np.nan, np.nan, np.nan], kind=kind)
            tm.assert_sp_array_equal(taken, wanted)

        msg = "out of bounds value in 'indices'"
        out_of_bounds = (
            ([1, -6], {}),
            ([1, 5], {}),
            ([1, 5], {"fill_value": True}),
        )
        for bad_positions, kwargs in out_of_bounds:
            with pytest.raises(IndexError, match=msg):
                sparse.take(np.array(bad_positions), **kwargs)
class TestWhere:
    """Tests for ``where`` on sparse data."""

    def test_where_retain_fill_value(self):
        """The ``fill_value`` of the input survives ``_where`` / ``Series.where``."""
        # GH#45691 don't lose fill_value on _where
        sparse = SparseArray([np.nan, 1.0], fill_value=0)
        keep = ~np.array([True, False])
        wanted = SparseArray([1, 1.0], fill_value=0)

        # Array-level entry point.
        tm.assert_sp_array_equal(sparse._where(keep, 1), wanted)

        # Same operation through a Series wrapping the sparse array.
        replaced = pd.Series(sparse).where(keep, 1)
        tm.assert_series_equal(replaced, pd.Series(wanted))

View File

@ -0,0 +1,551 @@
import operator
import numpy as np
import pytest
import pandas._libs.sparse as splib
import pandas.util._test_decorators as td
from pandas import Series
import pandas._testing as tm
from pandas.core.arrays.sparse import (
BlockIndex,
IntIndex,
make_sparse_index,
)
@pytest.fixture
def test_length():
    """Length handed to the ``BlockIndex`` constructors by tests using this fixture."""
    return 20
@pytest.fixture(
    params=[
        pytest.param(
            [[0, 7, 15], [3, 5, 5], [2, 9, 14], [2, 3, 5], [2, 9, 15], [1, 3, 4]],
            id="plain_case",
        ),
        pytest.param(
            [[0, 5], [4, 4], [1], [4], [1], [3]],
            id="delete_blocks",
        ),
        pytest.param(
            [[0], [10], [0, 5], [3, 7], [0, 5], [3, 5]],
            id="split_blocks",
        ),
        pytest.param(
            [[10], [5], [0, 12], [5, 3], [12], [3]],
            id="skip_block",
        ),
        pytest.param(
            [[0, 10], [4, 6], [5, 17], [4, 2], [], []],
            id="no_intersect",
        ),
        pytest.param(
            [[0], [5], [], [], [], []],
            id="one_empty",
        ),
    ],
)
def cases(request):
    """
    Yield one block-layout scenario per parametrization.

    Each scenario is a six-item list that the tests unpack as
    ``xloc, xlen, yloc, ylen, eloc, elen``: block starts and block lengths
    for two indexes, followed by the starts and lengths that the intersect
    test expects for their intersection.
    """
    return request.param
class TestSparseIndexUnion:
    """Tests for ``make_union`` on ``BlockIndex`` and ``IntIndex``."""

    @pytest.mark.parametrize(
        "xloc, xlen, yloc, ylen, eloc, elen",
        [
            # Case 1: x = [0, 5), y = [5, 9) -> adjacent blocks fuse into [0, 9)
            [[0], [5], [5], [4], [0], [9]],
            # Case 2: x = [0, 5) + [10, 15), y = [2, 7) + [17, 19)
            [[0, 10], [5, 5], [2, 17], [5, 2], [0, 10, 17], [7, 5, 2]],
            # Case 3: x = [1, 6), y = [3, 8) -> overlapping blocks give [1, 8)
            [[1], [5], [3], [5], [1], [7]],
            # Case 4: x = [2, 6) + [10, 14), y = [4, 12) bridges both x blocks
            [[2, 10], [4, 4], [4], [8], [2], [12]],
            # Case 5: x = [0, 3) + [5, 10), y = [0, 7) -> [0, 10)
            [[0, 5], [3, 5], [0], [7], [0], [10]],
            # Case 6: x = [2, 6) + [10, 14), y = [4, 12) + [13, 17) -> [2, 17)
            [[2, 10], [4, 4], [4, 13], [8, 4], [2], [15]],
            # Case 7: x = [2, 17) already contains every y block
            [[2], [15], [4, 9, 14], [3, 2, 2], [2], [15]],
            # Case 8: x = [0, 3) + [10, 13), y = [5, 7) + [15, 17) -> nothing merges
            [[0, 10], [3, 3], [5, 15], [2, 2], [0, 5, 10, 15], [3, 2, 3, 2]],
        ],
    )
    def test_index_make_union(self, xloc, xlen, yloc, ylen, eloc, elen, test_length):
        """Block union has the expected layout and agrees with the int union."""
        left = BlockIndex(test_length, xloc, xlen)
        right = BlockIndex(test_length, yloc, ylen)

        block_union = left.make_union(right)
        assert isinstance(block_union, BlockIndex)
        tm.assert_numpy_array_equal(
            block_union.blocs, np.array(eloc, dtype=np.int32)
        )
        tm.assert_numpy_array_equal(
            block_union.blengths, np.array(elen, dtype=np.int32)
        )

        # The same union computed on the integer representation must match.
        int_union = left.to_int_index().make_union(right.to_int_index())
        assert isinstance(int_union, IntIndex)
        tm.assert_numpy_array_equal(
            int_union.indices, block_union.to_int_index().indices
        )

    def test_int_index_make_union(self):
        """``IntIndex.make_union`` merges positions and checks lengths match."""
        # (positions of a, positions of b, positions expected in the union)
        scenarios = [
            ([0, 3, 4], [0, 2], [0, 2, 3, 4]),
            ([], [0, 2], [0, 2]),
            ([], [], []),
            ([0, 1, 2, 3, 4], [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]),
        ]
        for a_positions, b_positions, union_positions in scenarios:
            a = IntIndex(5, np.array(a_positions, dtype=np.int32))
            b = IntIndex(5, np.array(b_positions, dtype=np.int32))
            merged = a.make_union(b)
            wanted = IntIndex(5, np.array(union_positions, np.int32))
            assert merged.equals(wanted)

        # Indexes declared with different lengths cannot be combined.
        a = IntIndex(5, np.array([0, 1], dtype=np.int32))
        b = IntIndex(4, np.array([0, 1], dtype=np.int32))
        msg = "Indices must reference same underlying length"
        with pytest.raises(ValueError, match=msg):
            a.make_union(b)
class TestSparseIndexIntersect:
    """Tests for ``intersect`` on block- and integer-based sparse indexes."""

    @td.skip_if_windows
    def test_intersect(self, cases, test_length):
        """Both index kinds produce the intersection expected for each case."""
        xloc, xlen, yloc, ylen, eloc, elen = cases
        left = BlockIndex(test_length, xloc, xlen)
        right = BlockIndex(test_length, yloc, ylen)
        wanted = BlockIndex(test_length, eloc, elen)
        # Same blocks as ``right`` but declared one element longer.
        too_long = BlockIndex(test_length + 1, yloc, ylen)

        assert left.intersect(right).equals(wanted)

        int_result = left.to_int_index().intersect(right.to_int_index())
        assert int_result.equals(wanted.to_int_index())

        # Operands of different declared length are rejected by both kinds.
        msg = "Indices must reference same underlying length"
        with pytest.raises(Exception, match=msg):
            left.intersect(too_long)

        with pytest.raises(Exception, match=msg):
            left.to_int_index().intersect(too_long.to_int_index())

    def test_intersect_empty(self):
        """Intersecting with an empty index gives the empty index, either order."""
        empty = IntIndex(4, np.array([], dtype=np.int32))
        filled = IntIndex(4, np.array([2, 3], dtype=np.int32))

        pairs = [
            (empty, filled),
            (empty.to_block_index(), filled.to_block_index()),
        ]
        for nothing, something in pairs:
            assert nothing.intersect(something).equals(nothing)
            assert something.intersect(nothing).equals(nothing)

    @pytest.mark.parametrize(
        "case",
        [
            # Argument 2 to "IntIndex" has incompatible type "ndarray[Any,
            # dtype[signedinteger[_32Bit]]]"; expected "Sequence[int]"
            IntIndex(5, np.array([1, 2], dtype=np.int32)),  # type: ignore[arg-type]
            IntIndex(5, np.array([0, 2, 4], dtype=np.int32)),  # type: ignore[arg-type]
            IntIndex(0, np.array([], dtype=np.int32)),  # type: ignore[arg-type]
            IntIndex(5, np.array([], dtype=np.int32)),  # type: ignore[arg-type]
        ],
    )
    def test_intersect_identical(self, case):
        """An index intersected with itself equals itself, for both kinds."""
        as_block = case.to_block_index()
        for index in (case, as_block):
            assert index.intersect(index).equals(index)
class TestSparseIndexCommon:
    """Construction and lookup checks covering both sparse index kinds."""

    def test_int_internal(self):
        """``kind="integer"`` yields an ``IntIndex`` holding the given positions."""
        # (stored positions, expected npoints)
        scenarios = [
            ([2, 3], 2),
            ([], 0),
            ([0, 1, 2, 3], 4),
        ]
        for positions, npoints in scenarios:
            idx = make_sparse_index(
                4, np.array(positions, dtype=np.int32), kind="integer"
            )
            assert isinstance(idx, IntIndex)
            assert idx.npoints == npoints
            tm.assert_numpy_array_equal(
                idx.indices, np.array(positions, dtype=np.int32)
            )

    def test_block_internal(self):
        """``kind="block"`` yields a ``BlockIndex`` with the expected blocks."""
        # (stored positions, expected npoints, block starts, block lengths)
        scenarios = [
            ([2, 3], 2, [2], [2]),
            ([], 0, [], []),
            ([0, 1, 2, 3], 4, [0], [4]),
            ([0, 2, 3], 3, [0, 2], [1, 2]),
        ]
        for positions, npoints, starts, lengths in scenarios:
            idx = make_sparse_index(
                4, np.array(positions, dtype=np.int32), kind="block"
            )
            assert isinstance(idx, BlockIndex)
            assert idx.npoints == npoints
            tm.assert_numpy_array_equal(idx.blocs, np.array(starts, dtype=np.int32))
            tm.assert_numpy_array_equal(
                idx.blengths, np.array(lengths, dtype=np.int32)
            )

    @pytest.mark.parametrize("kind", ["integer", "block"])
    def test_lookup(self, kind):
        """``lookup`` gives the slot of a stored position and -1 otherwise."""
        # Each row: stored positions, then the expected lookup result for
        # the queries -1, 0, 1, 2, 3, 4 (in that order).
        scenarios = [
            ([2, 3], [-1, -1, -1, 0, 1, -1]),
            ([], [-1, -1, -1, -1, -1, -1]),
            ([0, 1, 2, 3], [-1, 0, 1, 2, 3, -1]),
            ([0, 2, 3], [-1, 0, -1, 1, 2, -1]),
        ]
        for positions, slots in scenarios:
            idx = make_sparse_index(4, np.array(positions, dtype=np.int32), kind=kind)
            for query, slot in zip(range(-1, 5), slots):
                assert idx.lookup(query) == slot

    @pytest.mark.parametrize("kind", ["integer", "block"])
    def test_lookup_array(self, kind):
        """``lookup_array`` is the vectorised counterpart of ``lookup``."""
        # Each row: stored positions, then (queries, expected slots) pairs.
        scenarios = [
            (
                [2, 3],
                [
                    ([-1, 0, 2], [-1, -1, 0]),
                    ([4, 2, 1, 3], [-1, 0, -1, 1]),
                ],
            ),
            (
                [],
                [
                    ([-1, 0, 2, 4], [-1, -1, -1, -1]),
                ],
            ),
            (
                [0, 1, 2, 3],
                [
                    ([-1, 0, 2], [-1, 0, 2]),
                    ([4, 2, 1, 3], [-1, 2, 1, 3]),
                ],
            ),
            (
                [0, 2, 3],
                [
                    ([2, 1, 3, 0], [1, -1, 2, 0]),
                    ([1, 4, 2, 5], [-1, -1, 1, -1]),
                ],
            ),
        ]
        for positions, checks in scenarios:
            idx = make_sparse_index(4, np.array(positions, dtype=np.int32), kind=kind)
            for queries, slots in checks:
                found = idx.lookup_array(np.array(queries, dtype=np.int32))
                wanted = np.array(slots, dtype=np.int32)
                tm.assert_numpy_array_equal(found, wanted)

    @pytest.mark.parametrize(
        "idx, expected",
        [
            [0, -1],
            [5, 0],
            [7, 2],
            [8, -1],
            [9, -1],
            [10, -1],
            [11, -1],
            [12, 3],
            [17, 8],
            [18, -1],
        ],
    )
    def test_lookup_basics(self, idx, expected):
        """Lookup on a two-block index agrees between block and int forms."""
        # Blocks cover positions 5-7 and 12-17 of a length-20 index.
        block_index = BlockIndex(20, [5, 12], [3, 6])
        assert block_index.lookup(idx) == expected

        int_index = block_index.to_int_index()
        assert int_index.lookup(idx) == expected
class TestBlockIndex:
    """Tests specific to ``BlockIndex``."""

    def test_block_internal(self):
        """``make_sparse_index(kind="block")`` groups positions into blocks."""
        # (stored positions, expected npoints, block starts, block lengths)
        scenarios = [
            ([2, 3], 2, [2], [2]),
            ([], 0, [], []),
            ([0, 1, 2, 3], 4, [0], [4]),
            ([0, 2, 3], 3, [0, 2], [1, 2]),
        ]
        for positions, npoints, starts, lengths in scenarios:
            idx = make_sparse_index(
                4, np.array(positions, dtype=np.int32), kind="block"
            )
            assert isinstance(idx, BlockIndex)
            assert idx.npoints == npoints
            tm.assert_numpy_array_equal(idx.blocs, np.array(starts, dtype=np.int32))
            tm.assert_numpy_array_equal(
                idx.blengths, np.array(lengths, dtype=np.int32)
            )

    @pytest.mark.parametrize("i", [5, 10, 100, 101])
    def test_make_block_boundary(self, i):
        """Every second position stored -> one length-1 block per position."""
        every_other = np.arange(0, i, 2, dtype=np.int32)
        idx = make_sparse_index(i, every_other, kind="block")

        wanted_starts = np.arange(0, i, 2, dtype=np.int32)
        tm.assert_numpy_array_equal(idx.blocs, wanted_starts)
        tm.assert_numpy_array_equal(
            idx.blengths, np.ones(len(wanted_starts), dtype=np.int32)
        )

    def test_equals(self):
        """An index equals itself but not one with a different block length."""
        index = BlockIndex(10, [0, 4], [2, 5])
        assert index.equals(index)

        different = BlockIndex(10, [0, 4], [2, 6])
        assert not index.equals(different)

    def test_check_integrity(self):
        """Empty layouts are accepted; overrunning or overlapping blocks raise."""
        # 0-length OK, and a non-zero length with no blocks is OK as well.
        for total_length in (0, 1):
            BlockIndex(total_length, [], [])

        invalid_layouts = [
            ("Block 0 extends beyond end", [5], [10]),
            ("Block 0 overlaps", [2, 5], [5, 3]),
        ]
        for msg, starts, lengths in invalid_layouts:
            with pytest.raises(ValueError, match=msg):
                BlockIndex(10, starts, lengths)

    def test_to_int_index(self):
        """Conversion to ``IntIndex`` expands each block to its positions."""
        block = BlockIndex(20, [0, 10], [4, 6])
        wanted_positions = [0, 1, 2, 3, 10, 11, 12, 13, 14, 15]

        dense = block.to_int_index()
        tm.assert_numpy_array_equal(
            dense.indices, np.array(wanted_positions, dtype=np.int32)
        )

    def test_to_block_index(self):
        """``to_block_index`` on a ``BlockIndex`` returns the same object."""
        index = BlockIndex(10, [0, 5], [4, 5])
        assert index.to_block_index() is index
class TestIntIndex:
    """Tests specific to ``IntIndex``."""

    def test_check_integrity(self):
        """Each constructor validation raises ``ValueError`` with its message."""
        # More indices than the declared length allows.
        msg = "Too many indices"
        with pytest.raises(ValueError, match=msg):
            IntIndex(length=1, indices=[1, 2, 3])

        # No index can be negative.
        # (This check used to be written out twice; once is sufficient.)
        msg = "No index can be less than zero"
        with pytest.raises(ValueError, match=msg):
            IntIndex(length=5, indices=[1, -2, 3])

        # All indices must be less than the length.
        msg = "All indices must be less than the length"
        with pytest.raises(ValueError, match=msg):
            IntIndex(length=5, indices=[1, 2, 5])

        with pytest.raises(ValueError, match=msg):
            IntIndex(length=5, indices=[1, 2, 6])

        # Indices must be strictly ascending.
        msg = "Indices must be strictly increasing"
        with pytest.raises(ValueError, match=msg):
            IntIndex(length=5, indices=[1, 3, 2])

        with pytest.raises(ValueError, match=msg):
            IntIndex(length=5, indices=[1, 3, 3])

    def test_int_internal(self):
        """``make_sparse_index(kind="integer")`` stores positions unchanged."""
        idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="integer")
        assert isinstance(idx, IntIndex)
        assert idx.npoints == 2
        tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32))

        idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="integer")
        assert isinstance(idx, IntIndex)
        assert idx.npoints == 0
        tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32))

        idx = make_sparse_index(
            4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer"
        )
        assert isinstance(idx, IntIndex)
        assert idx.npoints == 4
        tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32))

    def test_equals(self):
        """An index equals itself but not one with fewer positions."""
        index = IntIndex(10, [0, 1, 2, 3, 4])
        assert index.equals(index)
        assert not index.equals(IntIndex(10, [0, 1, 2, 3]))

    def test_to_block_index(self, cases, test_length):
        """Block -> int -> block conversion reproduces the original index."""
        xloc, xlen, yloc, ylen, _, _ = cases
        xindex = BlockIndex(test_length, xloc, xlen)
        yindex = BlockIndex(test_length, yloc, ylen)

        # see if survive the round trip
        xbindex = xindex.to_int_index().to_block_index()
        ybindex = yindex.to_int_index().to_block_index()
        assert isinstance(xbindex, BlockIndex)
        assert xbindex.equals(xindex)
        assert ybindex.equals(yindex)

    def test_to_int_index(self):
        """``to_int_index`` on an ``IntIndex`` returns the same object."""
        index = IntIndex(10, [2, 3, 4, 5, 6])
        assert index.to_int_index() is index
class TestSparseOperators:
    """Cross-check the low-level ``sparse_<op>_float64`` kernels."""

    @pytest.mark.parametrize("opname", ["add", "sub", "mul", "truediv", "floordiv"])
    def test_op(self, opname, cases, test_length):
        """Block and int kernels agree with each other and with ``Series``."""
        xloc, xlen, yloc, ylen, _, _ = cases
        kernel = getattr(splib, f"sparse_{opname}_float64")
        reference_op = getattr(operator, opname)

        x_block = BlockIndex(test_length, xloc, xlen)
        y_block = BlockIndex(test_length, yloc, ylen)
        x_int = x_block.to_int_index()
        y_int = y_block.to_int_index()

        x_values = np.arange(x_block.npoints) * 10.0 + 1
        y_values = np.arange(y_block.npoints) * 100.0 + 1
        x_fill, y_fill = 0, 2

        block_vals, block_index, block_fill = kernel(
            x_values, x_block, x_fill, y_values, y_block, y_fill
        )
        int_vals, int_index, int_fill = kernel(
            x_values, x_int, x_fill, y_values, y_int, y_fill
        )

        assert block_index.to_int_index().equals(int_index)
        tm.assert_numpy_array_equal(block_vals, int_vals)
        assert block_fill == int_fill

        def densify(values, index, fill):
            # Spread the stored values over the full length, using ``fill``
            # for every position the sparse index does not store.
            stored = Series(values, index.indices)
            return stored.reindex(np.arange(test_length)).fillna(fill)

        # check versus Series...
        dense_result = reference_op(
            densify(x_values, x_int, x_fill), densify(y_values, y_int, y_fill)
        )
        dense_result = dense_result.reindex(int_index.indices)

        tm.assert_numpy_array_equal(block_vals, dense_result.values)
        tm.assert_numpy_array_equal(int_vals, dense_result.values)

View File

@ -0,0 +1,306 @@
import numpy as np
import pytest
from pandas import (
NaT,
SparseDtype,
Timestamp,
isna,
)
from pandas.core.arrays.sparse import SparseArray
class TestReductions:
    """Tests for ``all``/``any``/``sum``/``mean`` on ``SparseArray``.

    The ``all``/``any`` tests flip one element of ``data`` to ``neg``.  They
    do so on a copy: ``data`` is the list object owned by the
    parametrization, and mutating it in place would leave it altered for any
    later execution of the same test item.
    """

    @pytest.mark.parametrize(
        "data,pos,neg",
        [
            ([True, True, True], True, False),
            ([1, 2, 1], 1, 0),
            ([1.0, 2.0, 1.0], 1.0, 0.0),
        ],
    )
    def test_all(self, data, pos, neg):
        """``all`` is truthy until one element is replaced by ``neg``."""
        # GH#17570
        out = SparseArray(data).all()
        assert out

        out = SparseArray(data, fill_value=pos).all()
        assert out

        # Copy before mutating so the parametrized list stays intact.
        data = list(data)
        data[1] = neg
        out = SparseArray(data).all()
        assert not out

        out = SparseArray(data, fill_value=pos).all()
        assert not out

    @pytest.mark.parametrize(
        "data,pos,neg",
        [
            ([True, True, True], True, False),
            ([1, 2, 1], 1, 0),
            ([1.0, 2.0, 1.0], 1.0, 0.0),
        ],
    )
    def test_numpy_all(self, data, pos, neg):
        """``np.all`` behaves like ``.all()`` and rejects ``out``."""
        # GH#17570
        out = np.all(SparseArray(data))
        assert out

        out = np.all(SparseArray(data, fill_value=pos))
        assert out

        # Copy before mutating so the parametrized list stays intact.
        data = list(data)
        data[1] = neg
        out = np.all(SparseArray(data))
        assert not out

        out = np.all(SparseArray(data, fill_value=pos))
        assert not out

        msg = "the 'out' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.all(SparseArray(data), out=np.array([]))

    @pytest.mark.parametrize(
        "data,pos,neg",
        [
            ([False, True, False], True, False),
            ([0, 2, 0], 2, 0),
            ([0.0, 2.0, 0.0], 2.0, 0.0),
        ],
    )
    def test_any(self, data, pos, neg):
        """``any`` is truthy until the only truthy element becomes ``neg``."""
        # GH#17570
        out = SparseArray(data).any()
        assert out

        out = SparseArray(data, fill_value=pos).any()
        assert out

        # Copy before mutating so the parametrized list stays intact.
        data = list(data)
        data[1] = neg
        out = SparseArray(data).any()
        assert not out

        out = SparseArray(data, fill_value=pos).any()
        assert not out

    @pytest.mark.parametrize(
        "data,pos,neg",
        [
            ([False, True, False], True, False),
            ([0, 2, 0], 2, 0),
            ([0.0, 2.0, 0.0], 2.0, 0.0),
        ],
    )
    def test_numpy_any(self, data, pos, neg):
        """``np.any`` behaves like ``.any()`` and rejects ``out``."""
        # GH#17570
        out = np.any(SparseArray(data))
        assert out

        out = np.any(SparseArray(data, fill_value=pos))
        assert out

        # Copy before mutating so the parametrized list stays intact.
        data = list(data)
        data[1] = neg
        out = np.any(SparseArray(data))
        assert not out

        out = np.any(SparseArray(data, fill_value=pos))
        assert not out

        msg = "the 'out' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.any(SparseArray(data), out=out)

    def test_sum(self):
        """``sum`` leaves the NaN element out of the total for either fill value."""
        data = np.arange(10).astype(float)
        out = SparseArray(data).sum()
        assert out == 45.0

        data[5] = np.nan
        out = SparseArray(data, fill_value=2).sum()
        assert out == 40.0

        out = SparseArray(data, fill_value=np.nan).sum()
        assert out == 40.0

    @pytest.mark.parametrize(
        "arr",
        [np.array([0, 1, np.nan, 1]), np.array([0, 1, 1])],
    )
    @pytest.mark.parametrize("fill_value", [0, 1, np.nan])
    @pytest.mark.parametrize("min_count, expected", [(3, 2), (4, np.nan)])
    def test_sum_min_count(self, arr, fill_value, min_count, expected):
        """``min_count`` above the number of valid values gives NaN."""
        # GH#25777
        sparray = SparseArray(arr, fill_value=fill_value)
        result = sparray.sum(min_count=min_count)
        if np.isnan(expected):
            assert np.isnan(result)
        else:
            assert result == expected

    def test_bool_sum_min_count(self):
        """``min_count`` is honoured for boolean data with ``fill_value=True``."""
        spar_bool = SparseArray([False, True] * 5, dtype=np.bool_, fill_value=True)
        res = spar_bool.sum(min_count=1)
        assert res == 5
        res = spar_bool.sum(min_count=11)
        assert isna(res)

    def test_numpy_sum(self):
        """``np.sum`` matches ``.sum()`` and rejects ``dtype`` and ``out``."""
        data = np.arange(10).astype(float)
        out = np.sum(SparseArray(data))
        assert out == 45.0

        data[5] = np.nan
        out = np.sum(SparseArray(data, fill_value=2))
        assert out == 40.0

        out = np.sum(SparseArray(data, fill_value=np.nan))
        assert out == 40.0

        msg = "the 'dtype' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.sum(SparseArray(data), dtype=np.int64)

        msg = "the 'out' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.sum(SparseArray(data), out=out)

    def test_mean(self):
        """``mean`` averages over the non-NaN elements only."""
        data = np.arange(10).astype(float)
        out = SparseArray(data).mean()
        assert out == 4.5

        data[5] = np.nan
        out = SparseArray(data).mean()
        assert out == 40.0 / 9

    def test_numpy_mean(self):
        """``np.mean`` matches ``.mean()`` and rejects ``dtype`` and ``out``."""
        data = np.arange(10).astype(float)
        out = np.mean(SparseArray(data))
        assert out == 4.5

        data[5] = np.nan
        out = np.mean(SparseArray(data))
        assert out == 40.0 / 9

        msg = "the 'dtype' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.mean(SparseArray(data), dtype=np.int64)

        msg = "the 'out' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.mean(SparseArray(data), out=out)
class TestMinMax:
    """Tests for ``SparseArray.min`` and ``SparseArray.max``."""

    @pytest.mark.parametrize(
        "raw_data,max_expected,min_expected",
        [
            (np.arange(5.0), [4], [0]),
            (-np.arange(5.0), [0], [-4]),
            (np.array([0, 1, 2, np.nan, 4]), [4], [0]),
            (np.array([np.nan] * 5), [np.nan], [np.nan]),
            (np.array([]), [np.nan], [np.nan]),
        ],
    )
    def test_nan_fill_value(self, raw_data, max_expected, min_expected):
        """Default (NaN) fill: NaN is skipped unless ``skipna=False``."""
        sparse = SparseArray(raw_data)

        highest, lowest = sparse.max(), sparse.min()
        assert highest in max_expected
        assert lowest in min_expected

        highest, lowest = sparse.max(skipna=False), sparse.min(skipna=False)
        if not np.isnan(raw_data).any():
            assert highest in max_expected
            assert lowest in min_expected
        else:
            # Any NaN in the data is expected to propagate when not skipped.
            assert np.isnan(highest)
            assert np.isnan(lowest)

    @pytest.mark.parametrize(
        "fill_value,max_expected,min_expected",
        [
            (100, 100, 0),
            (-100, 1, -100),
        ],
    )
    def test_fill_value(self, fill_value, max_expected, min_expected):
        """The fill value itself takes part in the min/max computation."""
        sparse = SparseArray(
            np.array([fill_value, 0, 1]), dtype=SparseDtype("int", fill_value)
        )
        assert sparse.max() == max_expected
        assert sparse.min() == min_expected

    def test_only_fill_value(self):
        """With no stored values, min and max are both the fill value."""
        fv = 100
        sparse = SparseArray(np.array([fv, fv, fv]), dtype=SparseDtype("int", fv))
        assert len(sparse._valid_sp_values) == 0

        for kwargs in ({}, {"skipna": False}):
            assert sparse.max(**kwargs) == fv
            assert sparse.min(**kwargs) == fv

    @pytest.mark.parametrize("func", ["min", "max"])
    @pytest.mark.parametrize("data", [np.array([]), np.array([np.nan, np.nan])])
    @pytest.mark.parametrize(
        "dtype,expected",
        [
            (SparseDtype(np.float64, np.nan), np.nan),
            (SparseDtype(np.float64, 5.0), np.nan),
            (SparseDtype("datetime64[ns]", NaT), NaT),
            (SparseDtype("datetime64[ns]", Timestamp("2018-05-05")), NaT),
        ],
    )
    def test_na_value_if_no_valid_values(self, func, data, dtype, expected):
        """Empty or all-NaN input reduces to the dtype's missing value."""
        sparse = SparseArray(data, dtype=dtype)
        reduced = getattr(sparse, func)()

        if expected is not NaT:
            assert np.isnan(reduced)
        else:
            # TODO: pin down whether we wrap datetime64("NaT")
            assert reduced is NaT or np.isnat(reduced)
class TestArgmaxArgmin:
    """Tests for ``SparseArray.argmax`` and ``SparseArray.argmin``."""

    @pytest.mark.parametrize(
        "arr,argmax_expected,argmin_expected",
        [
            (SparseArray([1, 2, 0, 1, 2]), 1, 2),
            (SparseArray([-1, -2, 0, -1, -2]), 2, 1),
            (SparseArray([np.nan, 1, 0, 0, np.nan, -1]), 1, 5),
            (SparseArray([np.nan, 1, 0, 0, np.nan, 2]), 5, 2),
            (SparseArray([np.nan, 1, 0, 0, np.nan, 2], fill_value=-1), 5, 2),
            (SparseArray([np.nan, 1, 0, 0, np.nan, 2], fill_value=0), 5, 2),
            (SparseArray([np.nan, 1, 0, 0, np.nan, 2], fill_value=1), 5, 2),
            (SparseArray([np.nan, 1, 0, 0, np.nan, 2], fill_value=2), 5, 2),
            (SparseArray([np.nan, 1, 0, 0, np.nan, 2], fill_value=3), 5, 2),
            (SparseArray([0] * 10 + [-1], fill_value=0), 0, 10),
            (SparseArray([0] * 10 + [-1], fill_value=-1), 0, 10),
            (SparseArray([0] * 10 + [-1], fill_value=1), 0, 10),
            (SparseArray([-1] + [0] * 10, fill_value=0), 1, 0),
            (SparseArray([1] + [0] * 10, fill_value=0), 0, 1),
            (SparseArray([-1] + [0] * 10, fill_value=-1), 1, 0),
            (SparseArray([1] + [0] * 10, fill_value=1), 0, 1),
        ],
    )
    def test_argmax_argmin(self, arr, argmax_expected, argmin_expected):
        """Positions of the extremes are the same whatever the fill value."""
        positions = (arr.argmax(), arr.argmin())
        assert positions[0] == argmax_expected
        assert positions[1] == argmin_expected

    @pytest.mark.parametrize(
        "arr,method",
        [(SparseArray([]), "argmax"), (SparseArray([]), "argmin")],
    )
    def test_empty_array(self, arr, method):
        """Both methods raise ``ValueError`` on an empty array."""
        msg = f"attempt to get {method} of an empty sequence"
        with pytest.raises(ValueError, match=msg):
            # ``method`` is either "argmax" or "argmin"; dispatch by name.
            getattr(arr, method)()

View File

@ -0,0 +1,79 @@
import operator
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import SparseArray
@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning")
@pytest.mark.parametrize("fill_value", [0, np.nan])
@pytest.mark.parametrize("op", [operator.pos, operator.neg])
def test_unary_op(op, fill_value):
    """Unary ``+``/``-`` is applied to the data and to the fill value alike."""
    dense = np.array([0, 1, np.nan, 2])
    actual = op(SparseArray(dense, fill_value=fill_value))

    # Expected: the operator applied to the dense values and to the fill.
    wanted = SparseArray(op(dense), fill_value=op(fill_value))
    tm.assert_sp_array_equal(actual, wanted)
@pytest.mark.parametrize("fill_value", [True, False])
def test_invert(fill_value):
    """``~`` flips values and fill value on array, Series and DataFrame."""
    dense = np.array([True, False, False, True])
    sparse = SparseArray(dense, fill_value=fill_value)

    # Bare sparse array.
    wanted_array = SparseArray(~dense, fill_value=not fill_value)
    tm.assert_sp_array_equal(~sparse, wanted_array)

    # Series wrapping the sparse array.
    inverted_series = ~pd.Series(sparse)
    wanted_series = pd.Series(wanted_array)
    tm.assert_series_equal(inverted_series, wanted_series)

    # Single-column DataFrame wrapping the sparse array.
    inverted_frame = ~pd.DataFrame({"A": sparse})
    wanted_frame = pd.DataFrame({"A": wanted_series})
    tm.assert_frame_equal(inverted_frame, wanted_frame)
class TestUnaryMethods:
    """Tests for ``-``, ``abs`` and ``~`` applied to a ``SparseArray``."""

    @pytest.mark.filterwarnings(
        "ignore:invalid value encountered in cast:RuntimeWarning"
    )
    def test_neg_operator(self):
        """Negation flips the sign of the values and of the fill value."""
        # NaN fill value.
        source = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
        negated = -source
        wanted = SparseArray([1, 2, np.nan, -3], fill_value=np.nan, dtype=np.int8)
        tm.assert_sp_array_equal(wanted, negated)

        # Integer fill value: -1 becomes 1.
        source = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8)
        negated = -source
        wanted = SparseArray([1, 2, -1, -3], fill_value=1, dtype=np.int8)
        tm.assert_sp_array_equal(wanted, negated)

    @pytest.mark.filterwarnings(
        "ignore:invalid value encountered in cast:RuntimeWarning"
    )
    def test_abs_operator(self):
        """``abs`` is applied to the values and to the fill value."""
        # NaN fill value.
        source = SparseArray([-1, -2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
        absolute = abs(source)
        wanted = SparseArray([1, 2, np.nan, 3], fill_value=np.nan, dtype=np.int8)
        tm.assert_sp_array_equal(wanted, absolute)

        # Integer fill value: -1 becomes 1.
        source = SparseArray([-1, -2, 1, 3], fill_value=-1, dtype=np.int8)
        absolute = abs(source)
        wanted = SparseArray([1, 2, 1, 3], fill_value=1, dtype=np.int8)
        tm.assert_sp_array_equal(wanted, absolute)

    def test_invert_operator(self):
        """``~`` inverts booleans logically and integers bitwise."""
        # Boolean data: the fill value flips from False to True.
        flags = [False, True, False, True]
        source = SparseArray(flags, fill_value=False, dtype=np.bool_)
        wanted = SparseArray(np.invert(flags), fill_value=True, dtype=np.bool_)
        inverted = ~source
        tm.assert_sp_array_equal(wanted, inverted)

        # Integer data: bitwise NOT of values and of the fill value (0 -> -1).
        source = SparseArray([0, 1, 0, 2, 3, 0], fill_value=0, dtype=np.int32)
        inverted = ~source
        wanted = SparseArray([-1, -2, -1, -3, -4, -1], fill_value=-1, dtype=np.int32)
        tm.assert_sp_array_equal(wanted, inverted)