2024-12-04 13:35:57 +05:00
parent d346bf4b2a
commit 73ce681a55
7059 changed files with 1196501 additions and 0 deletions

View File

@@ -0,0 +1,6 @@
from pandas.tests.extension.array_with_attr.array import (
FloatAttrArray,
FloatAttrDtype,
)
__all__ = ["FloatAttrArray", "FloatAttrDtype"]

View File

@@ -0,0 +1,89 @@
"""
Test extension array that has custom attribute information (not stored on the dtype).
"""
from __future__ import annotations
import numbers
from typing import TYPE_CHECKING
import numpy as np
from pandas.core.dtypes.base import ExtensionDtype
import pandas as pd
from pandas.core.arrays import ExtensionArray
if TYPE_CHECKING:
from pandas._typing import type_t
class FloatAttrDtype(ExtensionDtype):
type = float
name = "float_attr"
na_value = np.nan
@classmethod
def construct_array_type(cls) -> type_t[FloatAttrArray]:
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
return FloatAttrArray
class FloatAttrArray(ExtensionArray):
dtype = FloatAttrDtype()
__array_priority__ = 1000
def __init__(self, values, attr=None) -> None:
if not isinstance(values, np.ndarray):
raise TypeError("Need to pass a numpy array of float64 dtype as values")
if not values.dtype == "float64":
raise TypeError("Need to pass a numpy array of float64 dtype as values")
self.data = values
self.attr = attr
@classmethod
def _from_sequence(cls, scalars, *, dtype=None, copy=False):
if not copy:
data = np.asarray(scalars, dtype="float64")
else:
data = np.array(scalars, dtype="float64", copy=copy)
return cls(data)
def __getitem__(self, item):
if isinstance(item, numbers.Integral):
return self.data[item]
else:
# slice, list-like, mask
item = pd.api.indexers.check_array_indexer(self, item)
return type(self)(self.data[item], self.attr)
def __len__(self) -> int:
return len(self.data)
def isna(self):
return np.isnan(self.data)
def take(self, indexer, allow_fill=False, fill_value=None):
from pandas.api.extensions import take
data = self.data
if allow_fill and fill_value is None:
fill_value = self.dtype.na_value
result = take(data, indexer, fill_value=fill_value, allow_fill=allow_fill)
return type(self)(result, self.attr)
def copy(self):
return type(self)(self.data.copy(), self.attr)
@classmethod
def _concat_same_type(cls, to_concat):
data = np.concatenate([x.data for x in to_concat])
attr = to_concat[0].attr if len(to_concat) else None
return cls(data, attr)
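
# Hedged usage sketch (illustrative, not exercised by the test suite): shows
# how the ``attr`` metadata travels through ``__getitem__``, ``take``, and
# ``_concat_same_type``, which is what the tests below rely on.
if __name__ == "__main__":
    arr = FloatAttrArray(np.array([0.5, 1.5, np.nan], dtype="float64"), attr="unit:m")
    assert arr[1:].attr == "unit:m"  # slicing preserves attr
    assert arr.take([0, 2]).attr == "unit:m"  # take preserves attr
    combined = FloatAttrArray._concat_same_type([arr, arr])
    assert combined.attr == "unit:m"  # concat keeps the first array's attr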

View File

@@ -0,0 +1,33 @@
import numpy as np
import pandas as pd
import pandas._testing as tm
from pandas.tests.extension.array_with_attr import FloatAttrArray
def test_concat_with_all_na():
# https://github.com/pandas-dev/pandas/pull/47762
# ensure that attribute of the column array is preserved (when it gets
# preserved in reindexing the array) during merge/concat
arr = FloatAttrArray(np.array([np.nan, np.nan], dtype="float64"), attr="test")
df1 = pd.DataFrame({"col": arr, "key": [0, 1]})
df2 = pd.DataFrame({"key": [0, 1], "col2": [1, 2]})
result = pd.merge(df1, df2, on="key")
expected = pd.DataFrame({"col": arr, "key": [0, 1], "col2": [1, 2]})
tm.assert_frame_equal(result, expected)
assert result["col"].array.attr == "test"
df1 = pd.DataFrame({"col": arr, "key": [0, 1]})
df2 = pd.DataFrame({"key": [0, 2], "col2": [1, 2]})
result = pd.merge(df1, df2, on="key")
expected = pd.DataFrame({"col": arr.take([0]), "key": [0], "col2": [1]})
tm.assert_frame_equal(result, expected)
assert result["col"].array.attr == "test"
result = pd.concat([df1.set_index("key"), df2.set_index("key")], axis=1)
expected = pd.DataFrame(
{"col": arr.take([0, 1, -1]), "col2": [1, np.nan, 2], "key": [0, 1, 2]}
).set_index("key")
tm.assert_frame_equal(result, expected)
assert result["col"].array.attr == "test"

View File

@@ -0,0 +1,131 @@
"""
Base test suite for extension arrays.
These tests are intended for third-party libraries to subclass to validate
that their extension arrays and dtypes satisfy the interface. Moving or
renaming the tests should not be done lightly.
Libraries are expected to implement a few pytest fixtures to provide data
for the tests. The fixtures may be located in either
* The same module as your test class.
* A ``conftest.py`` in the same directory as your test class.
The full list of fixtures may be found in the ``conftest.py`` next to this
file.
.. code-block:: python
import pytest
from pandas.tests.extension.base import BaseDtypeTests
@pytest.fixture
def dtype():
return MyDtype()
class TestMyDtype(BaseDtypeTests):
pass
Your class ``TestMyDtype`` will inherit all the tests defined on
``BaseDtypeTests``. pytest's fixture discovery will supply your ``dtype``
wherever a test requires it. You're free to implement additional tests.
"""
from pandas.tests.extension.base.accumulate import BaseAccumulateTests
from pandas.tests.extension.base.casting import BaseCastingTests
from pandas.tests.extension.base.constructors import BaseConstructorsTests
from pandas.tests.extension.base.dim2 import ( # noqa: F401
Dim2CompatTests,
NDArrayBacked2DTests,
)
from pandas.tests.extension.base.dtype import BaseDtypeTests
from pandas.tests.extension.base.getitem import BaseGetitemTests
from pandas.tests.extension.base.groupby import BaseGroupbyTests
from pandas.tests.extension.base.index import BaseIndexTests
from pandas.tests.extension.base.interface import BaseInterfaceTests
from pandas.tests.extension.base.io import BaseParsingTests
from pandas.tests.extension.base.methods import BaseMethodsTests
from pandas.tests.extension.base.missing import BaseMissingTests
from pandas.tests.extension.base.ops import ( # noqa: F401
BaseArithmeticOpsTests,
BaseComparisonOpsTests,
BaseOpsUtil,
BaseUnaryOpsTests,
)
from pandas.tests.extension.base.printing import BasePrintingTests
from pandas.tests.extension.base.reduce import BaseReduceTests
from pandas.tests.extension.base.reshaping import BaseReshapingTests
from pandas.tests.extension.base.setitem import BaseSetitemTests
# One test class that you can inherit as an alternative to inheriting all the
# test classes above.
# Note 1) this excludes Dim2CompatTests and NDArrayBacked2DTests.
# Note 2) this uses BaseReduceTests and _not_ BaseBooleanReduceTests,
# BaseNoReduceTests, or BaseNumericReduceTests.
class ExtensionTests(
BaseAccumulateTests,
BaseCastingTests,
BaseConstructorsTests,
BaseDtypeTests,
BaseGetitemTests,
BaseGroupbyTests,
BaseIndexTests,
BaseInterfaceTests,
BaseParsingTests,
BaseMethodsTests,
BaseMissingTests,
BaseArithmeticOpsTests,
BaseComparisonOpsTests,
BaseUnaryOpsTests,
BasePrintingTests,
BaseReduceTests,
BaseReshapingTests,
BaseSetitemTests,
Dim2CompatTests,
):
pass
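
# Hedged sketch of the intended third-party usage; ``MyDtype``/``MyArray`` and
# the fixture bodies are hypothetical stand-ins, and a real suite must provide
# the full set of fixtures listed in the neighboring ``conftest.py``:
#
#     import pytest
#
#     @pytest.fixture
#     def dtype():
#         return MyDtype()
#
#     @pytest.fixture
#     def data():
#         return MyArray._from_sequence(range(100), dtype=MyDtype())
#
#     class TestMyArray(ExtensionTests):
#         pass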
def __getattr__(name: str):
import warnings
if name == "BaseNoReduceTests":
warnings.warn(
"BaseNoReduceTests is deprecated and will be removed in a "
"future version. Use BaseReduceTests and override "
"`_supports_reduction` instead.",
FutureWarning,
)
from pandas.tests.extension.base.reduce import BaseNoReduceTests
return BaseNoReduceTests
elif name == "BaseNumericReduceTests":
warnings.warn(
"BaseNumericReduceTests is deprecated and will be removed in a "
"future version. Use BaseReduceTests and override "
"`_supports_reduction` instead.",
FutureWarning,
)
from pandas.tests.extension.base.reduce import BaseNumericReduceTests
return BaseNumericReduceTests
elif name == "BaseBooleanReduceTests":
warnings.warn(
"BaseBooleanReduceTests is deprecated and will be removed in a "
"future version. Use BaseReduceTests and override "
"`_supports_reduction` instead.",
FutureWarning,
)
from pandas.tests.extension.base.reduce import BaseBooleanReduceTests
return BaseBooleanReduceTests
raise AttributeError(
f"module 'pandas.tests.extension.base' has no attribute '{name}'"
)

View File

@@ -0,0 +1,39 @@
import pytest
import pandas as pd
import pandas._testing as tm
class BaseAccumulateTests:
"""
Accumulation specific tests. Generally these only
make sense for numeric/boolean operations.
"""
def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
# Do we expect this accumulation to be supported for this dtype?
# We default to assuming "no"; subclass authors should override here.
return False
def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool):
try:
alt = ser.astype("float64")
except TypeError:
# e.g. Period can't be cast to float64
alt = ser.astype(object)
result = getattr(ser, op_name)(skipna=skipna)
expected = getattr(alt, op_name)(skipna=skipna)
tm.assert_series_equal(result, expected, check_dtype=False)
@pytest.mark.parametrize("skipna", [True, False])
def test_accumulate_series(self, data, all_numeric_accumulations, skipna):
op_name = all_numeric_accumulations
ser = pd.Series(data)
if self._supports_accumulation(ser, op_name):
self.check_accumulate(ser, op_name, skipna)
else:
with pytest.raises((NotImplementedError, TypeError)):
# TODO: require TypeError for things that will _never_ work?
getattr(ser, op_name)(skipna=skipna)
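
# Hedged sketch of the override pattern described above; the class name is
# illustrative (the leading underscore keeps pytest from collecting it), and
# the opted-in set matches what masked numeric arrays such as Float64 support.
class _ExampleAccumulateTests(BaseAccumulateTests):
    def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
        # Opt in to the four standard accumulations; everything else is
        # expected to raise in test_accumulate_series.
        return op_name in {"cummin", "cummax", "cumsum", "cumprod"}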

View File

@@ -0,0 +1,2 @@
class BaseExtensionTests:
pass

View File

@@ -0,0 +1,87 @@
import numpy as np
import pytest
import pandas.util._test_decorators as td
import pandas as pd
import pandas._testing as tm
from pandas.core.internals.blocks import NumpyBlock
class BaseCastingTests:
"""Casting to and from ExtensionDtypes"""
def test_astype_object_series(self, all_data):
ser = pd.Series(all_data, name="A")
result = ser.astype(object)
assert result.dtype == np.dtype(object)
if hasattr(result._mgr, "blocks"):
blk = result._mgr.blocks[0]
assert isinstance(blk, NumpyBlock)
assert blk.is_object
assert isinstance(result._mgr.array, np.ndarray)
assert result._mgr.array.dtype == np.dtype(object)
def test_astype_object_frame(self, all_data):
df = pd.DataFrame({"A": all_data})
result = df.astype(object)
if hasattr(result._mgr, "blocks"):
blk = result._mgr.blocks[0]
assert isinstance(blk, NumpyBlock), type(blk)
assert blk.is_object
assert isinstance(result._mgr.arrays[0], np.ndarray)
assert result._mgr.arrays[0].dtype == np.dtype(object)
# check that we can compare the dtypes
comp = result.dtypes == df.dtypes
assert not comp.any()
def test_tolist(self, data):
result = pd.Series(data).tolist()
expected = list(data)
assert result == expected
def test_astype_str(self, data):
result = pd.Series(data[:5]).astype(str)
expected = pd.Series([str(x) for x in data[:5]], dtype=str)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"nullable_string_dtype",
[
"string[python]",
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
],
)
def test_astype_string(self, data, nullable_string_dtype):
# GH-33465, GH#45326 as of 2.0 we decode bytes instead of calling str(obj)
result = pd.Series(data[:5]).astype(nullable_string_dtype)
expected = pd.Series(
[str(x) if not isinstance(x, bytes) else x.decode() for x in data[:5]],
dtype=nullable_string_dtype,
)
tm.assert_series_equal(result, expected)
def test_to_numpy(self, data):
expected = np.asarray(data)
result = data.to_numpy()
tm.assert_equal(result, expected)
result = pd.Series(data).to_numpy()
tm.assert_equal(result, expected)
def test_astype_empty_dataframe(self, dtype):
# https://github.com/pandas-dev/pandas/issues/33113
df = pd.DataFrame()
result = df.astype(dtype)
tm.assert_frame_equal(result, df)
@pytest.mark.parametrize("copy", [True, False])
def test_astype_own_type(self, data, copy):
# ensure that astype returns the original object for equal dtype and copy=False
# https://github.com/pandas-dev/pandas/issues/28488
result = data.astype(data.dtype, copy=copy)
assert (result is data) is (not copy)
tm.assert_extension_array_equal(result, data)
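
# Hedged illustration of the identity contract asserted above, with a nullable
# Int64 array standing in for ``data``: for an equal dtype, ``copy=False``
# returns the object itself and ``copy=True`` returns a new object.
if __name__ == "__main__":
    arr = pd.array([1, 2, 3], dtype="Int64")
    assert arr.astype(arr.dtype, copy=False) is arr
    assert arr.astype(arr.dtype, copy=True) is not arr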

View File

@@ -0,0 +1,142 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.api.extensions import ExtensionArray
from pandas.core.internals.blocks import EABackedBlock
class BaseConstructorsTests:
def test_from_sequence_from_cls(self, data):
result = type(data)._from_sequence(data, dtype=data.dtype)
tm.assert_extension_array_equal(result, data)
data = data[:0]
result = type(data)._from_sequence(data, dtype=data.dtype)
tm.assert_extension_array_equal(result, data)
def test_array_from_scalars(self, data):
scalars = [data[0], data[1], data[2]]
result = data._from_sequence(scalars, dtype=data.dtype)
assert isinstance(result, type(data))
def test_series_constructor(self, data):
result = pd.Series(data, copy=False)
assert result.dtype == data.dtype
assert len(result) == len(data)
if hasattr(result._mgr, "blocks"):
assert isinstance(result._mgr.blocks[0], EABackedBlock)
assert result._mgr.array is data
# Series[EA] is unboxed / boxed correctly
result2 = pd.Series(result)
assert result2.dtype == data.dtype
if hasattr(result._mgr, "blocks"):
assert isinstance(result2._mgr.blocks[0], EABackedBlock)
def test_series_constructor_no_data_with_index(self, dtype, na_value):
result = pd.Series(index=[1, 2, 3], dtype=dtype)
expected = pd.Series([na_value] * 3, index=[1, 2, 3], dtype=dtype)
tm.assert_series_equal(result, expected)
# GH 33559 - empty index
result = pd.Series(index=[], dtype=dtype)
expected = pd.Series([], index=pd.Index([], dtype="object"), dtype=dtype)
tm.assert_series_equal(result, expected)
def test_series_constructor_scalar_na_with_index(self, dtype, na_value):
result = pd.Series(na_value, index=[1, 2, 3], dtype=dtype)
expected = pd.Series([na_value] * 3, index=[1, 2, 3], dtype=dtype)
tm.assert_series_equal(result, expected)
def test_series_constructor_scalar_with_index(self, data, dtype):
scalar = data[0]
result = pd.Series(scalar, index=[1, 2, 3], dtype=dtype)
expected = pd.Series([scalar] * 3, index=[1, 2, 3], dtype=dtype)
tm.assert_series_equal(result, expected)
result = pd.Series(scalar, index=["foo"], dtype=dtype)
expected = pd.Series([scalar], index=["foo"], dtype=dtype)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("from_series", [True, False])
def test_dataframe_constructor_from_dict(self, data, from_series):
if from_series:
data = pd.Series(data)
result = pd.DataFrame({"A": data})
assert result.dtypes["A"] == data.dtype
assert result.shape == (len(data), 1)
if hasattr(result._mgr, "blocks"):
assert isinstance(result._mgr.blocks[0], EABackedBlock)
assert isinstance(result._mgr.arrays[0], ExtensionArray)
def test_dataframe_from_series(self, data):
result = pd.DataFrame(pd.Series(data))
assert result.dtypes[0] == data.dtype
assert result.shape == (len(data), 1)
if hasattr(result._mgr, "blocks"):
assert isinstance(result._mgr.blocks[0], EABackedBlock)
assert isinstance(result._mgr.arrays[0], ExtensionArray)
def test_series_given_mismatched_index_raises(self, data):
msg = r"Length of values \(3\) does not match length of index \(5\)"
with pytest.raises(ValueError, match=msg):
pd.Series(data[:3], index=[0, 1, 2, 3, 4])
def test_from_dtype(self, data):
# construct from our dtype & string dtype
dtype = data.dtype
expected = pd.Series(data)
result = pd.Series(list(data), dtype=dtype)
tm.assert_series_equal(result, expected)
result = pd.Series(list(data), dtype=str(dtype))
tm.assert_series_equal(result, expected)
# gh-30280
expected = pd.DataFrame(data).astype(dtype)
result = pd.DataFrame(list(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
result = pd.DataFrame(list(data), dtype=str(dtype))
tm.assert_frame_equal(result, expected)
def test_pandas_array(self, data):
# pd.array(extension_array) should be idempotent...
result = pd.array(data)
tm.assert_extension_array_equal(result, data)
def test_pandas_array_dtype(self, data):
# ... but specifying dtype will override idempotency
result = pd.array(data, dtype=np.dtype(object))
expected = pd.arrays.NumpyExtensionArray(np.asarray(data, dtype=object))
tm.assert_equal(result, expected)
def test_construct_empty_dataframe(self, dtype):
# GH 33623
result = pd.DataFrame(columns=["a"], dtype=dtype)
expected = pd.DataFrame(
{"a": pd.array([], dtype=dtype)}, index=pd.RangeIndex(0)
)
tm.assert_frame_equal(result, expected)
def test_empty(self, dtype):
cls = dtype.construct_array_type()
result = cls._empty((4,), dtype=dtype)
assert isinstance(result, cls)
assert result.dtype == dtype
assert result.shape == (4,)
# GH#19600 method on ExtensionDtype
result2 = dtype.empty((4,))
assert isinstance(result2, cls)
assert result2.dtype == dtype
assert result2.shape == (4,)
result2 = dtype.empty(4)
assert isinstance(result2, cls)
assert result2.dtype == dtype
assert result2.shape == (4,)
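
# Hedged illustration of the ``ExtensionDtype.empty`` contract exercised
# above, using the nullable Int64 dtype as a stand-in; only the shape and the
# array type are guaranteed, the contents of the result are unspecified.
if __name__ == "__main__":
    dtype = pd.Int64Dtype()
    arr = dtype.empty((3,))
    assert isinstance(arr, dtype.construct_array_type())
    assert arr.shape == (3,)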

View File

@@ -0,0 +1,345 @@
"""
Tests for 2D compatibility.
"""
import numpy as np
import pytest
from pandas._libs.missing import is_matching_na
from pandas.core.dtypes.common import (
is_bool_dtype,
is_integer_dtype,
)
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.integer import NUMPY_INT_TO_DTYPE
class Dim2CompatTests:
# Note: these are ONLY for ExtensionArray subclasses that support 2D arrays.
# i.e. not for pyarrow-backed EAs.
@pytest.fixture(autouse=True)
def skip_if_doesnt_support_2d(self, dtype, request):
if not dtype._supports_2d:
node = request.node
# In cases where we are mixed in to ExtensionTests, we only want to
# skip tests that are defined in Dim2CompatTests
test_func = node._obj
if test_func.__qualname__.startswith("Dim2CompatTests"):
# TODO: is there a less hacky way of checking this?
pytest.skip(f"{dtype} does not support 2D.")
def test_transpose(self, data):
arr2d = data.repeat(2).reshape(-1, 2)
shape = arr2d.shape
assert shape[0] != shape[-1] # otherwise the rest of the test is useless
assert arr2d.T.shape == shape[::-1]
def test_frame_from_2d_array(self, data):
arr2d = data.repeat(2).reshape(-1, 2)
df = pd.DataFrame(arr2d)
expected = pd.DataFrame({0: arr2d[:, 0], 1: arr2d[:, 1]})
tm.assert_frame_equal(df, expected)
def test_swapaxes(self, data):
arr2d = data.repeat(2).reshape(-1, 2)
result = arr2d.swapaxes(0, 1)
expected = arr2d.T
tm.assert_extension_array_equal(result, expected)
def test_delete_2d(self, data):
arr2d = data.repeat(3).reshape(-1, 3)
# axis = 0
result = arr2d.delete(1, axis=0)
expected = data.delete(1).repeat(3).reshape(-1, 3)
tm.assert_extension_array_equal(result, expected)
# axis = 1
result = arr2d.delete(1, axis=1)
expected = data.repeat(2).reshape(-1, 2)
tm.assert_extension_array_equal(result, expected)
def test_take_2d(self, data):
arr2d = data.reshape(-1, 1)
result = arr2d.take([0, 0, -1], axis=0)
expected = data.take([0, 0, -1]).reshape(-1, 1)
tm.assert_extension_array_equal(result, expected)
def test_repr_2d(self, data):
# this could fail in a corner case where an element contained the name
res = repr(data.reshape(1, -1))
assert res.count(f"<{type(data).__name__}") == 1
res = repr(data.reshape(-1, 1))
assert res.count(f"<{type(data).__name__}") == 1
def test_reshape(self, data):
arr2d = data.reshape(-1, 1)
assert arr2d.shape == (data.size, 1)
assert len(arr2d) == len(data)
arr2d = data.reshape((-1, 1))
assert arr2d.shape == (data.size, 1)
assert len(arr2d) == len(data)
with pytest.raises(ValueError):
data.reshape((data.size, 2))
with pytest.raises(ValueError):
data.reshape(data.size, 2)
def test_getitem_2d(self, data):
arr2d = data.reshape(1, -1)
result = arr2d[0]
tm.assert_extension_array_equal(result, data)
with pytest.raises(IndexError):
arr2d[1]
with pytest.raises(IndexError):
arr2d[-2]
result = arr2d[:]
tm.assert_extension_array_equal(result, arr2d)
result = arr2d[:, :]
tm.assert_extension_array_equal(result, arr2d)
result = arr2d[:, 0]
expected = data[[0]]
tm.assert_extension_array_equal(result, expected)
# dimension-expanding getitem on 1D
result = data[:, np.newaxis]
tm.assert_extension_array_equal(result, arr2d.T)
def test_iter_2d(self, data):
arr2d = data.reshape(1, -1)
objs = list(iter(arr2d))
assert len(objs) == arr2d.shape[0]
for obj in objs:
assert isinstance(obj, type(data))
assert obj.dtype == data.dtype
assert obj.ndim == 1
assert len(obj) == arr2d.shape[1]
def test_tolist_2d(self, data):
arr2d = data.reshape(1, -1)
result = arr2d.tolist()
expected = [data.tolist()]
assert isinstance(result, list)
assert all(isinstance(x, list) for x in result)
assert result == expected
def test_concat_2d(self, data):
left = type(data)._concat_same_type([data, data]).reshape(-1, 2)
right = left.copy()
# axis=0
result = left._concat_same_type([left, right], axis=0)
expected = data._concat_same_type([data] * 4).reshape(-1, 2)
tm.assert_extension_array_equal(result, expected)
# axis=1
result = left._concat_same_type([left, right], axis=1)
assert result.shape == (len(data), 4)
tm.assert_extension_array_equal(result[:, :2], left)
tm.assert_extension_array_equal(result[:, 2:], right)
# axis > 1 -> invalid
msg = "axis 2 is out of bounds for array of dimension 2"
with pytest.raises(ValueError, match=msg):
left._concat_same_type([left, right], axis=2)
@pytest.mark.parametrize("method", ["backfill", "pad"])
def test_fillna_2d_method(self, data_missing, method):
# pad_or_backfill is always along axis=0
arr = data_missing.repeat(2).reshape(2, 2)
assert arr[0].isna().all()
assert not arr[1].isna().any()
result = arr._pad_or_backfill(method=method, limit=None)
expected = data_missing._pad_or_backfill(method=method).repeat(2).reshape(2, 2)
tm.assert_extension_array_equal(result, expected)
# Reverse so that backfill is not a no-op.
arr2 = arr[::-1]
assert not arr2[0].isna().any()
assert arr2[1].isna().all()
result2 = arr2._pad_or_backfill(method=method, limit=None)
expected2 = (
data_missing[::-1]._pad_or_backfill(method=method).repeat(2).reshape(2, 2)
)
tm.assert_extension_array_equal(result2, expected2)
@pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"])
def test_reductions_2d_axis_none(self, data, method):
arr2d = data.reshape(1, -1)
err_expected = None
err_result = None
try:
expected = getattr(data, method)()
except Exception as err:
# if the 1D reduction is invalid, the 2D reduction should be as well
err_expected = err
try:
result = getattr(arr2d, method)(axis=None)
except Exception as err2:
err_result = err2
else:
result = getattr(arr2d, method)(axis=None)
if err_result is not None or err_expected is not None:
assert type(err_result) == type(err_expected)
return
assert is_matching_na(result, expected) or result == expected
@pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"])
@pytest.mark.parametrize("min_count", [0, 1])
def test_reductions_2d_axis0(self, data, method, min_count):
if min_count == 1 and method not in ["sum", "prod"]:
pytest.skip(f"min_count not relevant for {method}")
arr2d = data.reshape(1, -1)
kwargs = {}
if method in ["std", "var"]:
# pass ddof=0 so we get all-zero std instead of all-NA std
kwargs["ddof"] = 0
elif method in ["prod", "sum"]:
kwargs["min_count"] = min_count
try:
result = getattr(arr2d, method)(axis=0, **kwargs)
except Exception as err:
try:
getattr(data, method)()
except Exception as err2:
assert type(err) == type(err2)
return
else:
raise AssertionError("Both reductions should raise or neither")
def get_reduction_result_dtype(dtype):
# windows and 32bit builds will in some cases have int32/uint32
# where other builds will have int64/uint64.
if dtype.itemsize == 8:
return dtype
elif dtype.kind in "ib":
return NUMPY_INT_TO_DTYPE[np.dtype(int)]
else:
# i.e. dtype.kind == "u"
return NUMPY_INT_TO_DTYPE[np.dtype("uint")]
if method in ["sum", "prod"]:
# std and var are not dtype-preserving
expected = data
if data.dtype.kind in "iub":
dtype = get_reduction_result_dtype(data.dtype)
expected = data.astype(dtype)
assert dtype == expected.dtype
if min_count == 0:
fill_value = 1 if method == "prod" else 0
expected = expected.fillna(fill_value)
tm.assert_extension_array_equal(result, expected)
elif method == "median":
# std and var are not dtype-preserving
expected = data
tm.assert_extension_array_equal(result, expected)
elif method in ["mean", "std", "var"]:
if is_integer_dtype(data) or is_bool_dtype(data):
data = data.astype("Float64")
if method == "mean":
tm.assert_extension_array_equal(result, data)
else:
tm.assert_extension_array_equal(result, data - data)
@pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"])
def test_reductions_2d_axis1(self, data, method):
arr2d = data.reshape(1, -1)
try:
result = getattr(arr2d, method)(axis=1)
except Exception as err:
try:
getattr(data, method)()
except Exception as err2:
assert type(err) == type(err2)
return
else:
raise AssertionError("Both reductions should raise or neither")
# not necessarily type/dtype-preserving, so weaker assertions
assert result.shape == (1,)
expected_scalar = getattr(data, method)()
res = result[0]
assert is_matching_na(res, expected_scalar) or res == expected_scalar
class NDArrayBacked2DTests(Dim2CompatTests):
# More specific tests for NDArrayBackedExtensionArray subclasses
def test_copy_order(self, data):
# We should be matching numpy semantics for the "order" keyword in 'copy'
arr2d = data.repeat(2).reshape(-1, 2)
assert arr2d._ndarray.flags["C_CONTIGUOUS"]
res = arr2d.copy()
assert res._ndarray.flags["C_CONTIGUOUS"]
res = arr2d[::2, ::2].copy()
assert res._ndarray.flags["C_CONTIGUOUS"]
res = arr2d.copy("F")
assert not res._ndarray.flags["C_CONTIGUOUS"]
assert res._ndarray.flags["F_CONTIGUOUS"]
res = arr2d.copy("K")
assert res._ndarray.flags["C_CONTIGUOUS"]
res = arr2d.T.copy("K")
assert not res._ndarray.flags["C_CONTIGUOUS"]
assert res._ndarray.flags["F_CONTIGUOUS"]
# order not accepted by numpy
msg = r"order must be one of 'C', 'F', 'A', or 'K' \(got 'Q'\)"
with pytest.raises(ValueError, match=msg):
arr2d.copy("Q")
# neither contiguity
arr_nc = arr2d[::2]
assert not arr_nc._ndarray.flags["C_CONTIGUOUS"]
assert not arr_nc._ndarray.flags["F_CONTIGUOUS"]
assert arr_nc.copy()._ndarray.flags["C_CONTIGUOUS"]
assert not arr_nc.copy()._ndarray.flags["F_CONTIGUOUS"]
assert arr_nc.copy("C")._ndarray.flags["C_CONTIGUOUS"]
assert not arr_nc.copy("C")._ndarray.flags["F_CONTIGUOUS"]
assert not arr_nc.copy("F")._ndarray.flags["C_CONTIGUOUS"]
assert arr_nc.copy("F")._ndarray.flags["F_CONTIGUOUS"]
assert arr_nc.copy("K")._ndarray.flags["C_CONTIGUOUS"]
assert not arr_nc.copy("K")._ndarray.flags["F_CONTIGUOUS"]

View File

@@ -0,0 +1,123 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.api.types import (
infer_dtype,
is_object_dtype,
is_string_dtype,
)
class BaseDtypeTests:
"""Base class for ExtensionDtype classes"""
def test_name(self, dtype):
assert isinstance(dtype.name, str)
def test_kind(self, dtype):
valid = set("biufcmMOSUV")
assert dtype.kind in valid
def test_is_dtype_from_name(self, dtype):
result = type(dtype).is_dtype(dtype.name)
assert result is True
def test_is_dtype_unboxes_dtype(self, data, dtype):
assert dtype.is_dtype(data) is True
def test_is_dtype_from_self(self, dtype):
result = type(dtype).is_dtype(dtype)
assert result is True
def test_is_dtype_other_input(self, dtype):
assert dtype.is_dtype([1, 2, 3]) is False
def test_is_not_string_type(self, dtype):
assert not is_string_dtype(dtype)
def test_is_not_object_type(self, dtype):
assert not is_object_dtype(dtype)
def test_eq_with_str(self, dtype):
assert dtype == dtype.name
assert dtype != dtype.name + "-suffix"
def test_eq_with_numpy_object(self, dtype):
assert dtype != np.dtype("object")
def test_eq_with_self(self, dtype):
assert dtype == dtype
assert dtype != object()
def test_array_type(self, data, dtype):
assert dtype.construct_array_type() is type(data)
def test_check_dtype(self, data):
dtype = data.dtype
# check equivalency for using .dtypes
df = pd.DataFrame(
{
"A": pd.Series(data, dtype=dtype),
"B": data,
"C": pd.Series(["foo"] * len(data), dtype=object),
"D": 1,
}
)
result = df.dtypes == str(dtype)
assert np.dtype("int64") != "Int64"
expected = pd.Series([True, True, False, False], index=list("ABCD"))
tm.assert_series_equal(result, expected)
expected = pd.Series([True, True, False, False], index=list("ABCD"))
result = df.dtypes.apply(str) == str(dtype)
tm.assert_series_equal(result, expected)
def test_hashable(self, dtype):
hash(dtype) # no error
def test_str(self, dtype):
assert str(dtype) == dtype.name
def test_eq(self, dtype):
assert dtype == dtype.name
assert dtype != "anonther_type"
def test_construct_from_string_own_name(self, dtype):
result = dtype.construct_from_string(dtype.name)
assert type(result) is type(dtype)
# check OK as classmethod
result = type(dtype).construct_from_string(dtype.name)
assert type(result) is type(dtype)
def test_construct_from_string_another_type_raises(self, dtype):
msg = f"Cannot construct a '{type(dtype).__name__}' from 'another_type'"
with pytest.raises(TypeError, match=msg):
type(dtype).construct_from_string("another_type")
def test_construct_from_string_wrong_type_raises(self, dtype):
with pytest.raises(
TypeError,
match="'construct_from_string' expects a string, got <class 'int'>",
):
type(dtype).construct_from_string(0)
def test_get_common_dtype(self, dtype):
# in practice we will not typically call this with a 1-length list
# (we shortcut to just use that dtype as the common dtype), but
# still testing as good practice to have this working (and it is the
# only case we can test in general)
assert dtype._get_common_dtype([dtype]) == dtype
@pytest.mark.parametrize("skipna", [True, False])
def test_infer_dtype(self, data, data_missing, skipna):
# only testing that this works without raising an error
res = infer_dtype(data, skipna=skipna)
assert isinstance(res, str)
res = infer_dtype(data_missing, skipna=skipna)
assert isinstance(res, str)

View File

@@ -0,0 +1,469 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
class BaseGetitemTests:
"""Tests for ExtensionArray.__getitem__."""
def test_iloc_series(self, data):
ser = pd.Series(data)
result = ser.iloc[:4]
expected = pd.Series(data[:4])
tm.assert_series_equal(result, expected)
result = ser.iloc[[0, 1, 2, 3]]
tm.assert_series_equal(result, expected)
def test_iloc_frame(self, data):
df = pd.DataFrame({"A": data, "B": np.arange(len(data), dtype="int64")})
expected = pd.DataFrame({"A": data[:4]})
# slice -> frame
result = df.iloc[:4, [0]]
tm.assert_frame_equal(result, expected)
# sequence -> frame
result = df.iloc[[0, 1, 2, 3], [0]]
tm.assert_frame_equal(result, expected)
expected = pd.Series(data[:4], name="A")
# slice -> series
result = df.iloc[:4, 0]
tm.assert_series_equal(result, expected)
# sequence -> series
        result = df.iloc[[0, 1, 2, 3], 0]
tm.assert_series_equal(result, expected)
# GH#32959 slice columns with step
result = df.iloc[:, ::2]
tm.assert_frame_equal(result, df[["A"]])
result = df[["B", "A"]].iloc[:, ::2]
tm.assert_frame_equal(result, df[["B"]])
def test_iloc_frame_single_block(self, data):
# GH#32959 null slice along index, slice along columns with single-block
df = pd.DataFrame({"A": data})
result = df.iloc[:, :]
tm.assert_frame_equal(result, df)
result = df.iloc[:, :1]
tm.assert_frame_equal(result, df)
result = df.iloc[:, :2]
tm.assert_frame_equal(result, df)
result = df.iloc[:, ::2]
tm.assert_frame_equal(result, df)
result = df.iloc[:, 1:2]
tm.assert_frame_equal(result, df.iloc[:, :0])
result = df.iloc[:, -1:]
tm.assert_frame_equal(result, df)
def test_loc_series(self, data):
ser = pd.Series(data)
result = ser.loc[:3]
expected = pd.Series(data[:4])
tm.assert_series_equal(result, expected)
result = ser.loc[[0, 1, 2, 3]]
tm.assert_series_equal(result, expected)
def test_loc_frame(self, data):
df = pd.DataFrame({"A": data, "B": np.arange(len(data), dtype="int64")})
expected = pd.DataFrame({"A": data[:4]})
# slice -> frame
result = df.loc[:3, ["A"]]
tm.assert_frame_equal(result, expected)
# sequence -> frame
result = df.loc[[0, 1, 2, 3], ["A"]]
tm.assert_frame_equal(result, expected)
expected = pd.Series(data[:4], name="A")
# slice -> series
result = df.loc[:3, "A"]
tm.assert_series_equal(result, expected)
# sequence -> series
result = df.loc[:3, "A"]
tm.assert_series_equal(result, expected)
def test_loc_iloc_frame_single_dtype(self, data):
# GH#27110 bug in ExtensionBlock.iget caused df.iloc[n] to incorrectly
# return a scalar
df = pd.DataFrame({"A": data})
expected = pd.Series([data[2]], index=["A"], name=2, dtype=data.dtype)
result = df.loc[2]
tm.assert_series_equal(result, expected)
expected = pd.Series(
[data[-1]], index=["A"], name=len(data) - 1, dtype=data.dtype
)
result = df.iloc[-1]
tm.assert_series_equal(result, expected)
def test_getitem_scalar(self, data):
result = data[0]
assert isinstance(result, data.dtype.type)
result = pd.Series(data)[0]
assert isinstance(result, data.dtype.type)
def test_getitem_invalid(self, data):
# TODO: box over scalar, [scalar], (scalar,)?
msg = (
r"only integers, slices \(`:`\), ellipsis \(`...`\), numpy.newaxis "
r"\(`None`\) and integer or boolean arrays are valid indices"
)
with pytest.raises(IndexError, match=msg):
data["foo"]
with pytest.raises(IndexError, match=msg):
data[2.5]
ub = len(data)
msg = "|".join(
[
"list index out of range", # json
"index out of bounds", # pyarrow
"Out of bounds access", # Sparse
f"loc must be an integer between -{ub} and {ub}", # Sparse
f"index {ub+1} is out of bounds for axis 0 with size {ub}",
f"index -{ub+1} is out of bounds for axis 0 with size {ub}",
]
)
with pytest.raises(IndexError, match=msg):
data[ub + 1]
with pytest.raises(IndexError, match=msg):
data[-ub - 1]
def test_getitem_scalar_na(self, data_missing, na_cmp, na_value):
result = data_missing[0]
assert na_cmp(result, na_value)
def test_getitem_empty(self, data):
# Indexing with empty list
result = data[[]]
assert len(result) == 0
assert isinstance(result, type(data))
expected = data[np.array([], dtype="int64")]
tm.assert_extension_array_equal(result, expected)
def test_getitem_mask(self, data):
# Empty mask, raw array
mask = np.zeros(len(data), dtype=bool)
result = data[mask]
assert len(result) == 0
assert isinstance(result, type(data))
# Empty mask, in series
mask = np.zeros(len(data), dtype=bool)
result = pd.Series(data)[mask]
assert len(result) == 0
assert result.dtype == data.dtype
# non-empty mask, raw array
mask[0] = True
result = data[mask]
assert len(result) == 1
assert isinstance(result, type(data))
# non-empty mask, in series
result = pd.Series(data)[mask]
assert len(result) == 1
assert result.dtype == data.dtype
def test_getitem_mask_raises(self, data):
mask = np.array([True, False])
msg = f"Boolean index has wrong length: 2 instead of {len(data)}"
with pytest.raises(IndexError, match=msg):
data[mask]
mask = pd.array(mask, dtype="boolean")
with pytest.raises(IndexError, match=msg):
data[mask]
def test_getitem_boolean_array_mask(self, data):
mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
result = data[mask]
assert len(result) == 0
assert isinstance(result, type(data))
result = pd.Series(data)[mask]
assert len(result) == 0
assert result.dtype == data.dtype
mask[:5] = True
expected = data.take([0, 1, 2, 3, 4])
result = data[mask]
tm.assert_extension_array_equal(result, expected)
expected = pd.Series(expected)
result = pd.Series(data)[mask]
tm.assert_series_equal(result, expected)
def test_getitem_boolean_na_treated_as_false(self, data):
# https://github.com/pandas-dev/pandas/issues/31503
mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
mask[:2] = pd.NA
mask[2:4] = True
result = data[mask]
expected = data[mask.fillna(False)]
tm.assert_extension_array_equal(result, expected)
s = pd.Series(data)
result = s[mask]
expected = s[mask.fillna(False)]
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"idx",
[[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
ids=["list", "integer-array", "numpy-array"],
)
def test_getitem_integer_array(self, data, idx):
result = data[idx]
assert len(result) == 3
assert isinstance(result, type(data))
expected = data.take([0, 1, 2])
tm.assert_extension_array_equal(result, expected)
expected = pd.Series(expected)
result = pd.Series(data)[idx]
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"idx",
[[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")],
ids=["list", "integer-array"],
)
def test_getitem_integer_with_missing_raises(self, data, idx):
msg = "Cannot index with an integer indexer containing NA values"
with pytest.raises(ValueError, match=msg):
data[idx]
@pytest.mark.xfail(
reason="Tries label-based and raises KeyError; "
"in some cases raises when calling np.asarray"
)
@pytest.mark.parametrize(
"idx",
[[0, 1, 2, pd.NA], pd.array([0, 1, 2, pd.NA], dtype="Int64")],
ids=["list", "integer-array"],
)
def test_getitem_series_integer_with_missing_raises(self, data, idx):
msg = "Cannot index with an integer indexer containing NA values"
# TODO: this raises KeyError about labels not found (it tries label-based)
ser = pd.Series(data, index=[chr(100 + i) for i in range(len(data))])
with pytest.raises(ValueError, match=msg):
ser[idx]
def test_getitem_slice(self, data):
# getitem[slice] should return an array
result = data[slice(0)] # empty
assert isinstance(result, type(data))
result = data[slice(1)] # scalar
assert isinstance(result, type(data))
def test_getitem_ellipsis_and_slice(self, data):
# GH#40353 this is called from slice_block_rows
result = data[..., :]
tm.assert_extension_array_equal(result, data)
result = data[:, ...]
tm.assert_extension_array_equal(result, data)
result = data[..., :3]
tm.assert_extension_array_equal(result, data[:3])
result = data[:3, ...]
tm.assert_extension_array_equal(result, data[:3])
result = data[..., ::2]
tm.assert_extension_array_equal(result, data[::2])
result = data[::2, ...]
tm.assert_extension_array_equal(result, data[::2])
def test_get(self, data):
# GH 20882
s = pd.Series(data, index=[2 * i for i in range(len(data))])
assert s.get(4) == s.iloc[2]
result = s.get([4, 6])
expected = s.iloc[[2, 3]]
tm.assert_series_equal(result, expected)
result = s.get(slice(2))
expected = s.iloc[[0, 1]]
tm.assert_series_equal(result, expected)
assert s.get(-1) is None
assert s.get(s.index.max() + 1) is None
s = pd.Series(data[:6], index=list("abcdef"))
assert s.get("c") == s.iloc[2]
result = s.get(slice("b", "d"))
expected = s.iloc[[1, 2, 3]]
tm.assert_series_equal(result, expected)
result = s.get("Z")
assert result is None
msg = "Series.__getitem__ treating keys as positions is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
assert s.get(4) == s.iloc[4]
assert s.get(-1) == s.iloc[-1]
assert s.get(len(s)) is None
# GH 21257
s = pd.Series(data)
with tm.assert_produces_warning(None):
# GH#45324 make sure we aren't giving a spurious FutureWarning
s2 = s[::2]
assert s2.get(1) is None
def test_take_sequence(self, data):
result = pd.Series(data)[[0, 1, 3]]
assert result.iloc[0] == data[0]
assert result.iloc[1] == data[1]
assert result.iloc[2] == data[3]
def test_take(self, data, na_value, na_cmp):
result = data.take([0, -1])
assert result.dtype == data.dtype
assert result[0] == data[0]
assert result[1] == data[-1]
result = data.take([0, -1], allow_fill=True, fill_value=na_value)
assert result[0] == data[0]
assert na_cmp(result[1], na_value)
with pytest.raises(IndexError, match="out of bounds"):
data.take([len(data) + 1])
def test_take_empty(self, data, na_value, na_cmp):
empty = data[:0]
result = empty.take([-1], allow_fill=True)
assert na_cmp(result[0], na_value)
msg = "cannot do a non-empty take from an empty axes|out of bounds"
with pytest.raises(IndexError, match=msg):
empty.take([-1])
with pytest.raises(IndexError, match="cannot do a non-empty take"):
empty.take([0, 1])
def test_take_negative(self, data):
# https://github.com/pandas-dev/pandas/issues/20640
n = len(data)
result = data.take([0, -n, n - 1, -1])
expected = data.take([0, 0, n - 1, n - 1])
tm.assert_extension_array_equal(result, expected)
def test_take_non_na_fill_value(self, data_missing):
fill_value = data_missing[1] # valid
na = data_missing[0]
arr = data_missing._from_sequence(
[na, fill_value, na], dtype=data_missing.dtype
)
result = arr.take([-1, 1], fill_value=fill_value, allow_fill=True)
expected = arr.take([1, 1])
tm.assert_extension_array_equal(result, expected)
def test_take_pandas_style_negative_raises(self, data, na_value):
with pytest.raises(ValueError, match=""):
data.take([0, -2], fill_value=na_value, allow_fill=True)
@pytest.mark.parametrize("allow_fill", [True, False])
def test_take_out_of_bounds_raises(self, data, allow_fill):
arr = data[:3]
with pytest.raises(IndexError, match="out of bounds|out-of-bounds"):
arr.take(np.asarray([0, 3]), allow_fill=allow_fill)
def test_take_series(self, data):
s = pd.Series(data)
result = s.take([0, -1])
expected = pd.Series(
data._from_sequence([data[0], data[len(data) - 1]], dtype=s.dtype),
index=[0, len(data) - 1],
)
tm.assert_series_equal(result, expected)
def test_reindex(self, data, na_value):
s = pd.Series(data)
result = s.reindex([0, 1, 3])
expected = pd.Series(data.take([0, 1, 3]), index=[0, 1, 3])
tm.assert_series_equal(result, expected)
n = len(data)
result = s.reindex([-1, 0, n])
expected = pd.Series(
data._from_sequence([na_value, data[0], na_value], dtype=s.dtype),
index=[-1, 0, n],
)
tm.assert_series_equal(result, expected)
result = s.reindex([n, n + 1])
expected = pd.Series(
data._from_sequence([na_value, na_value], dtype=s.dtype), index=[n, n + 1]
)
tm.assert_series_equal(result, expected)
def test_reindex_non_na_fill_value(self, data_missing):
valid = data_missing[1]
na = data_missing[0]
arr = data_missing._from_sequence([na, valid], dtype=data_missing.dtype)
ser = pd.Series(arr)
result = ser.reindex([0, 1, 2], fill_value=valid)
expected = pd.Series(
data_missing._from_sequence([na, valid, valid], dtype=data_missing.dtype)
)
tm.assert_series_equal(result, expected)
def test_loc_len1(self, data):
# see GH-27785 take_nd with indexer of len 1 resulting in wrong ndim
df = pd.DataFrame({"A": data})
res = df.loc[[0], "A"]
assert res.ndim == 1
assert res._mgr.arrays[0].ndim == 1
if hasattr(res._mgr, "blocks"):
assert res._mgr._block.ndim == 1
def test_item(self, data):
# https://github.com/pandas-dev/pandas/pull/30175
s = pd.Series(data)
result = s[:1].item()
assert result == data[0]
msg = "can only convert an array of size 1 to a Python scalar"
with pytest.raises(ValueError, match=msg):
s[:0].item()
with pytest.raises(ValueError, match=msg):
s.item()
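
# Hedged illustration of the NA-as-False masking rule tested in
# test_getitem_boolean_na_treated_as_false (GH#31503), with a nullable Int64
# array standing in for ``data``.
if __name__ == "__main__":
    arr = pd.array([1, 2, 3, 4], dtype="Int64")
    mask = pd.array([None, None, True, False], dtype="boolean")
    assert list(arr[mask]) == [3]  # NA entries in the mask select nothing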

View File

@@ -0,0 +1,174 @@
import re
import pytest
from pandas.core.dtypes.common import (
is_bool_dtype,
is_numeric_dtype,
is_object_dtype,
is_string_dtype,
)
import pandas as pd
import pandas._testing as tm
@pytest.mark.filterwarnings(
"ignore:The default of observed=False is deprecated:FutureWarning"
)
class BaseGroupbyTests:
"""Groupby-specific tests."""
def test_grouping_grouper(self, data_for_grouping):
df = pd.DataFrame(
{
"A": pd.Series(
["B", "B", None, None, "A", "A", "B", "C"], dtype=object
),
"B": data_for_grouping,
}
)
gr1 = df.groupby("A")._grouper.groupings[0]
gr2 = df.groupby("B")._grouper.groupings[0]
tm.assert_numpy_array_equal(gr1.grouping_vector, df.A.values)
tm.assert_extension_array_equal(gr2.grouping_vector, data_for_grouping)
@pytest.mark.parametrize("as_index", [True, False])
def test_groupby_extension_agg(self, as_index, data_for_grouping):
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
is_bool = data_for_grouping.dtype._is_boolean
if is_bool:
# only 2 unique values, and the final entry has c==b
# (see data_for_grouping docstring)
df = df.iloc[:-1]
result = df.groupby("B", as_index=as_index).A.mean()
_, uniques = pd.factorize(data_for_grouping, sort=True)
exp_vals = [3.0, 1.0, 4.0]
if is_bool:
exp_vals = exp_vals[:-1]
if as_index:
index = pd.Index(uniques, name="B")
expected = pd.Series(exp_vals, index=index, name="A")
tm.assert_series_equal(result, expected)
else:
expected = pd.DataFrame({"B": uniques, "A": exp_vals})
tm.assert_frame_equal(result, expected)
def test_groupby_agg_extension(self, data_for_grouping):
# GH#38980 groupby agg on extension type fails for non-numeric types
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
expected = df.iloc[[0, 2, 4, 7]]
expected = expected.set_index("A")
result = df.groupby("A").agg({"B": "first"})
tm.assert_frame_equal(result, expected)
result = df.groupby("A").agg("first")
tm.assert_frame_equal(result, expected)
result = df.groupby("A").first()
tm.assert_frame_equal(result, expected)
def test_groupby_extension_no_sort(self, data_for_grouping):
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
is_bool = data_for_grouping.dtype._is_boolean
if is_bool:
# only 2 unique values, and the final entry has c==b
# (see data_for_grouping docstring)
df = df.iloc[:-1]
result = df.groupby("B", sort=False).A.mean()
_, index = pd.factorize(data_for_grouping, sort=False)
index = pd.Index(index, name="B")
exp_vals = [1.0, 3.0, 4.0]
if is_bool:
exp_vals = exp_vals[:-1]
expected = pd.Series(exp_vals, index=index, name="A")
tm.assert_series_equal(result, expected)
def test_groupby_extension_transform(self, data_for_grouping):
is_bool = data_for_grouping.dtype._is_boolean
valid = data_for_grouping[~data_for_grouping.isna()]
df = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4], "B": valid})
if is_bool:
# only 2 unique values, and the final entry has c==b
# (see data_for_grouping docstring)
df = df.iloc[:-1]
result = df.groupby("B").A.transform(len)
expected = pd.Series([3, 3, 2, 2, 3, 1], name="A")
if is_bool:
expected = expected[:-1]
tm.assert_series_equal(result, expected)
def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
df.groupby("B", group_keys=False, observed=False).apply(groupby_apply_op)
df.groupby("B", group_keys=False, observed=False).A.apply(groupby_apply_op)
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
df.groupby("A", group_keys=False, observed=False).apply(groupby_apply_op)
df.groupby("A", group_keys=False, observed=False).B.apply(groupby_apply_op)
def test_groupby_apply_identity(self, data_for_grouping):
df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
result = df.groupby("A").B.apply(lambda x: x.array)
expected = pd.Series(
[
df.B.iloc[[0, 1, 6]].array,
df.B.iloc[[2, 3]].array,
df.B.iloc[[4, 5]].array,
df.B.iloc[[7]].array,
],
index=pd.Index([1, 2, 3, 4], name="A"),
name="B",
)
tm.assert_series_equal(result, expected)
def test_in_numeric_groupby(self, data_for_grouping):
df = pd.DataFrame(
{
"A": [1, 1, 2, 2, 3, 3, 1, 4],
"B": data_for_grouping,
"C": [1, 1, 1, 1, 1, 1, 1, 1],
}
)
dtype = data_for_grouping.dtype
if (
is_numeric_dtype(dtype)
or is_bool_dtype(dtype)
or dtype.name == "decimal"
or is_string_dtype(dtype)
or is_object_dtype(dtype)
or dtype.kind == "m" # in particular duration[*][pyarrow]
):
expected = pd.Index(["B", "C"])
result = df.groupby("A").sum().columns
else:
expected = pd.Index(["C"])
msg = "|".join(
[
# period/datetime
"does not support sum operations",
# all others
re.escape(f"agg function failed [how->sum,dtype->{dtype}"),
]
)
with pytest.raises(TypeError, match=msg):
df.groupby("A").sum()
result = df.groupby("A").sum(numeric_only=True).columns
tm.assert_index_equal(result, expected)

View File

@@ -0,0 +1,19 @@
"""
Tests for Indexes backed by arbitrary ExtensionArrays.
"""
import pandas as pd
class BaseIndexTests:
"""Tests for Index object backed by an ExtensionArray"""
def test_index_from_array(self, data):
idx = pd.Index(data)
assert data.dtype == idx.dtype
def test_index_from_listlike_with_dtype(self, data):
idx = pd.Index(data, dtype=data.dtype)
assert idx.dtype == data.dtype
idx = pd.Index(list(data), dtype=data.dtype)
assert idx.dtype == data.dtype
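
# Hedged illustration of the behavior tested above, with nullable Int64 as a
# stand-in: an Index built from an ExtensionArray keeps the extension dtype
# instead of falling back to object.
if __name__ == "__main__":
    idx = pd.Index(pd.array([1, None, 3], dtype="Int64"))
    assert str(idx.dtype) == "Int64"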

View File

@@ -0,0 +1,137 @@
import numpy as np
import pytest
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
from pandas.core.dtypes.common import is_extension_array_dtype
from pandas.core.dtypes.dtypes import ExtensionDtype
import pandas as pd
import pandas._testing as tm
class BaseInterfaceTests:
"""Tests that the basic interface is satisfied."""
# ------------------------------------------------------------------------
# Interface
# ------------------------------------------------------------------------
def test_len(self, data):
assert len(data) == 100
def test_size(self, data):
assert data.size == 100
def test_ndim(self, data):
assert data.ndim == 1
def test_can_hold_na_valid(self, data):
# GH-20761
assert data._can_hold_na is True
def test_contains(self, data, data_missing):
# GH-37867
        # Tests for membership checks. Membership checks for nan-likes are
        # tricky, and the settled-on rule is: `nan_like in arr` is True if
        # nan_like is arr.dtype.na_value and arr.isna().any() is True;
        # otherwise the check returns False.
na_value = data.dtype.na_value
# ensure data without missing values
data = data[~data.isna()]
# first elements are non-missing
assert data[0] in data
assert data_missing[0] in data_missing
# check the presence of na_value
assert na_value in data_missing
assert na_value not in data
# the data can never contain other nan-likes than na_value
for na_value_obj in tm.NULL_OBJECTS:
if na_value_obj is na_value or type(na_value_obj) == type(na_value):
# type check for e.g. two instances of Decimal("NAN")
continue
assert na_value_obj not in data
assert na_value_obj not in data_missing
def test_memory_usage(self, data):
s = pd.Series(data)
result = s.memory_usage(index=False)
assert result == s.nbytes
def test_array_interface(self, data):
result = np.array(data)
assert result[0] == data[0]
result = np.array(data, dtype=object)
expected = np.array(list(data), dtype=object)
if expected.ndim > 1:
# nested data, explicitly construct as 1D
expected = construct_1d_object_array_from_listlike(list(data))
tm.assert_numpy_array_equal(result, expected)
def test_is_extension_array_dtype(self, data):
assert is_extension_array_dtype(data)
assert is_extension_array_dtype(data.dtype)
assert is_extension_array_dtype(pd.Series(data))
assert isinstance(data.dtype, ExtensionDtype)
def test_no_values_attribute(self, data):
# GH-20735: EA's with .values attribute give problems with internal
# code, disallowing this for now until solved
assert not hasattr(data, "values")
assert not hasattr(data, "_values")
def test_is_numeric_honored(self, data):
result = pd.Series(data)
if hasattr(result._mgr, "blocks"):
assert result._mgr.blocks[0].is_numeric is data.dtype._is_numeric
def test_isna_extension_array(self, data_missing):
        # If your `isna` returns an ExtensionArray, you must also implement
        # _reduce. At the *very* least, you must implement `any` and `all`.
na = data_missing.isna()
if is_extension_array_dtype(na):
assert na._reduce("any")
assert na.any()
assert not na._reduce("all")
assert not na.all()
assert na.dtype._is_boolean
def test_copy(self, data):
# GH#27083 removing deep keyword from EA.copy
assert data[0] != data[1]
result = data.copy()
if data.dtype._is_immutable:
pytest.skip(f"test_copy assumes mutability and {data.dtype} is immutable")
data[1] = data[0]
assert result[1] != result[0]
def test_view(self, data):
# view with no dtype should return a shallow copy, *not* the same
# object
assert data[1] != data[0]
result = data.view()
assert result is not data
assert type(result) == type(data)
if data.dtype._is_immutable:
pytest.skip(f"test_view assumes mutability and {data.dtype} is immutable")
result[1] = result[0]
assert data[1] == data[0]
# check specifically that the `dtype` kwarg is accepted
data.view(dtype=None)
def test_tolist(self, data):
result = data.tolist()
expected = list(data)
assert isinstance(result, list)
assert result == expected
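
# Hedged illustration of the membership rule spelled out in test_contains
# (GH#37867), with a nullable Float64 array standing in for ``data``: only the
# dtype's own na_value counts, and only when a missing value is present.
if __name__ == "__main__":
    arr = pd.array([1.5, None], dtype="Float64")
    assert pd.NA in arr  # na_value is present -> True
    assert np.nan not in arr  # other nan-likes never match
    assert pd.NA not in arr[:1]  # no missing values -> False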

View File

@@ -0,0 +1,39 @@
from io import StringIO
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import ExtensionArray
class BaseParsingTests:
@pytest.mark.parametrize("engine", ["c", "python"])
def test_EA_types(self, engine, data, request):
if isinstance(data.dtype, pd.CategoricalDtype):
# in parsers.pyx _convert_with_dtype there is special-casing for
# Categorical that pre-empts _from_sequence_of_strings
pass
elif isinstance(data.dtype, pd.core.dtypes.dtypes.NumpyEADtype):
# These get unwrapped internally so are treated as numpy dtypes
# in the parsers.pyx code
pass
elif (
type(data)._from_sequence_of_strings.__func__
is ExtensionArray._from_sequence_of_strings.__func__
):
# i.e. the EA hasn't overridden _from_sequence_of_strings
mark = pytest.mark.xfail(
reason="_from_sequence_of_strings not implemented",
raises=NotImplementedError,
)
request.node.add_marker(mark)
df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))})
csv_output = df.to_csv(index=False, na_rep=np.nan)
result = pd.read_csv(
StringIO(csv_output), dtype={"with_dtype": str(data.dtype)}, engine=engine
)
expected = df
tm.assert_frame_equal(result, expected)
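
# Hedged illustration of the CSV round-trip exercised above, with nullable
# Int64 (which implements ``_from_sequence_of_strings``) standing in for
# ``data``.
if __name__ == "__main__":
    df = pd.DataFrame({"with_dtype": pd.array([1, None, 3], dtype="Int64")})
    result = pd.read_csv(
        StringIO(df.to_csv(index=False)), dtype={"with_dtype": "Int64"}
    )
    tm.assert_frame_equal(result, df)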

View File

@@ -0,0 +1,720 @@
import inspect
import operator
import numpy as np
import pytest
from pandas._typing import Dtype
from pandas.core.dtypes.common import is_bool_dtype
from pandas.core.dtypes.dtypes import NumpyEADtype
from pandas.core.dtypes.missing import na_value_for_dtype
import pandas as pd
import pandas._testing as tm
from pandas.core.sorting import nargsort
class BaseMethodsTests:
"""Various Series and DataFrame methods."""
def test_hash_pandas_object(self, data):
# _hash_pandas_object should return a uint64 ndarray of the same length
# as the data
from pandas.core.util.hashing import _default_hash_key
res = data._hash_pandas_object(
encoding="utf-8", hash_key=_default_hash_key, categorize=False
)
assert res.dtype == np.uint64
assert res.shape == data.shape
def test_value_counts_default_dropna(self, data):
# make sure we have consistent default dropna kwarg
if not hasattr(data, "value_counts"):
pytest.skip(f"value_counts is not implemented for {type(data)}")
sig = inspect.signature(data.value_counts)
kwarg = sig.parameters["dropna"]
assert kwarg.default is True
@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts(self, all_data, dropna):
all_data = all_data[:10]
if dropna:
other = all_data[~all_data.isna()]
else:
other = all_data
result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
expected = pd.Series(other).value_counts(dropna=dropna).sort_index()
tm.assert_series_equal(result, expected)
def test_value_counts_with_normalize(self, data):
# GH 33172
data = data[:10].unique()
values = np.array(data[~data.isna()])
ser = pd.Series(data, dtype=data.dtype)
result = ser.value_counts(normalize=True).sort_index()
if not isinstance(data, pd.Categorical):
expected = pd.Series(
[1 / len(values)] * len(values), index=result.index, name="proportion"
)
else:
expected = pd.Series(0.0, index=result.index, name="proportion")
expected[result > 0] = 1 / len(values)
if getattr(data.dtype, "storage", "") == "pyarrow" or isinstance(
data.dtype, pd.ArrowDtype
):
# TODO: avoid special-casing
expected = expected.astype("double[pyarrow]")
elif getattr(data.dtype, "storage", "") == "pyarrow_numpy":
# TODO: avoid special-casing
expected = expected.astype("float64")
elif na_value_for_dtype(data.dtype) is pd.NA:
# TODO(GH#44692): avoid special-casing
expected = expected.astype("Float64")
tm.assert_series_equal(result, expected)
def test_count(self, data_missing):
df = pd.DataFrame({"A": data_missing})
result = df.count(axis="columns")
expected = pd.Series([0, 1])
tm.assert_series_equal(result, expected)
def test_series_count(self, data_missing):
# GH#26835
ser = pd.Series(data_missing)
result = ser.count()
expected = 1
assert result == expected
def test_apply_simple_series(self, data):
result = pd.Series(data).apply(id)
assert isinstance(result, pd.Series)
@pytest.mark.parametrize("na_action", [None, "ignore"])
def test_map(self, data_missing, na_action):
result = data_missing.map(lambda x: x, na_action=na_action)
expected = data_missing.to_numpy()
tm.assert_numpy_array_equal(result, expected)
def test_argsort(self, data_for_sorting):
result = pd.Series(data_for_sorting).argsort()
# argsort result gets passed to take, so should be np.intp
expected = pd.Series(np.array([2, 0, 1], dtype=np.intp))
tm.assert_series_equal(result, expected)
def test_argsort_missing_array(self, data_missing_for_sorting):
result = data_missing_for_sorting.argsort()
# argsort result gets passed to take, so should be np.intp
expected = np.array([2, 0, 1], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)
def test_argsort_missing(self, data_missing_for_sorting):
msg = "The behavior of Series.argsort in the presence of NA values"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = pd.Series(data_missing_for_sorting).argsort()
expected = pd.Series(np.array([1, -1, 0], dtype=np.intp))
tm.assert_series_equal(result, expected)
def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value):
# GH 24382
is_bool = data_for_sorting.dtype._is_boolean
exp_argmax = 1
exp_argmax_repeated = 3
if is_bool:
# See data_for_sorting docstring
exp_argmax = 0
exp_argmax_repeated = 1
# data_for_sorting -> [B, C, A] with A < B < C
assert data_for_sorting.argmax() == exp_argmax
assert data_for_sorting.argmin() == 2
# with repeated values -> first occurrence
data = data_for_sorting.take([2, 0, 0, 1, 1, 2])
assert data.argmax() == exp_argmax_repeated
assert data.argmin() == 0
# with missing values
# data_missing_for_sorting -> [B, NA, A] with A < B and NA missing.
assert data_missing_for_sorting.argmax() == 0
assert data_missing_for_sorting.argmin() == 2
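# Editorial sketch: the [B, C, A] layout with A < B < C is why the asserts
# above expect argmax() == 1 and argmin() == 2; Float64 stands in for any EA.
def _demo_argminmax_layout():
    import pandas as pd

    arr = pd.array([2.0, 3.0, 1.0], dtype="Float64")  # [B, C, A]
    assert arr.argmax() == 1  # C, the largest, sits at position 1
    assert arr.argmin() == 2  # A, the smallest, sits at position 2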
@pytest.mark.parametrize("method", ["argmax", "argmin"])
def test_argmin_argmax_empty_array(self, method, data):
# GH 24382
err_msg = "attempt to get"
with pytest.raises(ValueError, match=err_msg):
getattr(data[:0], method)()
@pytest.mark.parametrize("method", ["argmax", "argmin"])
def test_argmin_argmax_all_na(self, method, data, na_value):
# all missing with skipna=True is the same as empty
err_msg = "attempt to get"
data_na = type(data)._from_sequence([na_value, na_value], dtype=data.dtype)
with pytest.raises(ValueError, match=err_msg):
getattr(data_na, method)()
@pytest.mark.parametrize(
"op_name, skipna, expected",
[
("idxmax", True, 0),
("idxmin", True, 2),
("argmax", True, 0),
("argmin", True, 2),
("idxmax", False, np.nan),
("idxmin", False, np.nan),
("argmax", False, -1),
("argmin", False, -1),
],
)
def test_argreduce_series(
self, data_missing_for_sorting, op_name, skipna, expected
):
# data_missing_for_sorting -> [B, NA, A] with A < B and NA missing.
warn = None
msg = "The behavior of Series.argmax/argmin"
if op_name.startswith("arg") and expected == -1:
warn = FutureWarning
if op_name.startswith("idx") and np.isnan(expected):
warn = FutureWarning
msg = f"The behavior of Series.{op_name}"
ser = pd.Series(data_missing_for_sorting)
with tm.assert_produces_warning(warn, match=msg):
result = getattr(ser, op_name)(skipna=skipna)
tm.assert_almost_equal(result, expected)
def test_argmax_argmin_no_skipna_notimplemented(self, data_missing_for_sorting):
# GH#38733
data = data_missing_for_sorting
with pytest.raises(NotImplementedError, match=""):
data.argmin(skipna=False)
with pytest.raises(NotImplementedError, match=""):
data.argmax(skipna=False)
@pytest.mark.parametrize(
"na_position, expected",
[
("last", np.array([2, 0, 1], dtype=np.dtype("intp"))),
("first", np.array([1, 2, 0], dtype=np.dtype("intp"))),
],
)
def test_nargsort(self, data_missing_for_sorting, na_position, expected):
# GH 25439
result = nargsort(data_missing_for_sorting, na_position=na_position)
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("ascending", [True, False])
def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
ser = pd.Series(data_for_sorting)
result = ser.sort_values(ascending=ascending, key=sort_by_key)
expected = ser.iloc[[2, 0, 1]]
if not ascending:
# GH 35922. Expect stable sort
if ser.nunique() == 2:
expected = ser.iloc[[0, 1, 2]]
else:
expected = ser.iloc[[1, 0, 2]]
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("ascending", [True, False])
def test_sort_values_missing(
self, data_missing_for_sorting, ascending, sort_by_key
):
ser = pd.Series(data_missing_for_sorting)
result = ser.sort_values(ascending=ascending, key=sort_by_key)
if ascending:
expected = ser.iloc[[2, 0, 1]]
else:
expected = ser.iloc[[0, 2, 1]]
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("ascending", [True, False])
def test_sort_values_frame(self, data_for_sorting, ascending):
df = pd.DataFrame({"A": [1, 2, 1], "B": data_for_sorting})
result = df.sort_values(["A", "B"])
expected = pd.DataFrame(
{"A": [1, 1, 2], "B": data_for_sorting.take([2, 0, 1])}, index=[2, 0, 1]
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("keep", ["first", "last", False])
def test_duplicated(self, data, keep):
arr = data.take([0, 1, 0, 1])
result = arr.duplicated(keep=keep)
if keep == "first":
expected = np.array([False, False, True, True])
elif keep == "last":
expected = np.array([True, True, False, False])
else:
expected = np.array([True, True, True, True])
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("box", [pd.Series, lambda x: x])
@pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique])
def test_unique(self, data, box, method):
duplicated = box(data._from_sequence([data[0], data[0]], dtype=data.dtype))
result = method(duplicated)
assert len(result) == 1
assert isinstance(result, type(data))
assert result[0] == duplicated[0]
def test_factorize(self, data_for_grouping):
codes, uniques = pd.factorize(data_for_grouping, use_na_sentinel=True)
is_bool = data_for_grouping.dtype._is_boolean
if is_bool:
# only 2 unique values
expected_codes = np.array([0, 0, -1, -1, 1, 1, 0, 0], dtype=np.intp)
expected_uniques = data_for_grouping.take([0, 4])
else:
expected_codes = np.array([0, 0, -1, -1, 1, 1, 0, 2], dtype=np.intp)
expected_uniques = data_for_grouping.take([0, 4, 7])
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_extension_array_equal(uniques, expected_uniques)
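# Editorial sketch of the codes/uniques contract on a concrete input: values
# are coded in order of appearance and missing values get the -1 sentinel.
def _demo_factorize_sentinel():
    import numpy as np
    import pandas as pd

    values = np.array(["a", "a", None, "b"], dtype=object)
    codes, uniques = pd.factorize(values, use_na_sentinel=True)
    assert codes.tolist() == [0, 0, -1, 1]
    assert uniques.tolist() == ["a", "b"]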
def test_factorize_equivalence(self, data_for_grouping):
codes_1, uniques_1 = pd.factorize(data_for_grouping, use_na_sentinel=True)
codes_2, uniques_2 = data_for_grouping.factorize(use_na_sentinel=True)
tm.assert_numpy_array_equal(codes_1, codes_2)
tm.assert_extension_array_equal(uniques_1, uniques_2)
assert len(uniques_1) == len(pd.unique(uniques_1))
assert uniques_1.dtype == data_for_grouping.dtype
def test_factorize_empty(self, data):
codes, uniques = pd.factorize(data[:0])
expected_codes = np.array([], dtype=np.intp)
expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_extension_array_equal(uniques, expected_uniques)
def test_fillna_copy_frame(self, data_missing):
arr = data_missing.take([1, 1])
df = pd.DataFrame({"A": arr})
df_orig = df.copy()
filled_val = df.iloc[0, 0]
result = df.fillna(filled_val)
result.iloc[0, 0] = filled_val
tm.assert_frame_equal(df, df_orig)
def test_fillna_copy_series(self, data_missing):
arr = data_missing.take([1, 1])
ser = pd.Series(arr, copy=False)
ser_orig = ser.copy()
filled_val = ser[0]
result = ser.fillna(filled_val)
result.iloc[0] = filled_val
tm.assert_series_equal(ser, ser_orig)
def test_fillna_length_mismatch(self, data_missing):
msg = "Length of 'value' does not match."
with pytest.raises(ValueError, match=msg):
data_missing.fillna(data_missing.take([1]))
# Subclasses can override if we expect e.g. Sparse[bool], boolean, pyarrow[bool]
_combine_le_expected_dtype: Dtype = NumpyEADtype("bool")
def test_combine_le(self, data_repeated):
# GH 20825
# Test that combine works when doing a <= (le) comparison
orig_data1, orig_data2 = data_repeated(2)
s1 = pd.Series(orig_data1)
s2 = pd.Series(orig_data2)
result = s1.combine(s2, lambda x1, x2: x1 <= x2)
expected = pd.Series(
pd.array(
[a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))],
dtype=self._combine_le_expected_dtype,
)
)
tm.assert_series_equal(result, expected)
val = s1.iloc[0]
result = s1.combine(val, lambda x1, x2: x1 <= x2)
expected = pd.Series(
pd.array(
[a <= val for a in list(orig_data1)],
dtype=self._combine_le_expected_dtype,
)
)
tm.assert_series_equal(result, expected)
def test_combine_add(self, data_repeated):
# GH 20825
orig_data1, orig_data2 = data_repeated(2)
s1 = pd.Series(orig_data1)
s2 = pd.Series(orig_data2)
# Check if the operation is supported pointwise for our scalars. If not,
# we will expect Series.combine to raise as well.
try:
with np.errstate(over="ignore"):
expected = pd.Series(
orig_data1._from_sequence(
[a + b for (a, b) in zip(list(orig_data1), list(orig_data2))]
)
)
except TypeError:
# If the operation is not supported pointwise for our scalars,
# then Series.combine should also raise
with pytest.raises(TypeError):
s1.combine(s2, lambda x1, x2: x1 + x2)
return
result = s1.combine(s2, lambda x1, x2: x1 + x2)
tm.assert_series_equal(result, expected)
val = s1.iloc[0]
result = s1.combine(val, lambda x1, x2: x1 + x2)
expected = pd.Series(
orig_data1._from_sequence([a + val for a in list(orig_data1)])
)
tm.assert_series_equal(result, expected)
def test_combine_first(self, data):
# https://github.com/pandas-dev/pandas/issues/24147
a = pd.Series(data[:3])
b = pd.Series(data[2:5], index=[2, 3, 4])
result = a.combine_first(b)
expected = pd.Series(data[:5])
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("frame", [True, False])
@pytest.mark.parametrize(
"periods, indices",
[(-2, [2, 3, 4, -1, -1]), (0, [0, 1, 2, 3, 4]), (2, [-1, -1, 0, 1, 2])],
)
def test_container_shift(self, data, frame, periods, indices):
# https://github.com/pandas-dev/pandas/issues/22386
subset = data[:5]
data = pd.Series(subset, name="A")
expected = pd.Series(subset.take(indices, allow_fill=True), name="A")
if frame:
result = data.to_frame(name="A").assign(B=1).shift(periods)
expected = pd.concat(
[expected, pd.Series([1] * 5, name="B").shift(periods)], axis=1
)
compare = tm.assert_frame_equal
else:
result = data.shift(periods)
compare = tm.assert_series_equal
compare(result, expected)
def test_shift_0_periods(self, data):
# GH#33856 shifting with periods=0 should return a copy, not same obj
result = data.shift(0)
assert data[0] != data[1] # otherwise below is invalid
data[0] = data[1]
assert result[0] != result[1] # i.e. not the same object/view
@pytest.mark.parametrize("periods", [1, -2])
def test_diff(self, data, periods):
data = data[:5]
if is_bool_dtype(data.dtype):
op = operator.xor
else:
op = operator.sub
try:
# does this array implement ops?
op(data, data)
except Exception:
pytest.skip(f"{type(data)} does not support diff")
s = pd.Series(data)
result = s.diff(periods)
expected = pd.Series(op(data, data.shift(periods)))
tm.assert_series_equal(result, expected)
df = pd.DataFrame({"A": data, "B": [1.0] * 5})
result = df.diff(periods)
if periods == 1:
b = [np.nan, 0, 0, 0, 0]
else:
b = [0, 0, 0, np.nan, np.nan]
expected = pd.DataFrame({"A": expected, "B": b})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"periods, indices",
[[-4, [-1, -1]], [-1, [1, -1]], [0, [0, 1]], [1, [-1, 0]], [4, [-1, -1]]],
)
def test_shift_non_empty_array(self, data, periods, indices):
# https://github.com/pandas-dev/pandas/issues/23911
subset = data[:2]
result = subset.shift(periods)
expected = subset.take(indices, allow_fill=True)
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("periods", [-4, -1, 0, 1, 4])
def test_shift_empty_array(self, data, periods):
# https://github.com/pandas-dev/pandas/issues/23911
empty = data[:0]
result = empty.shift(periods)
expected = empty
tm.assert_extension_array_equal(result, expected)
def test_shift_zero_copies(self, data):
# GH#31502
result = data.shift(0)
assert result is not data
result = data[:0].shift(2)
assert result is not data
def test_shift_fill_value(self, data):
arr = data[:4]
fill_value = data[0]
result = arr.shift(1, fill_value=fill_value)
expected = data.take([0, 0, 1, 2])
tm.assert_extension_array_equal(result, expected)
result = arr.shift(-2, fill_value=fill_value)
expected = data.take([2, 3, 0, 0])
tm.assert_extension_array_equal(result, expected)
def test_not_hashable(self, data):
# We are in general mutable, so not hashable
with pytest.raises(TypeError, match="unhashable type"):
hash(data)
def test_hash_pandas_object_works(self, data, as_frame):
# https://github.com/pandas-dev/pandas/issues/23066
data = pd.Series(data)
if as_frame:
data = data.to_frame()
a = pd.util.hash_pandas_object(data)
b = pd.util.hash_pandas_object(data)
tm.assert_equal(a, b)
def test_searchsorted(self, data_for_sorting, as_series):
if data_for_sorting.dtype._is_boolean:
return self._test_searchsorted_bool_dtypes(data_for_sorting, as_series)
b, c, a = data_for_sorting
arr = data_for_sorting.take([2, 0, 1]) # to get [a, b, c]
if as_series:
arr = pd.Series(arr)
assert arr.searchsorted(a) == 0
assert arr.searchsorted(a, side="right") == 1
assert arr.searchsorted(b) == 1
assert arr.searchsorted(b, side="right") == 2
assert arr.searchsorted(c) == 2
assert arr.searchsorted(c, side="right") == 3
result = arr.searchsorted(arr.take([0, 2]))
expected = np.array([0, 2], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)
# sorter
sorter = np.array([2, 0, 1])  # the argsort of [B, C, A]
assert data_for_sorting.searchsorted(a, sorter=sorter) == 0
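# Editorial sketch of the ``sorter`` argument exercised above: it is the
# permutation that sorts the array (its argsort), which lets searchsorted
# operate on unsorted data. Plain numpy shows the same semantics.
def _demo_searchsorted_sorter():
    import numpy as np

    arr = np.array([2, 3, 1])  # [B, C, A]
    sorter = np.argsort(arr)   # array([2, 0, 1])
    assert np.searchsorted(arr, 1, sorter=sorter) == 0
    assert np.searchsorted(arr, 3, sorter=sorter) == 2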
def _test_searchsorted_bool_dtypes(self, data_for_sorting, as_series):
# We call this from test_searchsorted in cases where we have a
# boolean-like dtype. The non-bool test assumes we have more than 2
# unique values.
dtype = data_for_sorting.dtype
data_for_sorting = pd.array([True, False], dtype=dtype)
b, a = data_for_sorting
arr = type(data_for_sorting)._from_sequence([a, b])
if as_series:
arr = pd.Series(arr)
assert arr.searchsorted(a) == 0
assert arr.searchsorted(a, side="right") == 1
assert arr.searchsorted(b) == 1
assert arr.searchsorted(b, side="right") == 2
result = arr.searchsorted(arr.take([0, 1]))
expected = np.array([0, 1], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)
# sorter
sorter = np.array([1, 0])
assert data_for_sorting.searchsorted(a, sorter=sorter) == 0
def test_where_series(self, data, na_value, as_frame):
assert data[0] != data[1]
cls = type(data)
a, b = data[:2]
orig = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype))
ser = orig.copy()
cond = np.array([True, True, False, False])
if as_frame:
ser = ser.to_frame(name="a")
cond = cond.reshape(-1, 1)
result = ser.where(cond)
expected = pd.Series(
cls._from_sequence([a, a, na_value, na_value], dtype=data.dtype)
)
if as_frame:
expected = expected.to_frame(name="a")
tm.assert_equal(result, expected)
ser.mask(~cond, inplace=True)
tm.assert_equal(ser, expected)
# array other
ser = orig.copy()
if as_frame:
ser = ser.to_frame(name="a")
cond = np.array([True, False, True, True])
other = cls._from_sequence([a, b, a, b], dtype=data.dtype)
if as_frame:
other = pd.DataFrame({"a": other})
cond = pd.DataFrame({"a": cond})
result = ser.where(cond, other)
expected = pd.Series(cls._from_sequence([a, b, b, b], dtype=data.dtype))
if as_frame:
expected = expected.to_frame(name="a")
tm.assert_equal(result, expected)
ser.mask(~cond, other, inplace=True)
tm.assert_equal(ser, expected)
@pytest.mark.parametrize("repeats", [0, 1, 2, [1, 2, 3]])
def test_repeat(self, data, repeats, as_series, use_numpy):
arr = type(data)._from_sequence(data[:3], dtype=data.dtype)
if as_series:
arr = pd.Series(arr)
result = np.repeat(arr, repeats) if use_numpy else arr.repeat(repeats)
repeats = [repeats] * 3 if isinstance(repeats, int) else repeats
expected = [x for x, n in zip(arr, repeats) for _ in range(n)]
expected = type(data)._from_sequence(expected, dtype=data.dtype)
if as_series:
expected = pd.Series(expected, index=arr.index.repeat(repeats))
tm.assert_equal(result, expected)
@pytest.mark.parametrize(
"repeats, kwargs, error, msg",
[
(2, {"axis": 1}, ValueError, "axis"),
(-1, {}, ValueError, "negative"),
([1, 2], {}, ValueError, "shape"),
(2, {"foo": "bar"}, TypeError, "'foo'"),
],
)
def test_repeat_raises(self, data, repeats, kwargs, error, msg, use_numpy):
with pytest.raises(error, match=msg):
if use_numpy:
np.repeat(data, repeats, **kwargs)
else:
data.repeat(repeats, **kwargs)
def test_delete(self, data):
result = data.delete(0)
expected = data[1:]
tm.assert_extension_array_equal(result, expected)
result = data.delete([1, 3])
expected = data._concat_same_type([data[[0]], data[[2]], data[4:]])
tm.assert_extension_array_equal(result, expected)
def test_insert(self, data):
# insert at the beginning
result = data[1:].insert(0, data[0])
tm.assert_extension_array_equal(result, data)
result = data[1:].insert(-len(data[1:]), data[0])
tm.assert_extension_array_equal(result, data)
# insert at the middle
result = data[:-1].insert(4, data[-1])
taker = np.arange(len(data))
taker[5:] = taker[4:-1]
taker[4] = len(data) - 1
expected = data.take(taker)
tm.assert_extension_array_equal(result, expected)
def test_insert_invalid(self, data, invalid_scalar):
item = invalid_scalar
with pytest.raises((TypeError, ValueError)):
data.insert(0, item)
with pytest.raises((TypeError, ValueError)):
data.insert(4, item)
with pytest.raises((TypeError, ValueError)):
data.insert(len(data) - 1, item)
def test_insert_invalid_loc(self, data):
ub = len(data)
with pytest.raises(IndexError):
data.insert(ub + 1, data[0])
with pytest.raises(IndexError):
data.insert(-ub - 1, data[0])
with pytest.raises(TypeError):
# we expect TypeError here instead of IndexError to match np.insert
data.insert(1.5, data[0])
@pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame])
def test_equals(self, data, na_value, as_series, box):
data2 = type(data)._from_sequence([data[0]] * len(data), dtype=data.dtype)
data_na = type(data)._from_sequence([na_value] * len(data), dtype=data.dtype)
data = tm.box_expected(data, box, transpose=False)
data2 = tm.box_expected(data2, box, transpose=False)
data_na = tm.box_expected(data_na, box, transpose=False)
# we are asserting with `is True/False` explicitly, to test that the
# result is an actual Python bool, and not something "truthy"
assert data.equals(data) is True
assert data.equals(data.copy()) is True
# unequal other data
assert data.equals(data2) is False
assert data.equals(data_na) is False
# different length
assert data[:2].equals(data[:3]) is False
# empty are equal
assert data[:0].equals(data[:0]) is True
# other types
assert data.equals(None) is False
assert data[[0]].equals(data[0]) is False
def test_equals_same_data_different_object(self, data):
# https://github.com/pandas-dev/pandas/issues/34660
assert pd.Series(data).equals(pd.Series(data))
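# Editorial sketch: a concrete suite opts into these tests by subclassing and
# supplying the standard fixtures from pandas/tests/extension/conftest.py.
# The subclass below is illustrative only (no ``Test`` prefix, so pytest will
# not collect it); it assumes the class defined above is ``BaseMethodsTests``,
# as in pandas' base suite, and reuses the FloatAttrArray example array.
import numpy as np
import pytest

from pandas.tests.extension.array_with_attr import FloatAttrArray


class FloatAttrMethodsSketch(BaseMethodsTests):
    @pytest.fixture
    def data(self):
        # 100 distinct float64 values, per the ``data`` fixture contract
        return FloatAttrArray(np.arange(100, dtype="float64"))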
View File

@ -0,0 +1,190 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
class BaseMissingTests:
def test_isna(self, data_missing):
expected = np.array([True, False])
result = pd.isna(data_missing)
tm.assert_numpy_array_equal(result, expected)
result = pd.Series(data_missing).isna()
expected = pd.Series(expected)
tm.assert_series_equal(result, expected)
# GH 21189
result = pd.Series(data_missing).drop([0, 1]).isna()
expected = pd.Series([], dtype=bool)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("na_func", ["isna", "notna"])
def test_isna_returns_copy(self, data_missing, na_func):
result = pd.Series(data_missing)
expected = result.copy()
mask = getattr(result, na_func)()
if isinstance(mask.dtype, pd.SparseDtype):
# TODO: GH 57739
mask = np.array(mask)
mask.flags.writeable = True
mask[:] = True
tm.assert_series_equal(result, expected)
def test_dropna_array(self, data_missing):
result = data_missing.dropna()
expected = data_missing[[1]]
tm.assert_extension_array_equal(result, expected)
def test_dropna_series(self, data_missing):
ser = pd.Series(data_missing)
result = ser.dropna()
expected = ser.iloc[[1]]
tm.assert_series_equal(result, expected)
def test_dropna_frame(self, data_missing):
df = pd.DataFrame({"A": data_missing}, columns=pd.Index(["A"], dtype=object))
# defaults
result = df.dropna()
expected = df.iloc[[1]]
tm.assert_frame_equal(result, expected)
# axis = 1
result = df.dropna(axis="columns")
expected = pd.DataFrame(index=pd.RangeIndex(2), columns=pd.Index([]))
tm.assert_frame_equal(result, expected)
# multiple
df = pd.DataFrame({"A": data_missing, "B": [1, np.nan]})
result = df.dropna()
expected = df.iloc[:0]
tm.assert_frame_equal(result, expected)
def test_fillna_scalar(self, data_missing):
valid = data_missing[1]
result = data_missing.fillna(valid)
expected = data_missing._from_sequence([valid, valid], dtype=data_missing.dtype)
tm.assert_extension_array_equal(result, expected)
@pytest.mark.filterwarnings(
"ignore:Series.fillna with 'method' is deprecated:FutureWarning"
)
def test_fillna_limit_pad(self, data_missing):
arr = data_missing.take([1, 0, 0, 0, 1])
result = pd.Series(arr).ffill(limit=2)
expected = pd.Series(data_missing.take([1, 1, 1, 0, 1]))
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"limit_area, input_ilocs, expected_ilocs",
[
("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]),
("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]),
("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]),
("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]),
("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]),
("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]),
("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]),
("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]),
],
)
def test_ffill_limit_area(
self, data_missing, limit_area, input_ilocs, expected_ilocs
):
# GH#56616
arr = data_missing.take(input_ilocs)
result = pd.Series(arr).ffill(limit_area=limit_area)
expected = pd.Series(data_missing.take(expected_ilocs))
tm.assert_series_equal(result, expected)
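# Editorial sketch of the ``limit_area`` semantics parametrized above, on a
# plain float Series: "inside" fills only gaps surrounded by valid values,
# "outside" only extends past the outermost valid values.
def _demo_ffill_limit_area():
    import numpy as np
    import pandas as pd

    ser = pd.Series([np.nan, 1.0, np.nan, 2.0, np.nan])
    inside = ser.ffill(limit_area="inside")
    outside = ser.ffill(limit_area="outside")
    assert inside.iloc[2] == 1.0     # interior gap filled
    assert np.isnan(inside.iloc[4])  # trailing NaN left alone
    assert np.isnan(outside.iloc[2])  # interior gap left alone
    assert outside.iloc[4] == 2.0    # trailing NaN filled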
@pytest.mark.filterwarnings(
"ignore:Series.fillna with 'method' is deprecated:FutureWarning"
)
def test_fillna_limit_backfill(self, data_missing):
arr = data_missing.take([1, 0, 0, 0, 1])
result = pd.Series(arr).fillna(method="backfill", limit=2)
expected = pd.Series(data_missing.take([1, 0, 1, 1, 1]))
tm.assert_series_equal(result, expected)
def test_fillna_no_op_returns_copy(self, data):
data = data[~data.isna()]
valid = data[0]
result = data.fillna(valid)
assert result is not data
tm.assert_extension_array_equal(result, data)
result = data._pad_or_backfill(method="backfill")
assert result is not data
tm.assert_extension_array_equal(result, data)
def test_fillna_series(self, data_missing):
fill_value = data_missing[1]
ser = pd.Series(data_missing)
result = ser.fillna(fill_value)
expected = pd.Series(
data_missing._from_sequence(
[fill_value, fill_value], dtype=data_missing.dtype
)
)
tm.assert_series_equal(result, expected)
# Fill with a series
result = ser.fillna(expected)
tm.assert_series_equal(result, expected)
# Fill with a series not affecting the missing values
result = ser.fillna(ser)
tm.assert_series_equal(result, ser)
def test_fillna_series_method(self, data_missing, fillna_method):
fill_value = data_missing[1]
if fillna_method == "ffill":
data_missing = data_missing[::-1]
result = getattr(pd.Series(data_missing), fillna_method)()
expected = pd.Series(
data_missing._from_sequence(
[fill_value, fill_value], dtype=data_missing.dtype
)
)
tm.assert_series_equal(result, expected)
def test_fillna_frame(self, data_missing):
fill_value = data_missing[1]
result = pd.DataFrame({"A": data_missing, "B": [1, 2]}).fillna(fill_value)
expected = pd.DataFrame(
{
"A": data_missing._from_sequence(
[fill_value, fill_value], dtype=data_missing.dtype
),
"B": [1, 2],
}
)
tm.assert_frame_equal(result, expected)
def test_fillna_fill_other(self, data):
result = pd.DataFrame({"A": data, "B": [np.nan] * len(data)}).fillna({"B": 0.0})
expected = pd.DataFrame({"A": data, "B": [0.0] * len(result)})
tm.assert_frame_equal(result, expected)
def test_use_inf_as_na_no_effect(self, data_missing):
ser = pd.Series(data_missing)
expected = ser.isna()
msg = "use_inf_as_na option is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
with pd.option_context("mode.use_inf_as_na", True):
result = ser.isna()
tm.assert_series_equal(result, expected)
View File

@ -0,0 +1,299 @@
from __future__ import annotations
from typing import final
import numpy as np
import pytest
from pandas._config import using_pyarrow_string_dtype
from pandas.core.dtypes.common import is_string_dtype
import pandas as pd
import pandas._testing as tm
from pandas.core import ops
class BaseOpsUtil:
series_scalar_exc: type[Exception] | None = TypeError
frame_scalar_exc: type[Exception] | None = TypeError
series_array_exc: type[Exception] | None = TypeError
divmod_exc: type[Exception] | None = TypeError
def _get_expected_exception(
self, op_name: str, obj, other
) -> type[Exception] | None:
# Find the Exception, if any, that we expect to be raised when calling
# obj.__op_name__(other)
# The self.obj_bar_exc pattern isn't great in part because it can depend
# on op_name or dtypes, but we use it here for backward-compatibility.
if op_name in ["__divmod__", "__rdivmod__"]:
result = self.divmod_exc
elif isinstance(obj, pd.Series) and isinstance(other, pd.Series):
result = self.series_array_exc
elif isinstance(obj, pd.Series):
result = self.series_scalar_exc
else:
result = self.frame_scalar_exc
if using_pyarrow_string_dtype() and result is not None:
import pyarrow as pa
result = ( # type: ignore[assignment]
result,
pa.lib.ArrowNotImplementedError,
NotImplementedError,
)
return result
def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
# In _check_op we check that the result of a pointwise operation
# (found via _combine) matches the result of the vectorized
# operation obj.__op_name__(other).
# In some cases pandas dtype inference on the scalar result may not
# give a matching dtype even if both operations are behaving "correctly".
# In these cases, do extra required casting here.
return pointwise_result
def get_op_from_name(self, op_name: str):
return tm.get_op_from_name(op_name)
# Subclasses are not expected to need to override check_opname, _check_op,
# _check_divmod_op, or _combine.
# Ideally any relevant overriding can be done in _cast_pointwise_result,
# get_op_from_name, and the specification of `exc`. If you find a use
# case that still requires overriding _check_op or _combine, please let
# us know at github.com/pandas-dev/pandas/issues
@final
def check_opname(self, ser: pd.Series, op_name: str, other):
exc = self._get_expected_exception(op_name, ser, other)
op = self.get_op_from_name(op_name)
self._check_op(ser, op, other, op_name, exc)
# see comment on check_opname
@final
def _combine(self, obj, other, op):
if isinstance(obj, pd.DataFrame):
if len(obj.columns) != 1:
raise NotImplementedError
expected = obj.iloc[:, 0].combine(other, op).to_frame()
else:
expected = obj.combine(other, op)
return expected
# see comment on check_opname
@final
def _check_op(
self, ser: pd.Series, op, other, op_name: str, exc=NotImplementedError
):
# Check that the Series/DataFrame arithmetic/comparison method matches
# the pointwise result from _combine.
if exc is None:
result = op(ser, other)
expected = self._combine(ser, other, op)
expected = self._cast_pointwise_result(op_name, ser, other, expected)
assert isinstance(result, type(ser))
tm.assert_equal(result, expected)
else:
with pytest.raises(exc):
op(ser, other)
# see comment on check_opname
@final
def _check_divmod_op(self, ser: pd.Series, op, other):
# check that divmod behavior matches behavior of floordiv+mod
if op is divmod:
exc = self._get_expected_exception("__divmod__", ser, other)
else:
exc = self._get_expected_exception("__rdivmod__", ser, other)
if exc is None:
result_div, result_mod = op(ser, other)
if op is divmod:
expected_div, expected_mod = ser // other, ser % other
else:
expected_div, expected_mod = other // ser, other % ser
tm.assert_series_equal(result_div, expected_div)
tm.assert_series_equal(result_mod, expected_mod)
else:
with pytest.raises(exc):
divmod(ser, other)
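# Editorial sketch of the identity _check_divmod_op relies on: for Series,
# divmod(a, b) should match (a // b, a % b) elementwise.
def _demo_divmod_identity():
    import pandas as pd
    import pandas._testing as tm

    a = pd.Series([7, 8, 9])
    div, mod = divmod(a, 4)
    tm.assert_series_equal(div, a // 4)
    tm.assert_series_equal(mod, a % 4)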
class BaseArithmeticOpsTests(BaseOpsUtil):
"""
Various Series and DataFrame arithmetic ops methods.
Subclasses supporting various ops should set the class variables
to indicate that they support ops of that kind
* series_scalar_exc = TypeError
* frame_scalar_exc = TypeError
* series_array_exc = TypeError
* divmod_exc = TypeError
"""
series_scalar_exc: type[Exception] | None = TypeError
frame_scalar_exc: type[Exception] | None = TypeError
series_array_exc: type[Exception] | None = TypeError
divmod_exc: type[Exception] | None = TypeError
def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
# series & scalar
if all_arithmetic_operators == "__rmod__" and is_string_dtype(data.dtype):
pytest.skip("Skip testing Python string formatting")
op_name = all_arithmetic_operators
ser = pd.Series(data)
self.check_opname(ser, op_name, ser.iloc[0])
def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
# frame & scalar
if all_arithmetic_operators == "__rmod__" and is_string_dtype(data.dtype):
pytest.skip("Skip testing Python string formatting")
op_name = all_arithmetic_operators
df = pd.DataFrame({"A": data})
self.check_opname(df, op_name, data[0])
def test_arith_series_with_array(self, data, all_arithmetic_operators):
# ndarray & other series
op_name = all_arithmetic_operators
ser = pd.Series(data)
self.check_opname(ser, op_name, pd.Series([ser.iloc[0]] * len(ser)))
def test_divmod(self, data):
ser = pd.Series(data)
self._check_divmod_op(ser, divmod, 1)
self._check_divmod_op(1, ops.rdivmod, ser)
def test_divmod_series_array(self, data, data_for_twos):
ser = pd.Series(data)
self._check_divmod_op(ser, divmod, data)
other = data_for_twos
self._check_divmod_op(other, ops.rdivmod, ser)
other = pd.Series(other)
self._check_divmod_op(other, ops.rdivmod, ser)
def test_add_series_with_extension_array(self, data):
# Check adding an ExtensionArray to a Series of the same dtype matches
# the behavior of adding the arrays directly and then wrapping in a
# Series.
ser = pd.Series(data)
exc = self._get_expected_exception("__add__", ser, data)
if exc is not None:
with pytest.raises(exc):
ser + data
return
result = ser + data
expected = pd.Series(data + data)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("box", [pd.Series, pd.DataFrame, pd.Index])
@pytest.mark.parametrize(
"op_name",
[
x
for x in tm.arithmetic_dunder_methods + tm.comparison_dunder_methods
if not x.startswith("__r")
],
)
def test_direct_arith_with_ndframe_returns_not_implemented(
self, data, box, op_name
):
# EAs should return NotImplemented for ops with Series/DataFrame/Index
# Pandas takes care of unboxing the series and calling the EA's op.
other = box(data)
if hasattr(data, op_name):
result = getattr(data, op_name)(other)
assert result is NotImplemented
class BaseComparisonOpsTests(BaseOpsUtil):
"""Various Series and DataFrame comparison ops methods."""
def _compare_other(self, ser: pd.Series, data, op, other):
if op.__name__ in ["eq", "ne"]:
# comparison should match point-wise comparisons
result = op(ser, other)
expected = ser.combine(other, op)
expected = self._cast_pointwise_result(op.__name__, ser, other, expected)
tm.assert_series_equal(result, expected)
else:
exc = None
try:
result = op(ser, other)
except Exception as err:
exc = err
if exc is None:
# Didn't error, then should match pointwise behavior
expected = ser.combine(other, op)
expected = self._cast_pointwise_result(
op.__name__, ser, other, expected
)
tm.assert_series_equal(result, expected)
else:
with pytest.raises(type(exc)):
ser.combine(other, op)
def test_compare_scalar(self, data, comparison_op):
ser = pd.Series(data)
self._compare_other(ser, data, comparison_op, 0)
def test_compare_array(self, data, comparison_op):
ser = pd.Series(data)
other = pd.Series([data[0]] * len(data), dtype=data.dtype)
self._compare_other(ser, data, comparison_op, other)
class BaseUnaryOpsTests(BaseOpsUtil):
def test_invert(self, data):
ser = pd.Series(data, name="name")
try:
# 10 is an arbitrary choice here, just avoid iterating over
# the whole array to trim test runtime
[~x for x in data[:10]]
except TypeError:
# scalars don't support invert -> we don't expect the vectorized
# operation to succeed
with pytest.raises(TypeError):
~ser
with pytest.raises(TypeError):
~data
else:
# Note we do not reuse the pointwise result to construct expected,
# because Python semantics for negating bools are weird; see GH#54569
result = ~ser
expected = pd.Series(~data, name="name")
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("ufunc", [np.positive, np.negative, np.abs])
def test_unary_ufunc_dunder_equivalence(self, data, ufunc):
# the dunder __pos__ works if and only if np.positive works,
# same for __neg__/np.negative and __abs__/np.abs
attr = {np.positive: "__pos__", np.negative: "__neg__", np.abs: "__abs__"}[
ufunc
]
exc = None
try:
result = getattr(data, attr)()
except Exception as err:
exc = err
# if __pos__ raised, then so should the ufunc
with pytest.raises((type(exc), TypeError)):
ufunc(data)
else:
alt = ufunc(data)
tm.assert_extension_array_equal(result, alt)
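# Editorial sketch: a concrete suite advertises which ops it supports by
# overriding the *_exc attributes; ``None`` means the op is expected to work.
# The subclass below is illustrative only and is not collected by pytest.
class FloatAttrArithmeticSketch(BaseArithmeticOpsTests):
    series_scalar_exc = None  # Series <op> scalar expected to succeed
    frame_scalar_exc = None   # DataFrame <op> scalar expected to succeed
    series_array_exc = None   # Series <op> array expected to succeed
    divmod_exc = TypeError    # divmod expected to raise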
View File

@ -0,0 +1,41 @@
import io
import pytest
import pandas as pd
class BasePrintingTests:
"""Tests checking the formatting of your EA when printed."""
@pytest.mark.parametrize("size", ["big", "small"])
def test_array_repr(self, data, size):
if size == "small":
data = data[:5]
else:
data = type(data)._concat_same_type([data] * 5)
result = repr(data)
assert type(data).__name__ in result
assert f"Length: {len(data)}" in result
assert str(data.dtype) in result
if size == "big":
assert "..." in result
def test_array_repr_unicode(self, data):
result = str(data)
assert isinstance(result, str)
def test_series_repr(self, data):
ser = pd.Series(data)
assert data.dtype.name in repr(ser)
def test_dataframe_repr(self, data):
df = pd.DataFrame({"A": data})
repr(df)
def test_dtype_name_in_info(self, data):
buf = io.StringIO()
pd.DataFrame({"A": data}).info(buf=buf)
result = buf.getvalue()
assert data.dtype.name in result
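# Editorial sketch of the repr pieces asserted above, with the built-in Int64
# masked array standing in for a third-party EA:
def _demo_array_repr_pieces():
    import pandas as pd

    arr = pd.array([1, 2, None], dtype="Int64")
    r = repr(arr)
    assert type(arr).__name__ in r  # "IntegerArray"
    assert "Length: 3" in r
    assert str(arr.dtype) in r      # "Int64"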
View File

@ -0,0 +1,153 @@
from typing import final
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.api.types import is_numeric_dtype
class BaseReduceTests:
"""
Reduction specific tests. Generally these only
make sense for numeric/boolean operations.
"""
def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
# Specify if we expect this reduction to succeed.
return False
def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
# We perform the same operation on the np.float64 data and check
# that the results match. Override if you need to cast to something
# other than float64.
res_op = getattr(ser, op_name)
try:
alt = ser.astype("float64")
except (TypeError, ValueError):
# e.g. Interval can't cast (TypeError), StringArray can't cast
# (ValueError), so let's cast to object and do
# the reduction pointwise
alt = ser.astype(object)
exp_op = getattr(alt, op_name)
if op_name == "count":
result = res_op()
expected = exp_op()
else:
result = res_op(skipna=skipna)
expected = exp_op(skipna=skipna)
tm.assert_almost_equal(result, expected)
def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
# Find the expected dtype when the given reduction is done on a DataFrame
# column with this array. The default assumes float64-like behavior,
# i.e. retains the dtype.
return arr.dtype
# We anticipate that authors should not need to override check_reduce_frame,
# but should be able to do any necessary overriding in
# _get_expected_reduction_dtype. If you have a use case where this
# does not hold, please let us know at github.com/pandas-dev/pandas/issues.
@final
def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool):
# Check that the 2D reduction done in a DataFrame reduction "looks like"
# a wrapped version of the 1D reduction done by Series.
arr = ser.array
df = pd.DataFrame({"a": arr})
kwargs = {"ddof": 1} if op_name in ["var", "std"] else {}
cmp_dtype = self._get_expected_reduction_dtype(arr, op_name, skipna)
# The DataFrame method just calls arr._reduce with keepdims=True,
# so this first check is perfunctory.
result1 = arr._reduce(op_name, skipna=skipna, keepdims=True, **kwargs)
result2 = getattr(df, op_name)(skipna=skipna, **kwargs).array
tm.assert_extension_array_equal(result1, result2)
# Check that the 2D reduction looks like a wrapped version of the
# 1D reduction
if not skipna and ser.isna().any():
expected = pd.array([pd.NA], dtype=cmp_dtype)
else:
exp_value = getattr(ser.dropna(), op_name)()
expected = pd.array([exp_value], dtype=cmp_dtype)
tm.assert_extension_array_equal(result1, expected)
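# Editorial sketch of the keepdims contract checked above: _reduce with
# keepdims=True returns a length-1 array instead of a scalar, which is what
# DataFrame reductions wrap into their result. Int64 stands in for any EA;
# _reduce is the same private hook the check above exercises.
def _demo_reduce_keepdims():
    import pandas as pd

    arr = pd.array([1, 2, 3], dtype="Int64")
    assert arr._reduce("sum", skipna=True) == 6
    out = arr._reduce("sum", skipna=True, keepdims=True)
    assert len(out) == 1 and out[0] == 6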
@pytest.mark.parametrize("skipna", [True, False])
def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna):
op_name = all_boolean_reductions
ser = pd.Series(data)
if not self._supports_reduction(ser, op_name):
# TODO: the message being checked here isn't actually checking anything
msg = (
"[Cc]annot perform|Categorical is not ordered for operation|"
"does not support reduction|"
)
with pytest.raises(TypeError, match=msg):
getattr(ser, op_name)(skipna=skipna)
else:
self.check_reduce(ser, op_name, skipna)
@pytest.mark.filterwarnings("ignore::RuntimeWarning")
@pytest.mark.parametrize("skipna", [True, False])
def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
op_name = all_numeric_reductions
ser = pd.Series(data)
if not self._supports_reduction(ser, op_name):
# TODO: the message being checked here isn't actually checking anything
msg = (
"[Cc]annot perform|Categorical is not ordered for operation|"
"does not support reduction|"
)
with pytest.raises(TypeError, match=msg):
getattr(ser, op_name)(skipna=skipna)
else:
# min/max with empty produce numpy warnings
self.check_reduce(ser, op_name, skipna)
@pytest.mark.parametrize("skipna", [True, False])
def test_reduce_frame(self, data, all_numeric_reductions, skipna):
op_name = all_numeric_reductions
ser = pd.Series(data)
if not is_numeric_dtype(ser.dtype):
pytest.skip(f"{ser.dtype} is not numeric dtype")
if op_name in ["count", "kurt", "sem"]:
pytest.skip(f"{op_name} not an array method")
if not self._supports_reduction(ser, op_name):
pytest.skip(f"Reduction {op_name} not supported for this dtype")
self.check_reduce_frame(ser, op_name, skipna)
# TODO(3.0): remove BaseNoReduceTests, BaseNumericReduceTests,
# BaseBooleanReduceTests
class BaseNoReduceTests(BaseReduceTests):
"""we don't define any reductions"""
class BaseNumericReduceTests(BaseReduceTests):
# For backward compatibility only, this only runs the numeric reductions
def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
if op_name in ["any", "all"]:
pytest.skip("These are tested in BaseBooleanReduceTests")
return True
class BaseBooleanReduceTests(BaseReduceTests):
# For backward compatibility only, this only runs the numeric reductions
def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
if op_name not in ["any", "all"]:
pytest.skip("These are tested in BaseNumericReduceTests")
return True
View File

@ -0,0 +1,379 @@
import itertools
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.api.extensions import ExtensionArray
from pandas.core.internals.blocks import EABackedBlock
class BaseReshapingTests:
"""Tests for reshaping and concatenation."""
@pytest.mark.parametrize("in_frame", [True, False])
def test_concat(self, data, in_frame):
wrapped = pd.Series(data)
if in_frame:
wrapped = pd.DataFrame(wrapped)
result = pd.concat([wrapped, wrapped], ignore_index=True)
assert len(result) == len(data) * 2
if in_frame:
dtype = result.dtypes[0]
else:
dtype = result.dtype
assert dtype == data.dtype
if hasattr(result._mgr, "blocks"):
assert isinstance(result._mgr.blocks[0], EABackedBlock)
assert isinstance(result._mgr.arrays[0], ExtensionArray)
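# Editorial sketch of the concat dtype contract above: concatenating Series
# of the same EA dtype keeps that dtype rather than upcasting to object.
def _demo_concat_keeps_ea_dtype():
    import pandas as pd

    ser = pd.Series(pd.array([1, 2], dtype="Int64"))
    out = pd.concat([ser, ser], ignore_index=True)
    assert out.dtype == "Int64"
    assert len(out) == 4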
@pytest.mark.parametrize("in_frame", [True, False])
def test_concat_all_na_block(self, data_missing, in_frame):
valid_block = pd.Series(data_missing.take([1, 1]), index=[0, 1])
na_block = pd.Series(data_missing.take([0, 0]), index=[2, 3])
if in_frame:
valid_block = pd.DataFrame({"a": valid_block})
na_block = pd.DataFrame({"a": na_block})
result = pd.concat([valid_block, na_block])
if in_frame:
expected = pd.DataFrame({"a": data_missing.take([1, 1, 0, 0])})
tm.assert_frame_equal(result, expected)
else:
expected = pd.Series(data_missing.take([1, 1, 0, 0]))
tm.assert_series_equal(result, expected)
def test_concat_mixed_dtypes(self, data):
# https://github.com/pandas-dev/pandas/issues/20762
df1 = pd.DataFrame({"A": data[:3]})
df2 = pd.DataFrame({"A": [1, 2, 3]})
df3 = pd.DataFrame({"A": ["a", "b", "c"]}).astype("category")
dfs = [df1, df2, df3]
# dataframes
result = pd.concat(dfs)
expected = pd.concat([x.astype(object) for x in dfs])
tm.assert_frame_equal(result, expected)
# series
result = pd.concat([x["A"] for x in dfs])
expected = pd.concat([x["A"].astype(object) for x in dfs])
tm.assert_series_equal(result, expected)
# simple test for just EA and one other
result = pd.concat([df1, df2.astype(object)])
expected = pd.concat([df1.astype("object"), df2.astype("object")])
tm.assert_frame_equal(result, expected)
result = pd.concat([df1["A"], df2["A"].astype(object)])
expected = pd.concat([df1["A"].astype("object"), df2["A"].astype("object")])
tm.assert_series_equal(result, expected)
def test_concat_columns(self, data, na_value):
df1 = pd.DataFrame({"A": data[:3]})
df2 = pd.DataFrame({"B": [1, 2, 3]})
expected = pd.DataFrame({"A": data[:3], "B": [1, 2, 3]})
result = pd.concat([df1, df2], axis=1)
tm.assert_frame_equal(result, expected)
result = pd.concat([df1["A"], df2["B"]], axis=1)
tm.assert_frame_equal(result, expected)
# non-aligned
df2 = pd.DataFrame({"B": [1, 2, 3]}, index=[1, 2, 3])
expected = pd.DataFrame(
{
"A": data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype),
"B": [np.nan, 1, 2, 3],
}
)
result = pd.concat([df1, df2], axis=1)
tm.assert_frame_equal(result, expected)
result = pd.concat([df1["A"], df2["B"]], axis=1)
tm.assert_frame_equal(result, expected)
def test_concat_extension_arrays_copy_false(self, data, na_value):
# GH 20756
df1 = pd.DataFrame({"A": data[:3]})
df2 = pd.DataFrame({"B": data[3:7]})
expected = pd.DataFrame(
{
"A": data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype),
"B": data[3:7],
}
)
result = pd.concat([df1, df2], axis=1, copy=False)
tm.assert_frame_equal(result, expected)
def test_concat_with_reindex(self, data):
# GH-33027
a = pd.DataFrame({"a": data[:5]})
b = pd.DataFrame({"b": data[:5]})
result = pd.concat([a, b], ignore_index=True)
expected = pd.DataFrame(
{
"a": data.take(list(range(5)) + ([-1] * 5), allow_fill=True),
"b": data.take(([-1] * 5) + list(range(5)), allow_fill=True),
}
)
tm.assert_frame_equal(result, expected)
def test_align(self, data, na_value):
a = data[:3]
b = data[2:5]
r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3]))
# Assumes that the ctor can take a list of scalars of the type
e1 = pd.Series(data._from_sequence(list(a) + [na_value], dtype=data.dtype))
e2 = pd.Series(data._from_sequence([na_value] + list(b), dtype=data.dtype))
tm.assert_series_equal(r1, e1)
tm.assert_series_equal(r2, e2)
def test_align_frame(self, data, na_value):
a = data[:3]
b = data[2:5]
r1, r2 = pd.DataFrame({"A": a}).align(pd.DataFrame({"A": b}, index=[1, 2, 3]))
# Assumes that the ctor can take a list of scalars of the type
e1 = pd.DataFrame(
{"A": data._from_sequence(list(a) + [na_value], dtype=data.dtype)}
)
e2 = pd.DataFrame(
{"A": data._from_sequence([na_value] + list(b), dtype=data.dtype)}
)
tm.assert_frame_equal(r1, e1)
tm.assert_frame_equal(r2, e2)
def test_align_series_frame(self, data, na_value):
# https://github.com/pandas-dev/pandas/issues/20576
ser = pd.Series(data, name="a")
df = pd.DataFrame({"col": np.arange(len(ser) + 1)})
r1, r2 = ser.align(df)
e1 = pd.Series(
data._from_sequence(list(data) + [na_value], dtype=data.dtype),
name=ser.name,
)
tm.assert_series_equal(r1, e1)
tm.assert_frame_equal(r2, df)
def test_set_frame_expand_regular_with_extension(self, data):
df = pd.DataFrame({"A": [1] * len(data)})
df["B"] = data
expected = pd.DataFrame({"A": [1] * len(data), "B": data})
tm.assert_frame_equal(df, expected)
def test_set_frame_expand_extension_with_regular(self, data):
df = pd.DataFrame({"A": data})
df["B"] = [1] * len(data)
expected = pd.DataFrame({"A": data, "B": [1] * len(data)})
tm.assert_frame_equal(df, expected)
def test_set_frame_overwrite_object(self, data):
# https://github.com/pandas-dev/pandas/issues/20555
df = pd.DataFrame({"A": [1] * len(data)}, dtype=object)
df["A"] = data
assert df.dtypes["A"] == data.dtype
def test_merge(self, data, na_value):
# GH-20743
df1 = pd.DataFrame({"ext": data[:3], "int1": [1, 2, 3], "key": [0, 1, 2]})
df2 = pd.DataFrame({"int2": [1, 2, 3, 4], "key": [0, 0, 1, 3]})
res = pd.merge(df1, df2)
exp = pd.DataFrame(
{
"int1": [1, 1, 2],
"int2": [1, 2, 3],
"key": [0, 0, 1],
"ext": data._from_sequence(
[data[0], data[0], data[1]], dtype=data.dtype
),
}
)
tm.assert_frame_equal(res, exp[["ext", "int1", "key", "int2"]])
res = pd.merge(df1, df2, how="outer")
exp = pd.DataFrame(
{
"int1": [1, 1, 2, 3, np.nan],
"int2": [1, 2, 3, np.nan, 4],
"key": [0, 0, 1, 2, 3],
"ext": data._from_sequence(
[data[0], data[0], data[1], data[2], na_value], dtype=data.dtype
),
}
)
tm.assert_frame_equal(res, exp[["ext", "int1", "key", "int2"]])
def test_merge_on_extension_array(self, data):
# GH 23020
a, b = data[:2]
key = type(data)._from_sequence([a, b], dtype=data.dtype)
df = pd.DataFrame({"key": key, "val": [1, 2]})
result = pd.merge(df, df, on="key")
expected = pd.DataFrame({"key": key, "val_x": [1, 2], "val_y": [1, 2]})
tm.assert_frame_equal(result, expected)
# order
result = pd.merge(df.iloc[[1, 0]], df, on="key")
expected = expected.iloc[[1, 0]].reset_index(drop=True)
tm.assert_frame_equal(result, expected)
def test_merge_on_extension_array_duplicates(self, data):
# GH 23020
a, b = data[:2]
key = type(data)._from_sequence([a, b, a], dtype=data.dtype)
df1 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
df2 = pd.DataFrame({"key": key, "val": [1, 2, 3]})
result = pd.merge(df1, df2, on="key")
expected = pd.DataFrame(
{
"key": key.take([0, 0, 1, 2, 2]),
"val_x": [1, 1, 2, 3, 3],
"val_y": [1, 3, 2, 1, 3],
}
)
tm.assert_frame_equal(result, expected)
@pytest.mark.filterwarnings(
"ignore:The previous implementation of stack is deprecated"
)
@pytest.mark.parametrize(
"columns",
[
["A", "B"],
pd.MultiIndex.from_tuples(
[("A", "a"), ("A", "b")], names=["outer", "inner"]
),
],
)
@pytest.mark.parametrize("future_stack", [True, False])
def test_stack(self, data, columns, future_stack):
df = pd.DataFrame({"A": data[:5], "B": data[:5]})
df.columns = columns
result = df.stack(future_stack=future_stack)
expected = df.astype(object).stack(future_stack=future_stack)
# we need a second astype(object), in case the constructor inferred
# object -> specialized, as is done for period.
expected = expected.astype(object)
if isinstance(expected, pd.Series):
assert result.dtype == df.iloc[:, 0].dtype
else:
assert all(result.dtypes == df.iloc[:, 0].dtype)
result = result.astype(object)
tm.assert_equal(result, expected)
@pytest.mark.parametrize(
"index",
[
# Two levels, uniform.
pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]),
# non-uniform
pd.MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "b")]),
# three levels, non-uniform
pd.MultiIndex.from_product([("A", "B"), ("a", "b", "c"), (0, 1, 2)]),
pd.MultiIndex.from_tuples(
[
("A", "a", 1),
("A", "b", 0),
("A", "a", 0),
("B", "a", 0),
("B", "c", 1),
]
),
],
)
@pytest.mark.parametrize("obj", ["series", "frame"])
def test_unstack(self, data, index, obj):
data = data[: len(index)]
if obj == "series":
ser = pd.Series(data, index=index)
else:
ser = pd.DataFrame({"A": data, "B": data}, index=index)
n = index.nlevels
levels = list(range(n))
# [0, 1, 2]
# [(0,), (1,), (2,), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]
combinations = itertools.chain.from_iterable(
itertools.permutations(levels, i) for i in range(1, n)
)
for level in combinations:
result = ser.unstack(level=level)
assert all(
isinstance(result[col].array, type(data)) for col in result.columns
)
if obj == "series":
# We should get the same result with to_frame+unstack+droplevel
df = ser.to_frame()
alt = df.unstack(level=level).droplevel(0, axis=1)
tm.assert_frame_equal(result, alt)
obj_ser = ser.astype(object)
expected = obj_ser.unstack(level=level, fill_value=data.dtype.na_value)
if obj == "series":
assert (expected.dtypes == object).all()
result = result.astype(object)
tm.assert_frame_equal(result, expected)
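# Editorial sketch of the unstack contract above: unstacking a Series with an
# EA dtype keeps that dtype in every resulting column.
def _demo_unstack_keeps_dtype():
    import pandas as pd

    idx = pd.MultiIndex.from_product([["A", "B"], ["x", "y"]])
    ser = pd.Series(pd.array([1, 2, 3, 4], dtype="Int64"), index=idx)
    out = ser.unstack()
    assert all(str(dt) == "Int64" for dt in out.dtypes)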
def test_ravel(self, data):
# as long as EA is 1D-only, ravel is a no-op
result = data.ravel()
assert type(result) == type(data)
if data.dtype._is_immutable:
pytest.skip(f"test_ravel assumes mutability and {data.dtype} is immutable")
# Check that we have a view, not a copy
result[0] = result[1]
assert data[0] == data[1]
def test_transpose(self, data):
result = data.transpose()
assert type(result) == type(data)
# check we get a new object
assert result is not data
# If we ever _did_ support 2D, shape should be reversed
assert result.shape == data.shape[::-1]
if data.dtype._is_immutable:
pytest.skip(
f"test_transpose assumes mutability and {data.dtype} is immutable"
)
# Check that we have a view, not a copy
result[0] = result[1]
assert data[0] == data[1]
def test_transpose_frame(self, data):
df = pd.DataFrame({"A": data[:4], "B": data[:4]}, index=["a", "b", "c", "d"])
result = df.T
expected = pd.DataFrame(
{
"a": type(data)._from_sequence([data[0]] * 2, dtype=data.dtype),
"b": type(data)._from_sequence([data[1]] * 2, dtype=data.dtype),
"c": type(data)._from_sequence([data[2]] * 2, dtype=data.dtype),
"d": type(data)._from_sequence([data[3]] * 2, dtype=data.dtype),
},
index=["A", "B"],
)
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(np.transpose(np.transpose(df)), df)
tm.assert_frame_equal(np.transpose(np.transpose(df[["A"]])), df[["A"]])
View File

@ -0,0 +1,451 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
class BaseSetitemTests:
@pytest.fixture(
params=[
lambda x: x.index,
lambda x: list(x.index),
lambda x: slice(None),
lambda x: slice(0, len(x)),
lambda x: range(len(x)),
lambda x: list(range(len(x))),
lambda x: np.ones(len(x), dtype=bool),
],
ids=[
"index",
"list[index]",
"null_slice",
"full_slice",
"range",
"list(range)",
"mask",
],
)
def full_indexer(self, request):
"""
Fixture for an indexer to pass to obj.loc to get/set the full length of the
object.
In some cases, assumes that obj.index is the default RangeIndex.
"""
return request.param
@pytest.fixture(autouse=True)
def skip_if_immutable(self, dtype, request):
if dtype._is_immutable:
node = request.node
if node.name.split("[")[0] == "test_is_immutable":
# This fixture is auto-used, but we do not want to skip
# test_is_immutable.
return
# When BaseSetitemTests is mixed into ExtensionTests, we only
# want this fixture to operate on the tests defined in this
# class/file.
defined_in = node.function.__qualname__.split(".")[0]
if defined_in == "BaseSetitemTests":
pytest.skip("__setitem__ test not applicable with immutable dtype")
def test_is_immutable(self, data):
if data.dtype._is_immutable:
with pytest.raises(TypeError):
data[0] = data[0]
else:
data[0] = data[1]
assert data[0] == data[1]
def test_setitem_scalar_series(self, data, box_in_series):
if box_in_series:
data = pd.Series(data)
data[0] = data[1]
assert data[0] == data[1]
def test_setitem_sequence(self, data, box_in_series):
if box_in_series:
data = pd.Series(data)
original = data.copy()
data[[0, 1]] = [data[1], data[0]]
assert data[0] == original[1]
assert data[1] == original[0]
def test_setitem_sequence_mismatched_length_raises(self, data, as_array):
ser = pd.Series(data)
original = ser.copy()
value = [data[0]]
if as_array:
value = data._from_sequence(value, dtype=data.dtype)
xpr = "cannot set using a {} indexer with a different length"
with pytest.raises(ValueError, match=xpr.format("list-like")):
ser[[0, 1]] = value
# Ensure no modifications were made before the exception was raised
tm.assert_series_equal(ser, original)
with pytest.raises(ValueError, match=xpr.format("slice")):
ser[slice(3)] = value
tm.assert_series_equal(ser, original)
def test_setitem_empty_indexer(self, data, box_in_series):
if box_in_series:
data = pd.Series(data)
original = data.copy()
data[np.array([], dtype=int)] = []
tm.assert_equal(data, original)
def test_setitem_sequence_broadcasts(self, data, box_in_series):
if box_in_series:
data = pd.Series(data)
data[[0, 1]] = data[2]
assert data[0] == data[2]
assert data[1] == data[2]
@pytest.mark.parametrize("setter", ["loc", "iloc"])
def test_setitem_scalar(self, data, setter):
arr = pd.Series(data)
setter = getattr(arr, setter)
setter[0] = data[1]
assert arr[0] == data[1]
def test_setitem_loc_scalar_mixed(self, data):
df = pd.DataFrame({"A": np.arange(len(data)), "B": data})
df.loc[0, "B"] = data[1]
assert df.loc[0, "B"] == data[1]
def test_setitem_loc_scalar_single(self, data):
df = pd.DataFrame({"B": data})
df.loc[10, "B"] = data[1]
assert df.loc[10, "B"] == data[1]
def test_setitem_loc_scalar_multiple_homogeneous(self, data):
df = pd.DataFrame({"A": data, "B": data})
df.loc[10, "B"] = data[1]
assert df.loc[10, "B"] == data[1]
def test_setitem_iloc_scalar_mixed(self, data):
df = pd.DataFrame({"A": np.arange(len(data)), "B": data})
df.iloc[0, 1] = data[1]
assert df.loc[0, "B"] == data[1]
def test_setitem_iloc_scalar_single(self, data):
df = pd.DataFrame({"B": data})
df.iloc[10, 0] = data[1]
assert df.loc[10, "B"] == data[1]
def test_setitem_iloc_scalar_multiple_homogeneous(self, data):
df = pd.DataFrame({"A": data, "B": data})
df.iloc[10, 1] = data[1]
assert df.loc[10, "B"] == data[1]
@pytest.mark.parametrize(
"mask",
[
np.array([True, True, True, False, False]),
pd.array([True, True, True, False, False], dtype="boolean"),
pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"),
],
ids=["numpy-array", "boolean-array", "boolean-array-na"],
)
def test_setitem_mask(self, data, mask, box_in_series):
arr = data[:5].copy()
expected = arr.take([0, 0, 0, 3, 4])
if box_in_series:
arr = pd.Series(arr)
expected = pd.Series(expected)
arr[mask] = data[0]
tm.assert_equal(expected, arr)
def test_setitem_mask_raises(self, data, box_in_series):
# wrong length
mask = np.array([True, False])
if box_in_series:
data = pd.Series(data)
with pytest.raises(IndexError, match="wrong length"):
data[mask] = data[0]
mask = pd.array(mask, dtype="boolean")
with pytest.raises(IndexError, match="wrong length"):
data[mask] = data[0]
def test_setitem_mask_boolean_array_with_na(self, data, box_in_series):
mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
mask[:3] = True
mask[3:5] = pd.NA
if box_in_series:
data = pd.Series(data)
data[mask] = data[0]
assert (data[:3] == data[0]).all()
@pytest.mark.parametrize(
"idx",
[[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
ids=["list", "integer-array", "numpy-array"],
)
def test_setitem_integer_array(self, data, idx, box_in_series):
arr = data[:5].copy()
expected = data.take([0, 0, 0, 3, 4])
if box_in_series:
arr = pd.Series(arr)
expected = pd.Series(expected)
arr[idx] = arr[0]
tm.assert_equal(arr, expected)
@pytest.mark.parametrize(
"idx, box_in_series",
[
([0, 1, 2, pd.NA], False),
pytest.param(
[0, 1, 2, pd.NA], True, marks=pytest.mark.xfail(reason="GH-31948")
),
(pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
(pd.array([0, 1, 2, pd.NA], dtype="Int64"), True),
],
ids=["list-False", "list-True", "integer-array-False", "integer-array-True"],
)
def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series):
arr = data.copy()
# TODO(xfail) this raises KeyError about labels not found (it tries label-based)
# for list of labels with Series
if box_in_series:
arr = pd.Series(data, index=[chr(100 + i) for i in range(len(data))])
msg = "Cannot index with an integer indexer containing NA values"
with pytest.raises(ValueError, match=msg):
arr[idx] = arr[0]
@pytest.mark.parametrize("as_callable", [True, False])
@pytest.mark.parametrize("setter", ["loc", None])
def test_setitem_mask_aligned(self, data, as_callable, setter):
ser = pd.Series(data)
mask = np.zeros(len(data), dtype=bool)
mask[:2] = True
if as_callable:
mask2 = lambda x: mask
else:
mask2 = mask
if setter:
# loc
target = getattr(ser, setter)
else:
# Series.__setitem__
target = ser
target[mask2] = data[5:7]
ser[mask2] = data[5:7]
assert ser[0] == data[5]
assert ser[1] == data[6]
@pytest.mark.parametrize("setter", ["loc", None])
def test_setitem_mask_broadcast(self, data, setter):
ser = pd.Series(data)
mask = np.zeros(len(data), dtype=bool)
mask[:2] = True
if setter: # loc
target = getattr(ser, setter)
else: # __setitem__
target = ser
target[mask] = data[10]
assert ser[0] == data[10]
assert ser[1] == data[10]
def test_setitem_expand_columns(self, data):
df = pd.DataFrame({"A": data})
result = df.copy()
result["B"] = 1
expected = pd.DataFrame({"A": data, "B": [1] * len(data)})
tm.assert_frame_equal(result, expected)
result = df.copy()
result.loc[:, "B"] = 1
tm.assert_frame_equal(result, expected)
# overwrite with new type
result["B"] = data
expected = pd.DataFrame({"A": data, "B": data})
tm.assert_frame_equal(result, expected)
def test_setitem_expand_with_extension(self, data):
df = pd.DataFrame({"A": [1] * len(data)})
result = df.copy()
result["B"] = data
expected = pd.DataFrame({"A": [1] * len(data), "B": data})
tm.assert_frame_equal(result, expected)
result = df.copy()
result.loc[:, "B"] = data
tm.assert_frame_equal(result, expected)
def test_setitem_frame_invalid_length(self, data):
df = pd.DataFrame({"A": [1] * len(data)})
xpr = (
rf"Length of values \({len(data[:5])}\) "
rf"does not match length of index \({len(df)}\)"
)
with pytest.raises(ValueError, match=xpr):
df["B"] = data[:5]
def test_setitem_tuple_index(self, data):
ser = pd.Series(data[:2], index=[(0, 0), (0, 1)])
expected = pd.Series(data.take([1, 1]), index=ser.index)
ser[(0, 0)] = data[1]
tm.assert_series_equal(ser, expected)
def test_setitem_slice(self, data, box_in_series):
arr = data[:5].copy()
expected = data.take([0, 0, 0, 3, 4])
if box_in_series:
arr = pd.Series(arr)
expected = pd.Series(expected)
arr[:3] = data[0]
tm.assert_equal(arr, expected)
def test_setitem_loc_iloc_slice(self, data):
arr = data[:5].copy()
s = pd.Series(arr, index=["a", "b", "c", "d", "e"])
expected = pd.Series(data.take([0, 0, 0, 3, 4]), index=s.index)
result = s.copy()
result.iloc[:3] = data[0]
tm.assert_equal(result, expected)
result = s.copy()
result.loc[:"c"] = data[0]
tm.assert_equal(result, expected)
def test_setitem_slice_mismatch_length_raises(self, data):
arr = data[:5]
with pytest.raises(ValueError):
arr[:1] = arr[:2]
def test_setitem_slice_array(self, data):
arr = data[:5].copy()
arr[:5] = data[-5:]
tm.assert_extension_array_equal(arr, data[-5:])
def test_setitem_scalar_key_sequence_raise(self, data):
arr = data[:5].copy()
with pytest.raises(ValueError):
arr[0] = arr[[0, 1]]
def test_setitem_preserves_views(self, data):
# GH#28150 setitem shouldn't swap the underlying data
view1 = data.view()
view2 = data[:]
data[0] = data[1]
assert view1[0] == data[1]
assert view2[0] == data[1]
def test_setitem_with_expansion_dataframe_column(self, data, full_indexer):
# https://github.com/pandas-dev/pandas/issues/32395
df = expected = pd.DataFrame({0: pd.Series(data)})
result = pd.DataFrame(index=df.index)
key = full_indexer(df)
result.loc[key, 0] = df[0]
tm.assert_frame_equal(result, expected)
def test_setitem_with_expansion_row(self, data, na_value):
df = pd.DataFrame({"data": data[:1]})
df.loc[1, "data"] = data[1]
expected = pd.DataFrame({"data": data[:2]})
tm.assert_frame_equal(df, expected)
# https://github.com/pandas-dev/pandas/issues/47284
df.loc[2, "data"] = na_value
expected = pd.DataFrame(
{"data": pd.Series([data[0], data[1], na_value], dtype=data.dtype)}
)
tm.assert_frame_equal(df, expected)
def test_setitem_series(self, data, full_indexer):
# https://github.com/pandas-dev/pandas/issues/32395
ser = pd.Series(data, name="data")
result = pd.Series(index=ser.index, dtype=object, name="data")
# because result has object dtype, the attempt to do setting inplace
# is successful, and object dtype is retained
key = full_indexer(ser)
result.loc[key] = ser
expected = pd.Series(
data.astype(object), index=ser.index, name="data", dtype=object
)
tm.assert_series_equal(result, expected)
def test_setitem_frame_2d_values(self, data):
# GH#44514
df = pd.DataFrame({"A": data})
# Avoiding using_array_manager fixture
# https://github.com/pandas-dev/pandas/pull/44514#discussion_r754002410
using_array_manager = isinstance(df._mgr, pd.core.internals.ArrayManager)
using_copy_on_write = pd.options.mode.copy_on_write
blk_data = df._mgr.arrays[0]
orig = df.copy()
df.iloc[:] = df.copy()
tm.assert_frame_equal(df, orig)
df.iloc[:-1] = df.iloc[:-1].copy()
tm.assert_frame_equal(df, orig)
df.iloc[:] = df.values
tm.assert_frame_equal(df, orig)
if not using_array_manager and not using_copy_on_write:
# GH#33457 Check that this setting occurred in-place
# FIXME(ArrayManager): this should work there too
assert df._mgr.arrays[0] is blk_data
df.iloc[:-1] = df.values[:-1]
tm.assert_frame_equal(df, orig)
def test_delitem_series(self, data):
# GH#40763
ser = pd.Series(data, name="data")
taker = np.arange(len(ser))
taker = np.delete(taker, 1)
expected = ser[taker]
del ser[1]
tm.assert_series_equal(ser, expected)
def test_setitem_invalid(self, data, invalid_scalar):
msg = "" # messages vary by subclass, so we do not test it
with pytest.raises((ValueError, TypeError), match=msg):
data[0] = invalid_scalar
with pytest.raises((ValueError, TypeError), match=msg):
data[:] = invalid_scalar
def test_setitem_2d_values(self, data):
# GH50085
original = data.copy()
df = pd.DataFrame({"a": data, "b": data})
df.loc[[0, 1], :] = df.loc[[1, 0], :].values
assert (df.loc[0, :] == original[1]).all()
assert (df.loc[1, :] == original[0]).all()
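if __name__ == "__main__":
    # Hedged sketch (illustrative only, not part of the test suite): the
    # setitem semantics exercised above, shown with pandas' built-in
    # nullable integer array standing in for a third-party ExtensionArray.
    import numpy as np
    import pandas as pd

    arr = pd.array([1, 2, 3, 4, 5], dtype="Int64")
    mask = np.zeros(len(arr), dtype=bool)
    mask[:2] = True
    arr[mask] = arr[0]    # boolean-mask setitem -> [1, 1, 3, 4, 5]
    arr[[3, 4]] = arr[0]  # integer-array setitem -> [1, 1, 3, 1, 1]
    print(arr)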

View File

@ -0,0 +1,230 @@
import operator
import pytest
from pandas._config.config import _get_option
from pandas import (
Series,
options,
)
@pytest.fixture
def dtype():
"""A fixture providing the ExtensionDtype to validate."""
raise NotImplementedError
@pytest.fixture
def data():
"""
Length-100 array for this type.
* data[0] and data[1] should both be non-missing
* data[0] and data[1] should not be equal
"""
raise NotImplementedError
@pytest.fixture
def data_for_twos(dtype):
"""
Length-100 array in which all the elements are two.
Call pytest.skip in your fixture if the dtype does not support divmod.
"""
if not (dtype._is_numeric or dtype.kind == "m"):
# Object-dtypes may want to allow this, but for the most part
# only numeric and timedelta-like dtypes will need to implement this.
pytest.skip(f"{dtype} is not a numeric or timedelta-like dtype")
raise NotImplementedError
@pytest.fixture
def data_missing():
"""Length-2 array with [NA, Valid]"""
raise NotImplementedError
@pytest.fixture(params=["data", "data_missing"])
def all_data(request, data, data_missing):
"""Parametrized fixture giving 'data' and 'data_missing'"""
if request.param == "data":
return data
elif request.param == "data_missing":
return data_missing
@pytest.fixture
def data_repeated(data):
"""
Generate many datasets.
Parameters
----------
data : fixture implementing `data`
Returns
-------
Callable[[int], Generator]:
A callable that takes a `count` argument and
returns a generator yielding `count` datasets.
"""
def gen(count):
for _ in range(count):
yield data
return gen
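# Usage sketch (illustrative): tests consume this fixture as a callable,
# e.g. ``orig, dup = data_repeated(2)`` inside a combine test.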
@pytest.fixture
def data_for_sorting():
"""
Length-3 array with a known sort order.
This should be three items [B, C, A] with
A < B < C
For boolean dtypes (for which there are only 2 values available),
set B=C=True
"""
raise NotImplementedError
@pytest.fixture
def data_missing_for_sorting():
"""
Length-3 array with a known sort order.
This should be three items [B, NA, A] with
A < B and NA missing.
"""
raise NotImplementedError
@pytest.fixture
def na_cmp():
"""
Binary operator for comparing NA values.
Should return a function of two arguments that returns
True if both arguments are (scalar) NA for your type.
By default, uses ``operator.is_``
"""
return operator.is_
@pytest.fixture
def na_value(dtype):
"""
The scalar missing value for this type. Default dtype.na_value.
TODO: can be removed in 3.x (see https://github.com/pandas-dev/pandas/pull/54930)
"""
return dtype.na_value
@pytest.fixture
def data_for_grouping():
"""
Data for factorization, grouping, and unique tests.
Expected to be like [B, B, NA, NA, A, A, B, C]
Where A < B < C and NA is missing.
If a dtype has _is_boolean = True, i.e. only 2 unique non-NA entries,
then set C=B.
"""
raise NotImplementedError
@pytest.fixture(params=[True, False])
def box_in_series(request):
"""Whether to box the data in a Series"""
return request.param
@pytest.fixture(
params=[
lambda x: 1,
lambda x: [1] * len(x),
lambda x: Series([1] * len(x)),
lambda x: x,
],
ids=["scalar", "list", "series", "object"],
)
def groupby_apply_op(request):
"""
Functions to test groupby.apply().
"""
return request.param
@pytest.fixture(params=[True, False])
def as_frame(request):
"""
Boolean fixture to support Series and Series.to_frame() comparison testing.
"""
return request.param
@pytest.fixture(params=[True, False])
def as_series(request):
"""
Boolean fixture to support arr and Series(arr) comparison testing.
"""
return request.param
@pytest.fixture(params=[True, False])
def use_numpy(request):
"""
Boolean fixture to support comparison testing of ExtensionDtype array
and numpy array.
"""
return request.param
@pytest.fixture(params=["ffill", "bfill"])
def fillna_method(request):
"""
Parametrized fixture giving method parameters 'ffill' and 'bfill' for
Series.fillna(method=<method>) testing.
"""
return request.param
@pytest.fixture(params=[True, False])
def as_array(request):
"""
Boolean fixture to support ExtensionDtype _from_sequence method testing.
"""
return request.param
@pytest.fixture
def invalid_scalar(data):
"""
A scalar that *cannot* be held by this ExtensionArray.
The default should work for most subclasses, but is not guaranteed.
If the array can hold any item (i.e. object dtype), then use pytest.skip.
"""
return object.__new__(object)
@pytest.fixture
def using_copy_on_write() -> bool:
"""
Fixture to check if Copy-on-Write is enabled.
"""
return (
options.mode.copy_on_write is True
and _get_option("mode.data_manager", silent=True) == "block"
)
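if __name__ == "__main__":
    # Hedged sketch (illustrative): the concrete shapes a downstream
    # conftest typically returns for the abstract fixtures above, using
    # pandas' built-in nullable integer array purely as a stand-in.
    import pandas as pd

    data = pd.array(list(range(100)), dtype="Int64")    # `data`: length 100, data[0] != data[1]
    data_missing = pd.array([pd.NA, 1], dtype="Int64")  # `data_missing`: [NA, Valid]
    data_for_sorting = pd.array([2, 3, 1], dtype="Int64")  # [B, C, A] with A < B < C
    print(data[:2], data_missing, data_for_sorting, sep="\n")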

View File

@ -0,0 +1,6 @@
from pandas.tests.extension.date.array import (
DateArray,
DateDtype,
)
__all__ = ["DateArray", "DateDtype"]

View File

@ -0,0 +1,188 @@
from __future__ import annotations
import datetime as dt
from typing import (
TYPE_CHECKING,
Any,
cast,
)
import numpy as np
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.api.extensions import (
ExtensionArray,
ExtensionDtype,
)
from pandas.api.types import pandas_dtype
if TYPE_CHECKING:
from collections.abc import Sequence
from pandas._typing import (
Dtype,
PositionalIndexer,
)
@register_extension_dtype
class DateDtype(ExtensionDtype):
@property
def type(self):
return dt.date
@property
def name(self):
return "DateDtype"
@classmethod
def construct_from_string(cls, string: str):
if not isinstance(string, str):
raise TypeError(
f"'construct_from_string' expects a string, got {type(string)}"
)
if string == cls.__name__:
return cls()
else:
raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
@classmethod
def construct_array_type(cls):
return DateArray
@property
def na_value(self):
return dt.date.min
def __repr__(self) -> str:
return self.name
class DateArray(ExtensionArray):
def __init__(
self,
dates: (
dt.date
| Sequence[dt.date]
| tuple[np.ndarray, np.ndarray, np.ndarray]
| np.ndarray
),
) -> None:
if isinstance(dates, dt.date):
self._year = np.array([dates.year])
self._month = np.array([dates.month])
self._day = np.array([dates.day])
return
ldates = len(dates)
if isinstance(dates, list):
# pre-allocate the arrays since we know the size before hand
self._year = np.zeros(ldates, dtype=np.uint16) # 65535 (0, 9999)
self._month = np.zeros(ldates, dtype=np.uint8)  # 255 (1, 12)
self._day = np.zeros(ldates, dtype=np.uint8)  # 255 (1, 31)
# populate them
for i, (y, m, d) in enumerate(
(date.year, date.month, date.day) for date in dates
):
self._year[i] = y
self._month[i] = m
self._day[i] = d
elif isinstance(dates, tuple):
# only support triples
if ldates != 3:
raise ValueError("only triples are valid")
# check if all elements have the same type
if any(not isinstance(x, np.ndarray) for x in dates):
raise TypeError("invalid type")
ly, lm, ld = (len(cast(np.ndarray, d)) for d in dates)
if not ly == lm == ld:
raise ValueError(
f"tuple members must have the same length: {(ly, lm, ld)}"
)
self._year = dates[0].astype(np.uint16)
self._month = dates[1].astype(np.uint8)
self._day = dates[2].astype(np.uint8)
elif isinstance(dates, np.ndarray) and dates.dtype == "U10":
self._year = np.zeros(ldates, dtype=np.uint16) # 65535 (0, 9999)
self._month = np.zeros(ldates, dtype=np.uint8)  # 255 (1, 12)
self._day = np.zeros(ldates, dtype=np.uint8)  # 255 (1, 31)
# error: "object_" object is not iterable
obj = np.char.split(dates, sep="-")
for (i,), (y, m, d) in np.ndenumerate(obj): # type: ignore[misc]
self._year[i] = int(y)
self._month[i] = int(m)
self._day[i] = int(d)
else:
raise TypeError(f"{type(dates)} is not supported")
@property
def dtype(self) -> ExtensionDtype:
return DateDtype()
def astype(self, dtype, copy=True):
dtype = pandas_dtype(dtype)
if isinstance(dtype, DateDtype):
data = self.copy() if copy else self
else:
data = self.to_numpy(dtype=dtype, copy=copy, na_value=dt.date.min)
return data
@property
def nbytes(self) -> int:
return self._year.nbytes + self._month.nbytes + self._day.nbytes
def __len__(self) -> int:
return len(self._year) # all 3 arrays are enforced to have the same length
def __getitem__(self, item: PositionalIndexer):
if isinstance(item, int):
return dt.date(self._year[item], self._month[item], self._day[item])
else:
raise NotImplementedError("only ints are supported as indexes")
def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
if not isinstance(key, int):
raise NotImplementedError("only ints are supported as indexes")
if not isinstance(value, dt.date):
raise TypeError("you can only set datetime.date types")
self._year[key] = value.year
self._month[key] = value.month
self._day[key] = value.day
def __repr__(self) -> str:
return f"DateArray{list(zip(self._year, self._month, self._day))}"
def copy(self) -> DateArray:
return DateArray((self._year.copy(), self._month.copy(), self._day.copy()))
def isna(self) -> np.ndarray:
return np.logical_and(
np.logical_and(
self._year == dt.date.min.year, self._month == dt.date.min.month
),
self._day == dt.date.min.day,
)
@classmethod
def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
if isinstance(scalars, dt.date):
raise TypeError
elif isinstance(scalars, DateArray):
if dtype is not None:
return scalars.astype(dtype, copy=copy)
if copy:
return scalars.copy()
return scalars[:]
elif isinstance(scalars, np.ndarray):
scalars = scalars.astype("U10") # 10 chars for yyyy-mm-dd
return DateArray(scalars)
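if __name__ == "__main__":
    # Hedged sketch (illustrative): the three construction paths accepted
    # by DateArray.__init__, shown side by side.
    print(DateArray([dt.date(2021, 1, 2), dt.date(2021, 3, 4)]))  # list of dates
    print(
        DateArray(
            (
                np.array([2021], dtype=np.uint16),  # years
                np.array([1], dtype=np.uint8),      # months
                np.array([2], dtype=np.uint8),      # days
            )
        )
    )
    print(DateArray(np.array(["2021-01-02"], dtype="U10")))  # "yyyy-mm-dd" strings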

View File

@ -0,0 +1,8 @@
from pandas.tests.extension.decimal.array import (
DecimalArray,
DecimalDtype,
make_data,
to_decimal,
)
__all__ = ["DecimalArray", "DecimalDtype", "to_decimal", "make_data"]

View File

@ -0,0 +1,311 @@
from __future__ import annotations
import decimal
import numbers
import sys
from typing import TYPE_CHECKING
import numpy as np
from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.common import (
is_dtype_equal,
is_float,
is_integer,
pandas_dtype,
)
import pandas as pd
from pandas.api.extensions import (
no_default,
register_extension_dtype,
)
from pandas.api.types import (
is_list_like,
is_scalar,
)
from pandas.core import arraylike
from pandas.core.algorithms import value_counts_internal as value_counts
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays import (
ExtensionArray,
ExtensionScalarOpsMixin,
)
from pandas.core.indexers import check_array_indexer
if TYPE_CHECKING:
from pandas._typing import type_t
@register_extension_dtype
class DecimalDtype(ExtensionDtype):
type = decimal.Decimal
name = "decimal"
na_value = decimal.Decimal("NaN")
_metadata = ("context",)
def __init__(self, context=None) -> None:
self.context = context or decimal.getcontext()
def __repr__(self) -> str:
return f"DecimalDtype(context={self.context})"
@classmethod
def construct_array_type(cls) -> type_t[DecimalArray]:
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
return DecimalArray
@property
def _is_numeric(self) -> bool:
return True
class DecimalArray(OpsMixin, ExtensionScalarOpsMixin, ExtensionArray):
__array_priority__ = 1000
def __init__(self, values, dtype=None, copy=False, context=None) -> None:
for i, val in enumerate(values):
if is_float(val) or is_integer(val):
if np.isnan(val):
values[i] = DecimalDtype.na_value
else:
# error: Argument 1 has incompatible type "float | int |
# integer[Any]"; expected "Decimal | float | str | tuple[int,
# Sequence[int], int]"
values[i] = DecimalDtype.type(val) # type: ignore[arg-type]
elif not isinstance(val, decimal.Decimal):
raise TypeError("All values must be of type " + str(decimal.Decimal))
values = np.asarray(values, dtype=object)
self._data = values
# Some aliases for common attribute names to ensure pandas supports
# these
self._items = self.data = self._data
# those aliases are currently not working due to assumptions
# in internal code (GH-20735)
# self._values = self.values = self.data
self._dtype = DecimalDtype(context)
@property
def dtype(self):
return self._dtype
@classmethod
def _from_sequence(cls, scalars, *, dtype=None, copy=False):
return cls(scalars)
@classmethod
def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
return cls._from_sequence(
[decimal.Decimal(x) for x in strings], dtype=dtype, copy=copy
)
@classmethod
def _from_factorized(cls, values, original):
return cls(values)
_HANDLED_TYPES = (decimal.Decimal, numbers.Number, np.ndarray)
def to_numpy(
self,
dtype=None,
copy: bool = False,
na_value: object = no_default,
decimals=None,
) -> np.ndarray:
result = np.asarray(self, dtype=dtype)
if decimals is not None:
result = np.asarray([round(x, decimals) for x in result])
return result
def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
# defer by returning NotImplemented if any input is of an unhandled type
if not all(
isinstance(t, self._HANDLED_TYPES + (DecimalArray,)) for t in inputs
):
return NotImplemented
result = arraylike.maybe_dispatch_ufunc_to_dunder_op(
self, ufunc, method, *inputs, **kwargs
)
if result is not NotImplemented:
# e.g. test_array_ufunc_series_scalar_other
return result
if "out" in kwargs:
return arraylike.dispatch_ufunc_with_out(
self, ufunc, method, *inputs, **kwargs
)
inputs = tuple(x._data if isinstance(x, DecimalArray) else x for x in inputs)
result = getattr(ufunc, method)(*inputs, **kwargs)
if method == "reduce":
result = arraylike.dispatch_reduction_ufunc(
self, ufunc, method, *inputs, **kwargs
)
if result is not NotImplemented:
return result
def reconstruct(x):
if isinstance(x, (decimal.Decimal, numbers.Number)):
return x
else:
return type(self)._from_sequence(x, dtype=self.dtype)
if ufunc.nout > 1:
return tuple(reconstruct(x) for x in result)
else:
return reconstruct(result)
def __getitem__(self, item):
if isinstance(item, numbers.Integral):
return self._data[item]
else:
# array, slice.
item = pd.api.indexers.check_array_indexer(self, item)
return type(self)(self._data[item])
def take(self, indexer, allow_fill=False, fill_value=None):
from pandas.api.extensions import take
data = self._data
if allow_fill and fill_value is None:
fill_value = self.dtype.na_value
result = take(data, indexer, fill_value=fill_value, allow_fill=allow_fill)
return self._from_sequence(result, dtype=self.dtype)
def copy(self):
return type(self)(self._data.copy(), dtype=self.dtype)
def astype(self, dtype, copy=True):
if is_dtype_equal(dtype, self._dtype):
if not copy:
return self
dtype = pandas_dtype(dtype)
if isinstance(dtype, type(self.dtype)):
return type(self)(self._data, copy=copy, context=dtype.context)
return super().astype(dtype, copy=copy)
def __setitem__(self, key, value) -> None:
if is_list_like(value):
if is_scalar(key):
raise ValueError("setting an array element with a sequence.")
value = [decimal.Decimal(v) for v in value]
else:
value = decimal.Decimal(value)
key = check_array_indexer(self, key)
self._data[key] = value
def __len__(self) -> int:
return len(self._data)
def __contains__(self, item) -> bool | np.bool_:
if not isinstance(item, decimal.Decimal):
return False
elif item.is_nan():
return self.isna().any()
else:
return super().__contains__(item)
@property
def nbytes(self) -> int:
n = len(self)
if n:
return n * sys.getsizeof(self[0])
return 0
def isna(self):
return np.array([x.is_nan() for x in self._data], dtype=bool)
@property
def _na_value(self):
return decimal.Decimal("NaN")
def _formatter(self, boxed=False):
if boxed:
return "Decimal: {}".format
return repr
@classmethod
def _concat_same_type(cls, to_concat):
return cls(np.concatenate([x._data for x in to_concat]))
def _reduce(
self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
):
if skipna and self.isna().any():
# If we don't have any NAs, we can ignore skipna
other = self[~self.isna()]
result = other._reduce(name, **kwargs)
elif name == "sum" and len(self) == 0:
# GH#29630 avoid returning int 0 or np.bool_(False) on old numpy
result = decimal.Decimal(0)
else:
try:
op = getattr(self.data, name)
except AttributeError as err:
raise NotImplementedError(
f"decimal does not support the {name} operation"
) from err
result = op(axis=0)
if keepdims:
return type(self)([result])
else:
return result
def _cmp_method(self, other, op):
# For use with OpsMixin
def convert_values(param):
if isinstance(param, ExtensionArray) or is_list_like(param):
ovalues = param
else:
# Assume it's an object
ovalues = [param] * len(self)
return ovalues
lvalues = self
rvalues = convert_values(other)
# If the operator is not defined for the underlying objects,
# a TypeError should be raised
res = [op(a, b) for (a, b) in zip(lvalues, rvalues)]
return np.asarray(res, dtype=bool)
def value_counts(self, dropna: bool = True):
return value_counts(self.to_numpy(), dropna=dropna)
# We override fillna here to simulate a 3rd-party EA that has done so. This
# lets us test the deprecation warning telling authors to implement
# _pad_or_backfill, and it simulates a 3rd-party EA that has not yet updated
# to include a "copy" keyword in its fillna method.
# error: Signature of "fillna" incompatible with supertype "ExtensionArray"
def fillna( # type: ignore[override]
self,
value=None,
method=None,
limit: int | None = None,
):
return super().fillna(value=value, method=method, limit=limit, copy=True)
def to_decimal(values, context=None):
return DecimalArray([decimal.Decimal(x) for x in values], context=context)
def make_data():
return [decimal.Decimal(val) for val in np.random.default_rng(2).random(100)]
DecimalArray._add_arithmetic_ops()
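if __name__ == "__main__":
    # Hedged sketch (illustrative): the public helpers above in action.
    arr = to_decimal([1, 2, 3])
    print(arr + arr)   # arithmetic ops wired up by _add_arithmetic_ops
    print(arr.isna())  # no NaNs here -> [False False False]
    print(DecimalArray(make_data()[:2]))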

View File

@ -0,0 +1,567 @@
from __future__ import annotations
import decimal
import operator
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.tests.extension import base
from pandas.tests.extension.decimal.array import (
DecimalArray,
DecimalDtype,
make_data,
to_decimal,
)
@pytest.fixture
def dtype():
return DecimalDtype()
@pytest.fixture
def data():
return DecimalArray(make_data())
@pytest.fixture
def data_for_twos():
return DecimalArray([decimal.Decimal(2) for _ in range(100)])
@pytest.fixture
def data_missing():
return DecimalArray([decimal.Decimal("NaN"), decimal.Decimal(1)])
@pytest.fixture
def data_for_sorting():
return DecimalArray(
[decimal.Decimal("1"), decimal.Decimal("2"), decimal.Decimal("0")]
)
@pytest.fixture
def data_missing_for_sorting():
return DecimalArray(
[decimal.Decimal("1"), decimal.Decimal("NaN"), decimal.Decimal("0")]
)
@pytest.fixture
def na_cmp():
return lambda x, y: x.is_nan() and y.is_nan()
@pytest.fixture
def data_for_grouping():
b = decimal.Decimal("1.0")
a = decimal.Decimal("0.0")
c = decimal.Decimal("2.0")
na = decimal.Decimal("NaN")
return DecimalArray([b, b, na, na, a, a, b, c])
class TestDecimalArray(base.ExtensionTests):
def _get_expected_exception(
self, op_name: str, obj, other
) -> type[Exception] | None:
return None
def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
return True
def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
if op_name == "count":
return super().check_reduce(ser, op_name, skipna)
else:
result = getattr(ser, op_name)(skipna=skipna)
expected = getattr(np.asarray(ser), op_name)()
tm.assert_almost_equal(result, expected)
def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, request):
if all_numeric_reductions in ["kurt", "skew", "sem", "median"]:
mark = pytest.mark.xfail(raises=NotImplementedError)
request.applymarker(mark)
super().test_reduce_series_numeric(data, all_numeric_reductions, skipna)
def test_reduce_frame(self, data, all_numeric_reductions, skipna, request):
op_name = all_numeric_reductions
if op_name in ["skew", "median"]:
mark = pytest.mark.xfail(raises=NotImplementedError)
request.applymarker(mark)
return super().test_reduce_frame(data, all_numeric_reductions, skipna)
def test_compare_scalar(self, data, comparison_op):
ser = pd.Series(data)
self._compare_other(ser, data, comparison_op, 0.5)
def test_compare_array(self, data, comparison_op):
ser = pd.Series(data)
alter = np.random.default_rng(2).choice([-1, 0, 1], len(data))
# Randomly double, halve or keep same value
other = pd.Series(data) * [decimal.Decimal(pow(2.0, i)) for i in alter]
self._compare_other(ser, data, comparison_op, other)
def test_arith_series_with_array(self, data, all_arithmetic_operators):
op_name = all_arithmetic_operators
ser = pd.Series(data)
context = decimal.getcontext()
divbyzerotrap = context.traps[decimal.DivisionByZero]
invalidoptrap = context.traps[decimal.InvalidOperation]
context.traps[decimal.DivisionByZero] = 0
context.traps[decimal.InvalidOperation] = 0
# Decimal supports ops with int, but not float
other = pd.Series([int(d * 100) for d in data])
self.check_opname(ser, op_name, other)
if "mod" not in op_name:
self.check_opname(ser, op_name, ser * 2)
self.check_opname(ser, op_name, 0)
self.check_opname(ser, op_name, 5)
context.traps[decimal.DivisionByZero] = divbyzerotrap
context.traps[decimal.InvalidOperation] = invalidoptrap
def test_fillna_frame(self, data_missing):
msg = "ExtensionArray.fillna added a 'copy' keyword"
with tm.assert_produces_warning(
DeprecationWarning, match=msg, check_stacklevel=False
):
super().test_fillna_frame(data_missing)
def test_fillna_limit_pad(self, data_missing):
msg = "ExtensionArray.fillna 'method' keyword is deprecated"
with tm.assert_produces_warning(
DeprecationWarning,
match=msg,
check_stacklevel=False,
raise_on_extra_warnings=False,
):
super().test_fillna_limit_pad(data_missing)
msg = "The 'method' keyword in DecimalArray.fillna is deprecated"
with tm.assert_produces_warning(
FutureWarning,
match=msg,
check_stacklevel=False,
raise_on_extra_warnings=False,
):
super().test_fillna_limit_pad(data_missing)
@pytest.mark.parametrize(
"limit_area, input_ilocs, expected_ilocs",
[
("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]),
("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]),
("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]),
("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]),
("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]),
("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]),
("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]),
("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]),
],
)
def test_ffill_limit_area(
self, data_missing, limit_area, input_ilocs, expected_ilocs
):
# GH#56616
msg = "ExtensionArray.fillna 'method' keyword is deprecated"
with tm.assert_produces_warning(
DeprecationWarning,
match=msg,
check_stacklevel=False,
raise_on_extra_warnings=False,
):
msg = "DecimalArray does not implement limit_area"
with pytest.raises(NotImplementedError, match=msg):
super().test_ffill_limit_area(
data_missing, limit_area, input_ilocs, expected_ilocs
)
def test_fillna_limit_backfill(self, data_missing):
msg = "Series.fillna with 'method' is deprecated"
with tm.assert_produces_warning(
FutureWarning,
match=msg,
check_stacklevel=False,
raise_on_extra_warnings=False,
):
super().test_fillna_limit_backfill(data_missing)
msg = "ExtensionArray.fillna 'method' keyword is deprecated"
with tm.assert_produces_warning(
DeprecationWarning,
match=msg,
check_stacklevel=False,
raise_on_extra_warnings=False,
):
super().test_fillna_limit_backfill(data_missing)
msg = "The 'method' keyword in DecimalArray.fillna is deprecated"
with tm.assert_produces_warning(
FutureWarning,
match=msg,
check_stacklevel=False,
raise_on_extra_warnings=False,
):
super().test_fillna_limit_backfill(data_missing)
def test_fillna_no_op_returns_copy(self, data):
msg = "|".join(
[
"ExtensionArray.fillna 'method' keyword is deprecated",
"The 'method' keyword in DecimalArray.fillna is deprecated",
]
)
with tm.assert_produces_warning(
(FutureWarning, DeprecationWarning), match=msg, check_stacklevel=False
):
super().test_fillna_no_op_returns_copy(data)
def test_fillna_series(self, data_missing):
msg = "ExtensionArray.fillna added a 'copy' keyword"
with tm.assert_produces_warning(
DeprecationWarning, match=msg, check_stacklevel=False
):
super().test_fillna_series(data_missing)
def test_fillna_series_method(self, data_missing, fillna_method):
msg = "|".join(
[
"ExtensionArray.fillna 'method' keyword is deprecated",
"The 'method' keyword in DecimalArray.fillna is deprecated",
]
)
with tm.assert_produces_warning(
(FutureWarning, DeprecationWarning), match=msg, check_stacklevel=False
):
super().test_fillna_series_method(data_missing, fillna_method)
def test_fillna_copy_frame(self, data_missing, using_copy_on_write):
warn = DeprecationWarning if not using_copy_on_write else None
msg = "ExtensionArray.fillna added a 'copy' keyword"
with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False):
super().test_fillna_copy_frame(data_missing)
def test_fillna_copy_series(self, data_missing, using_copy_on_write):
warn = DeprecationWarning if not using_copy_on_write else None
msg = "ExtensionArray.fillna added a 'copy' keyword"
with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False):
super().test_fillna_copy_series(data_missing)
@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts(self, all_data, dropna, request):
all_data = all_data[:10]
if dropna:
other = np.array(all_data[~all_data.isna()])
else:
other = all_data
vcs = pd.Series(all_data).value_counts(dropna=dropna)
vcs_ex = pd.Series(other).value_counts(dropna=dropna)
with decimal.localcontext() as ctx:
# avoid raising when comparing Decimal("NAN") < Decimal(2)
ctx.traps[decimal.InvalidOperation] = False
result = vcs.sort_index()
expected = vcs_ex.sort_index()
tm.assert_series_equal(result, expected)
def test_series_repr(self, data):
# Overriding this base test to explicitly test that
# the custom _formatter is used
ser = pd.Series(data)
assert data.dtype.name in repr(ser)
assert "Decimal: " in repr(ser)
@pytest.mark.xfail(reason="Inconsistent array-vs-scalar behavior")
@pytest.mark.parametrize("ufunc", [np.positive, np.negative, np.abs])
def test_unary_ufunc_dunder_equivalence(self, data, ufunc):
super().test_unary_ufunc_dunder_equivalence(data, ufunc)
def test_take_na_value_other_decimal():
arr = DecimalArray([decimal.Decimal("1.0"), decimal.Decimal("2.0")])
result = arr.take([0, -1], allow_fill=True, fill_value=decimal.Decimal("-1.0"))
expected = DecimalArray([decimal.Decimal("1.0"), decimal.Decimal("-1.0")])
tm.assert_extension_array_equal(result, expected)
def test_series_constructor_coerce_data_to_extension_dtype():
dtype = DecimalDtype()
ser = pd.Series([0, 1, 2], dtype=dtype)
arr = DecimalArray(
[decimal.Decimal(0), decimal.Decimal(1), decimal.Decimal(2)],
dtype=dtype,
)
exp = pd.Series(arr)
tm.assert_series_equal(ser, exp)
def test_series_constructor_with_dtype():
arr = DecimalArray([decimal.Decimal("10.0")])
result = pd.Series(arr, dtype=DecimalDtype())
expected = pd.Series(arr)
tm.assert_series_equal(result, expected)
result = pd.Series(arr, dtype="int64")
expected = pd.Series([10])
tm.assert_series_equal(result, expected)
def test_dataframe_constructor_with_dtype():
arr = DecimalArray([decimal.Decimal("10.0")])
result = pd.DataFrame({"A": arr}, dtype=DecimalDtype())
expected = pd.DataFrame({"A": arr})
tm.assert_frame_equal(result, expected)
arr = DecimalArray([decimal.Decimal("10.0")])
result = pd.DataFrame({"A": arr}, dtype="int64")
expected = pd.DataFrame({"A": [10]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("frame", [True, False])
def test_astype_dispatches(frame):
# This is a dtype-specific test that ensures Series[decimal].astype
# gets all the way through to ExtensionArray.astype
# Designing a reliable smoke test that works for arbitrary data types
# is difficult.
data = pd.Series(DecimalArray([decimal.Decimal(2)]), name="a")
ctx = decimal.Context()
ctx.prec = 5
if frame:
data = data.to_frame()
result = data.astype(DecimalDtype(ctx))
if frame:
result = result["a"]
assert result.dtype.context.prec == ctx.prec
class DecimalArrayWithoutFromSequence(DecimalArray):
"""Helper class for testing error handling in _from_sequence."""
@classmethod
def _from_sequence(cls, scalars, *, dtype=None, copy=False):
raise KeyError("For the test")
class DecimalArrayWithoutCoercion(DecimalArrayWithoutFromSequence):
@classmethod
def _create_arithmetic_method(cls, op):
return cls._create_method(op, coerce_to_dtype=False)
DecimalArrayWithoutCoercion._add_arithmetic_ops()
def test_combine_from_sequence_raises(monkeypatch):
# https://github.com/pandas-dev/pandas/issues/22850
cls = DecimalArrayWithoutFromSequence
@classmethod
def construct_array_type(cls):
return DecimalArrayWithoutFromSequence
monkeypatch.setattr(DecimalDtype, "construct_array_type", construct_array_type)
arr = cls([decimal.Decimal("1.0"), decimal.Decimal("2.0")])
ser = pd.Series(arr)
result = ser.combine(ser, operator.add)
# note: object dtype
expected = pd.Series(
[decimal.Decimal("2.0"), decimal.Decimal("4.0")], dtype="object"
)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"class_", [DecimalArrayWithoutFromSequence, DecimalArrayWithoutCoercion]
)
def test_scalar_ops_from_sequence_raises(class_):
# op(EA, EA) should return an EA, or an ndarray if it's not possible
# to return an EA with the return values.
arr = class_([decimal.Decimal("1.0"), decimal.Decimal("2.0")])
result = arr + arr
expected = np.array(
[decimal.Decimal("2.0"), decimal.Decimal("4.0")], dtype="object"
)
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize(
"reverse, expected_div, expected_mod",
[(False, [0, 1, 1, 2], [1, 0, 1, 0]), (True, [2, 1, 0, 0], [0, 0, 2, 2])],
)
def test_divmod_array(reverse, expected_div, expected_mod):
# https://github.com/pandas-dev/pandas/issues/22930
arr = to_decimal([1, 2, 3, 4])
if reverse:
div, mod = divmod(2, arr)
else:
div, mod = divmod(arr, 2)
expected_div = to_decimal(expected_div)
expected_mod = to_decimal(expected_mod)
tm.assert_extension_array_equal(div, expected_div)
tm.assert_extension_array_equal(mod, expected_mod)
def test_ufunc_fallback(data):
a = data[:5]
s = pd.Series(a, index=range(3, 8))
result = np.abs(s)
expected = pd.Series(np.abs(a), index=range(3, 8))
tm.assert_series_equal(result, expected)
def test_array_ufunc():
a = to_decimal([1, 2, 3])
result = np.exp(a)
expected = to_decimal(np.exp(a._data))
tm.assert_extension_array_equal(result, expected)
def test_array_ufunc_series():
a = to_decimal([1, 2, 3])
s = pd.Series(a)
result = np.exp(s)
expected = pd.Series(to_decimal(np.exp(a._data)))
tm.assert_series_equal(result, expected)
def test_array_ufunc_series_scalar_other():
# check _HANDLED_TYPES
a = to_decimal([1, 2, 3])
s = pd.Series(a)
result = np.add(s, decimal.Decimal(1))
expected = pd.Series(np.add(a, decimal.Decimal(1)))
tm.assert_series_equal(result, expected)
def test_array_ufunc_series_defer():
a = to_decimal([1, 2, 3])
s = pd.Series(a)
expected = pd.Series(to_decimal([2, 4, 6]))
r1 = np.add(s, a)
r2 = np.add(a, s)
tm.assert_series_equal(r1, expected)
tm.assert_series_equal(r2, expected)
def test_groupby_agg():
# Ensure that the result of agg is inferred to be decimal dtype
# https://github.com/pandas-dev/pandas/issues/29141
data = make_data()[:5]
df = pd.DataFrame(
{"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)}
)
# single key, selected column
expected = pd.Series(to_decimal([data[0], data[3]]))
result = df.groupby("id1")["decimals"].agg(lambda x: x.iloc[0])
tm.assert_series_equal(result, expected, check_names=False)
result = df["decimals"].groupby(df["id1"]).agg(lambda x: x.iloc[0])
tm.assert_series_equal(result, expected, check_names=False)
# multiple keys, selected column
expected = pd.Series(
to_decimal([data[0], data[1], data[3]]),
index=pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 1)]),
)
result = df.groupby(["id1", "id2"])["decimals"].agg(lambda x: x.iloc[0])
tm.assert_series_equal(result, expected, check_names=False)
result = df["decimals"].groupby([df["id1"], df["id2"]]).agg(lambda x: x.iloc[0])
tm.assert_series_equal(result, expected, check_names=False)
# multiple columns
expected = pd.DataFrame({"id2": [0, 1], "decimals": to_decimal([data[0], data[3]])})
result = df.groupby("id1").agg(lambda x: x.iloc[0])
tm.assert_frame_equal(result, expected, check_names=False)
def test_groupby_agg_ea_method(monkeypatch):
# Ensure that the result of agg is inferred to be decimal dtype
# https://github.com/pandas-dev/pandas/issues/29141
def DecimalArray__my_sum(self):
return np.sum(np.array(self))
monkeypatch.setattr(DecimalArray, "my_sum", DecimalArray__my_sum, raising=False)
data = make_data()[:5]
df = pd.DataFrame({"id": [0, 0, 0, 1, 1], "decimals": DecimalArray(data)})
expected = pd.Series(to_decimal([data[0] + data[1] + data[2], data[3] + data[4]]))
result = df.groupby("id")["decimals"].agg(lambda x: x.values.my_sum())
tm.assert_series_equal(result, expected, check_names=False)
s = pd.Series(DecimalArray(data))
grouper = np.array([0, 0, 0, 1, 1], dtype=np.int64)
result = s.groupby(grouper).agg(lambda x: x.values.my_sum())
tm.assert_series_equal(result, expected, check_names=False)
def test_indexing_no_materialize(monkeypatch):
# See https://github.com/pandas-dev/pandas/issues/29708
# Ensure that indexing operations do not materialize (convert to a numpy
# array) the ExtensionArray unnecessarily
def DecimalArray__array__(self, dtype=None):
raise Exception("tried to convert a DecimalArray to a numpy array")
monkeypatch.setattr(DecimalArray, "__array__", DecimalArray__array__, raising=False)
data = make_data()
s = pd.Series(DecimalArray(data))
df = pd.DataFrame({"a": s, "b": range(len(s))})
# ensure the following operations do not raise an error
s[s > 0.5]
df[s > 0.5]
s.at[0]
df.at[0, "a"]
def test_to_numpy_keyword():
# test the extra keyword
values = [decimal.Decimal("1.1111"), decimal.Decimal("2.2222")]
expected = np.array(
[decimal.Decimal("1.11"), decimal.Decimal("2.22")], dtype="object"
)
a = pd.array(values, dtype="decimal")
result = a.to_numpy(decimals=2)
tm.assert_numpy_array_equal(result, expected)
result = pd.Series(a).to_numpy(decimals=2)
tm.assert_numpy_array_equal(result, expected)
def test_array_copy_on_write(using_copy_on_write):
df = pd.DataFrame({"a": [decimal.Decimal(2), decimal.Decimal(3)]}, dtype="object")
df2 = df.astype(DecimalDtype())
df.iloc[0, 0] = 0
if using_copy_on_write:
expected = pd.DataFrame(
{"a": [decimal.Decimal(2), decimal.Decimal(3)]}, dtype=DecimalDtype()
)
tm.assert_equal(df2.values, expected.values)
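if __name__ == "__main__":
    # Hedged sketch (illustrative): because DecimalDtype is registered via
    # @register_extension_dtype, the string alias "decimal" works anywhere
    # a dtype is accepted.
    arr = pd.array([1, 2], dtype="decimal")
    print(arr, isinstance(arr, DecimalArray))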

View File

@ -0,0 +1,7 @@
from pandas.tests.extension.json.array import (
JSONArray,
JSONDtype,
make_data,
)
__all__ = ["JSONArray", "JSONDtype", "make_data"]

View File

@ -0,0 +1,256 @@
"""
Test extension array for storing nested data in a pandas container.
The JSONArray stores lists of dictionaries. The storage mechanism is a list,
not an ndarray.
Note
----
We currently store lists of UserDicts. Pandas has a few places
internally that specifically check for dicts, and does non-scalar things
in that case. We *want* the dictionaries to be treated as scalars, so we
hack around pandas by using UserDicts.
"""
from __future__ import annotations
from collections import (
UserDict,
abc,
)
import itertools
import numbers
import string
import sys
from typing import (
TYPE_CHECKING,
Any,
)
import numpy as np
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
from pandas.core.dtypes.common import (
is_bool_dtype,
is_list_like,
pandas_dtype,
)
import pandas as pd
from pandas.api.extensions import (
ExtensionArray,
ExtensionDtype,
)
from pandas.core.indexers import unpack_tuple_and_ellipses
if TYPE_CHECKING:
from collections.abc import Mapping
from pandas._typing import type_t
class JSONDtype(ExtensionDtype):
type = abc.Mapping
name = "json"
na_value: Mapping[str, Any] = UserDict()
@classmethod
def construct_array_type(cls) -> type_t[JSONArray]:
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
return JSONArray
class JSONArray(ExtensionArray):
dtype = JSONDtype()
__array_priority__ = 1000
def __init__(self, values, dtype=None, copy=False) -> None:
for val in values:
if not isinstance(val, self.dtype.type):
raise TypeError("All values must be of type " + str(self.dtype.type))
self.data = values
# Some aliases for common attribute names to ensure pandas supports
# these
self._items = self._data = self.data
# those aliases are currently not working due to assumptions
# in internal code (GH-20735)
# self._values = self.values = self.data
@classmethod
def _from_sequence(cls, scalars, *, dtype=None, copy=False):
return cls(scalars)
@classmethod
def _from_factorized(cls, values, original):
return cls([UserDict(x) for x in values if x != ()])
def __getitem__(self, item):
if isinstance(item, tuple):
item = unpack_tuple_and_ellipses(item)
if isinstance(item, numbers.Integral):
return self.data[item]
elif isinstance(item, slice) and item == slice(None):
# Make sure we get a view
return type(self)(self.data)
elif isinstance(item, slice):
# slice
return type(self)(self.data[item])
elif not is_list_like(item):
# e.g. "foo" or 2.5
# exception message copied from numpy
raise IndexError(
r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
r"(`None`) and integer or boolean arrays are valid indices"
)
else:
item = pd.api.indexers.check_array_indexer(self, item)
if is_bool_dtype(item.dtype):
return type(self)._from_sequence(
[x for x, m in zip(self, item) if m], dtype=self.dtype
)
# integer
return type(self)([self.data[i] for i in item])
def __setitem__(self, key, value) -> None:
if isinstance(key, numbers.Integral):
self.data[key] = value
else:
if not isinstance(value, (type(self), abc.Sequence)):
# broadcast value
value = itertools.cycle([value])
if isinstance(key, np.ndarray) and key.dtype == "bool":
# masking
for i, (k, v) in enumerate(zip(key, value)):
if k:
assert isinstance(v, self.dtype.type)
self.data[i] = v
else:
for k, v in zip(key, value):
assert isinstance(v, self.dtype.type)
self.data[k] = v
def __len__(self) -> int:
return len(self.data)
def __eq__(self, other):
return NotImplemented
def __ne__(self, other):
return NotImplemented
def __array__(self, dtype=None, copy=None):
if dtype is None:
dtype = object
if dtype == object:
# on py38 builds, numpy appears to infer a non-1D array here
return construct_1d_object_array_from_listlike(list(self))
return np.asarray(self.data, dtype=dtype)
@property
def nbytes(self) -> int:
return sys.getsizeof(self.data)
def isna(self):
return np.array([x == self.dtype.na_value for x in self.data], dtype=bool)
def take(self, indexer, allow_fill=False, fill_value=None):
# re-implement here, since NumPy has trouble setting
# sized objects like UserDicts into scalar slots of
# an ndarray.
indexer = np.asarray(indexer)
msg = (
"Index is out of bounds or cannot do a "
"non-empty take from an empty array."
)
if allow_fill:
if fill_value is None:
fill_value = self.dtype.na_value
# bounds check
if (indexer < -1).any():
raise ValueError
try:
output = [
self.data[loc] if loc != -1 else fill_value for loc in indexer
]
except IndexError as err:
raise IndexError(msg) from err
else:
try:
output = [self.data[loc] for loc in indexer]
except IndexError as err:
raise IndexError(msg) from err
return type(self)._from_sequence(output, dtype=self.dtype)
def copy(self):
return type(self)(self.data[:])
def astype(self, dtype, copy=True):
# NumPy has issues when all the dicts are the same length.
# np.array([UserDict(...), UserDict(...)]) fails,
# but np.array([{...}, {...}]) works, so cast.
from pandas.core.arrays.string_ import StringDtype
dtype = pandas_dtype(dtype)
# needed to add this check for the Series constructor
if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
if copy:
return self.copy()
return self
elif isinstance(dtype, StringDtype):
value = self.astype(str) # numpy doesn't like nested dicts
arr_cls = dtype.construct_array_type()
return arr_cls._from_sequence(value, dtype=dtype, copy=False)
elif not copy:
return np.asarray([dict(x) for x in self], dtype=dtype)
else:
return np.array([dict(x) for x in self], dtype=dtype, copy=copy)
def unique(self):
# Parent method doesn't work since np.array will try to infer
# a 2-dim object.
return type(self)([dict(x) for x in {tuple(d.items()) for d in self.data}])
@classmethod
def _concat_same_type(cls, to_concat):
data = list(itertools.chain.from_iterable(x.data for x in to_concat))
return cls(data)
def _values_for_factorize(self):
frozen = self._values_for_argsort()
if len(frozen) == 0:
# factorize_array expects 1-d array, this is a len-0 2-d array.
frozen = frozen.ravel()
return frozen, ()
def _values_for_argsort(self):
# Bypass NumPy's shape inference to get a (N,) array of tuples.
frozen = [tuple(x.items()) for x in self]
return construct_1d_object_array_from_listlike(frozen)
def _pad_or_backfill(self, *, method, limit=None, copy=True):
# GH#56616 - test EA method without limit_area argument
return super()._pad_or_backfill(method=method, limit=limit, copy=copy)
def make_data():
# TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer
rng = np.random.default_rng(2)
return [
UserDict(
[
(rng.choice(list(string.ascii_letters)), rng.integers(0, 100))
for _ in range(rng.integers(0, 10))
]
)
for _ in range(100)
]
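if __name__ == "__main__":
    # Hedged sketch (illustrative): JSONArray round-trip through a Series.
    arr = JSONArray([UserDict({"a": 1}), UserDict()])
    ser = pd.Series(arr)
    print(ser.dtype)   # json
    print(arr.isna())  # the empty UserDict() is the NA value -> [False  True]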

View File

@ -0,0 +1,490 @@
import collections
import operator
import sys
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.tests.extension import base
from pandas.tests.extension.json.array import (
JSONArray,
JSONDtype,
make_data,
)
# We intentionally don't run base.BaseSetitemTests because pandas'
# internals have trouble setting sequences of values into scalar positions.
unhashable = pytest.mark.xfail(reason="Unhashable")
@pytest.fixture
def dtype():
return JSONDtype()
@pytest.fixture
def data():
"""Length-100 PeriodArray for semantics test."""
data = make_data()
# Why the while loop? NumPy is unable to construct an ndarray from
# equal-length UserDicts. Many of our operations involve coercing the
# EA to an ndarray of objects. To avoid random test failures, we ensure
# that our data is coercible to an ndarray. Several tests deal with only
# the first two elements, so that's what we'll check.
while len(data[0]) == len(data[1]):
data = make_data()
return JSONArray(data)
@pytest.fixture
def data_missing():
"""Length 2 array with [NA, Valid]"""
return JSONArray([{}, {"a": 10}])
@pytest.fixture
def data_for_sorting():
return JSONArray([{"b": 1}, {"c": 4}, {"a": 2, "c": 3}])
@pytest.fixture
def data_missing_for_sorting():
return JSONArray([{"b": 1}, {}, {"a": 4}])
@pytest.fixture
def na_cmp():
return operator.eq
@pytest.fixture
def data_for_grouping():
return JSONArray(
[
{"b": 1},
{"b": 1},
{},
{},
{"a": 0, "c": 2},
{"a": 0, "c": 2},
{"b": 1},
{"c": 2},
]
)
class TestJSONArray(base.ExtensionTests):
@pytest.mark.xfail(
reason="comparison method not implemented for JSONArray (GH-37867)"
)
def test_contains(self, data):
# GH-37867
super().test_contains(data)
@pytest.mark.xfail(reason="not implemented constructor from dtype")
def test_from_dtype(self, data):
# construct from our dtype & string dtype
super().test_from_dtype(data)
@pytest.mark.xfail(reason="RecursionError, GH-33900")
def test_series_constructor_no_data_with_index(self, dtype, na_value):
# RecursionError: maximum recursion depth exceeded in comparison
rec_limit = sys.getrecursionlimit()
try:
# Limit to avoid stack overflow on Windows CI
sys.setrecursionlimit(100)
super().test_series_constructor_no_data_with_index(dtype, na_value)
finally:
sys.setrecursionlimit(rec_limit)
@pytest.mark.xfail(reason="RecursionError, GH-33900")
def test_series_constructor_scalar_na_with_index(self, dtype, na_value):
# RecursionError: maximum recursion depth exceeded in comparison
rec_limit = sys.getrecursionlimit()
try:
# Limit to avoid stack overflow on Windows CI
sys.setrecursionlimit(100)
super().test_series_constructor_scalar_na_with_index(dtype, na_value)
finally:
sys.setrecursionlimit(rec_limit)
@pytest.mark.xfail(reason="collection as scalar, GH-33901")
def test_series_constructor_scalar_with_index(self, data, dtype):
# TypeError: All values must be of type <class 'collections.abc.Mapping'>
rec_limit = sys.getrecursionlimit()
try:
# Limit to avoid stack overflow on Windows CI
sys.setrecursionlimit(100)
super().test_series_constructor_scalar_with_index(data, dtype)
finally:
sys.setrecursionlimit(rec_limit)
@pytest.mark.xfail(reason="Different definitions of NA")
def test_stack(self):
"""
The test does .astype(object).stack(future_stack=True). If we happen to have
any missing values in `data`, then we'll end up with different
rows since we consider `{}` NA, but `.astype(object)` doesn't.
"""
super().test_stack()
@pytest.mark.xfail(reason="dict for NA")
def test_unstack(self, data, index):
# The base test has NaN for the expected NA value;
# otherwise this matches.
return super().test_unstack(data, index)
@pytest.mark.xfail(reason="Setting a dict as a scalar")
def test_fillna_series(self):
"""We treat dictionaries as a mapping in fillna, not a scalar."""
super().test_fillna_series()
@pytest.mark.xfail(reason="Setting a dict as a scalar")
def test_fillna_frame(self):
"""We treat dictionaries as a mapping in fillna, not a scalar."""
super().test_fillna_frame()
@pytest.mark.parametrize(
"limit_area, input_ilocs, expected_ilocs",
[
("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]),
("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]),
("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]),
("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]),
("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]),
("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]),
("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]),
("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]),
],
)
def test_ffill_limit_area(
self, data_missing, limit_area, input_ilocs, expected_ilocs
):
# GH#56616
msg = "JSONArray does not implement limit_area"
with pytest.raises(NotImplementedError, match=msg):
super().test_ffill_limit_area(
data_missing, limit_area, input_ilocs, expected_ilocs
)
@unhashable
def test_value_counts(self, all_data, dropna):
super().test_value_counts(all_data, dropna)
@unhashable
def test_value_counts_with_normalize(self, data):
super().test_value_counts_with_normalize(data)
@unhashable
def test_sort_values_frame(self):
# TODO (EA.factorize): see if _values_for_factorize allows this.
super().test_sort_values_frame()
@pytest.mark.parametrize("ascending", [True, False])
def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
super().test_sort_values(data_for_sorting, ascending, sort_by_key)
@pytest.mark.parametrize("ascending", [True, False])
def test_sort_values_missing(
self, data_missing_for_sorting, ascending, sort_by_key
):
super().test_sort_values_missing(
data_missing_for_sorting, ascending, sort_by_key
)
@pytest.mark.xfail(reason="combine for JSONArray not supported")
def test_combine_le(self, data_repeated):
super().test_combine_le(data_repeated)
@pytest.mark.xfail(
reason="combine for JSONArray not supported - "
"may pass depending on random data",
strict=False,
raises=AssertionError,
)
def test_combine_first(self, data):
super().test_combine_first(data)
@pytest.mark.xfail(reason="broadcasting error")
def test_where_series(self, data, na_value):
# Fails with
# *** ValueError: operands could not be broadcast together
# with shapes (4,) (4,) (0,)
super().test_where_series(data, na_value)
@pytest.mark.xfail(reason="Can't compare dicts.")
def test_searchsorted(self, data_for_sorting):
super().test_searchsorted(data_for_sorting)
@pytest.mark.xfail(reason="Can't compare dicts.")
def test_equals(self, data, na_value, as_series):
super().test_equals(data, na_value, as_series)
@pytest.mark.skip("fill-value is interpreted as a dict of values")
def test_fillna_copy_frame(self, data_missing):
super().test_fillna_copy_frame(data_missing)
def test_equals_same_data_different_object(
self, data, using_copy_on_write, request
):
if using_copy_on_write:
mark = pytest.mark.xfail(reason="Fails with CoW")
request.applymarker(mark)
super().test_equals_same_data_different_object(data)
@pytest.mark.xfail(reason="failing on np.array(self, dtype=str)")
def test_astype_str(self):
"""This currently fails in NumPy on np.array(self, dtype=str) with
*** ValueError: setting an array element with a sequence
"""
super().test_astype_str()
@unhashable
def test_groupby_extension_transform(self):
"""
This currently fails in Series.name.setter, since the
name must be hashable, but the value is a dictionary.
I think this is what we want, i.e. `.name` should be the original
values, and not the values for factorization.
"""
super().test_groupby_extension_transform()
@unhashable
def test_groupby_extension_apply(self):
"""
This fails in Index._do_unique_check with
> hash(val)
E TypeError: unhashable type: 'UserDict'
I suspect that once we support Index[ExtensionArray],
we'll be able to dispatch unique.
"""
super().test_groupby_extension_apply()
@unhashable
def test_groupby_extension_agg(self):
"""
This fails when we get to tm.assert_series_equal when left.index
contains dictionaries, which are not hashable.
"""
super().test_groupby_extension_agg()
@unhashable
def test_groupby_extension_no_sort(self):
"""
This fails when we get to tm.assert_series_equal when left.index
contains dictionaries, which are not hashable.
"""
super().test_groupby_extension_no_sort()
def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
if len(data[0]) != 1:
mark = pytest.mark.xfail(reason="raises in coercing to Series")
request.applymarker(mark)
super().test_arith_frame_with_scalar(data, all_arithmetic_operators)
def test_compare_array(self, data, comparison_op, request):
if comparison_op.__name__ in ["eq", "ne"]:
mark = pytest.mark.xfail(reason="Comparison methods not implemented")
request.applymarker(mark)
super().test_compare_array(data, comparison_op)
@pytest.mark.xfail(reason="ValueError: Must have equal len keys and value")
def test_setitem_loc_scalar_mixed(self, data):
super().test_setitem_loc_scalar_mixed(data)
@pytest.mark.xfail(reason="ValueError: Must have equal len keys and value")
def test_setitem_loc_scalar_multiple_homogoneous(self, data):
super().test_setitem_loc_scalar_multiple_homogoneous(data)
@pytest.mark.xfail(reason="ValueError: Must have equal len keys and value")
def test_setitem_iloc_scalar_mixed(self, data):
super().test_setitem_iloc_scalar_mixed(data)
@pytest.mark.xfail(reason="ValueError: Must have equal len keys and value")
def test_setitem_iloc_scalar_multiple_homogoneous(self, data):
super().test_setitem_iloc_scalar_multiple_homogoneous(data)
@pytest.mark.parametrize(
"mask",
[
np.array([True, True, True, False, False]),
pd.array([True, True, True, False, False], dtype="boolean"),
pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"),
],
ids=["numpy-array", "boolean-array", "boolean-array-na"],
)
def test_setitem_mask(self, data, mask, box_in_series, request):
if box_in_series:
mark = pytest.mark.xfail(
reason="cannot set using a list-like indexer with a different length"
)
request.applymarker(mark)
elif not isinstance(mask, np.ndarray):
mark = pytest.mark.xfail(reason="Issues unwanted DeprecationWarning")
request.applymarker(mark)
super().test_setitem_mask(data, mask, box_in_series)
def test_setitem_mask_raises(self, data, box_in_series, request):
if not box_in_series:
mark = pytest.mark.xfail(reason="Fails to raise")
request.applymarker(mark)
super().test_setitem_mask_raises(data, box_in_series)
@pytest.mark.xfail(
reason="cannot set using a list-like indexer with a different length"
)
def test_setitem_mask_boolean_array_with_na(self, data, box_in_series):
super().test_setitem_mask_boolean_array_with_na(data, box_in_series)
@pytest.mark.parametrize(
"idx",
[[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
ids=["list", "integer-array", "numpy-array"],
)
def test_setitem_integer_array(self, data, idx, box_in_series, request):
if box_in_series:
mark = pytest.mark.xfail(
reason="cannot set using a list-like indexer with a different length"
)
request.applymarker(mark)
super().test_setitem_integer_array(data, idx, box_in_series)
@pytest.mark.xfail(reason="list indices must be integers or slices, not NAType")
@pytest.mark.parametrize(
"idx, box_in_series",
[
([0, 1, 2, pd.NA], False),
pytest.param(
[0, 1, 2, pd.NA], True, marks=pytest.mark.xfail(reason="GH-31948")
),
(pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
(pd.array([0, 1, 2, pd.NA], dtype="Int64"), True),
],
ids=["list-False", "list-True", "integer-array-False", "integer-array-True"],
)
def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series):
super().test_setitem_integer_with_missing_raises(data, idx, box_in_series)
@pytest.mark.xfail(reason="Fails to raise")
def test_setitem_scalar_key_sequence_raise(self, data):
super().test_setitem_scalar_key_sequence_raise(data)
def test_setitem_with_expansion_dataframe_column(self, data, full_indexer, request):
if "full_slice" in request.node.name:
mark = pytest.mark.xfail(reason="slice is not iterable")
request.applymarker(mark)
super().test_setitem_with_expansion_dataframe_column(data, full_indexer)
@pytest.mark.xfail(reason="slice is not iterable")
def test_setitem_frame_2d_values(self, data):
super().test_setitem_frame_2d_values(data)
@pytest.mark.xfail(
reason="cannot set using a list-like indexer with a different length"
)
@pytest.mark.parametrize("setter", ["loc", None])
def test_setitem_mask_broadcast(self, data, setter):
super().test_setitem_mask_broadcast(data, setter)
@pytest.mark.xfail(
reason="cannot set using a slice indexer with a different length"
)
def test_setitem_slice(self, data, box_in_series):
super().test_setitem_slice(data, box_in_series)
@pytest.mark.xfail(reason="slice object is not iterable")
def test_setitem_loc_iloc_slice(self, data):
super().test_setitem_loc_iloc_slice(data)
@pytest.mark.xfail(reason="slice object is not iterable")
def test_setitem_slice_mismatch_length_raises(self, data):
super().test_setitem_slice_mismatch_length_raises(data)
@pytest.mark.xfail(reason="slice object is not iterable")
def test_setitem_slice_array(self, data):
super().test_setitem_slice_array(data)
@pytest.mark.xfail(reason="Fail to raise")
def test_setitem_invalid(self, data, invalid_scalar):
super().test_setitem_invalid(data, invalid_scalar)
@pytest.mark.xfail(reason="only integer scalar arrays can be converted")
def test_setitem_2d_values(self, data):
super().test_setitem_2d_values(data)
@pytest.mark.xfail(reason="data type 'json' not understood")
@pytest.mark.parametrize("engine", ["c", "python"])
def test_EA_types(self, engine, data, request):
super().test_EA_types(engine, data, request)
def custom_assert_series_equal(left, right, *args, **kwargs):
# NumPy doesn't handle an array of equal-length UserDicts.
# The default assert_series_equal eventually does a
# Series.values, which raises. We work around it by
# converting the UserDicts to dicts.
if left.dtype.name == "json":
assert left.dtype == right.dtype
left = pd.Series(
JSONArray(left.values.astype(object)), index=left.index, name=left.name
)
right = pd.Series(
JSONArray(right.values.astype(object)),
index=right.index,
name=right.name,
)
tm.assert_series_equal(left, right, *args, **kwargs)
def custom_assert_frame_equal(left, right, *args, **kwargs):
obj_type = kwargs.get("obj", "DataFrame")
tm.assert_index_equal(
left.columns,
right.columns,
exact=kwargs.get("check_column_type", "equiv"),
check_names=kwargs.get("check_names", True),
check_exact=kwargs.get("check_exact", False),
check_categorical=kwargs.get("check_categorical", True),
obj=f"{obj_type}.columns",
)
    # select only the json columns for the custom comparison
    jsons = left.dtypes[left.dtypes == "json"].index
for col in jsons:
custom_assert_series_equal(left[col], right[col], *args, **kwargs)
left = left.drop(columns=jsons)
right = right.drop(columns=jsons)
tm.assert_frame_equal(left, right, *args, **kwargs)
def test_custom_asserts():
# This would always trigger the KeyError from trying to put
# an array of equal-length UserDicts inside an ndarray.
data = JSONArray(
[
collections.UserDict({"a": 1}),
collections.UserDict({"b": 2}),
collections.UserDict({"c": 3}),
]
)
a = pd.Series(data)
custom_assert_series_equal(a, a)
custom_assert_frame_equal(a.to_frame(), a.to_frame())
b = pd.Series(data.take([0, 0, 1]))
msg = r"Series are different"
with pytest.raises(AssertionError, match=msg):
custom_assert_series_equal(a, b)
with pytest.raises(AssertionError, match=msg):
custom_assert_frame_equal(a.to_frame(), b.to_frame())
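# Illustrative sketch, not part of the original test module: a minimal
# reproduction of the failure the custom asserts above work around.
# NumPy's shape inference treats a UserDict as a sequence (it defines
# __len__ and __getitem__) and then indexes it with integers, which
# raises the KeyError for equal-length UserDicts; plain dicts are
# treated as scalars and land safely in an object ndarray.
def _demo_userdict_in_ndarray():
    import collections

    import numpy as np

    try:
        np.array([collections.UserDict({"a": 1}), collections.UserDict({"b": 2})])
    except KeyError:
        pass  # the failure mode described above
    # converting to plain dicts sidesteps the sequence protocol
    arr = np.array([{"a": 1}, {"b": 2}], dtype=object)
    assert arr.shape == (2,)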

View File

@ -0,0 +1,7 @@
from pandas.tests.extension.list.array import (
ListArray,
ListDtype,
make_data,
)
__all__ = ["ListArray", "ListDtype", "make_data"]

View File

@ -0,0 +1,137 @@
"""
Test extension array for storing nested data in a pandas container.
The ListArray stores an ndarray of lists.
"""
from __future__ import annotations
import numbers
import string
from typing import TYPE_CHECKING
import numpy as np
from pandas.core.dtypes.base import ExtensionDtype
import pandas as pd
from pandas.api.types import (
is_object_dtype,
is_string_dtype,
)
from pandas.core.arrays import ExtensionArray
if TYPE_CHECKING:
from pandas._typing import type_t
class ListDtype(ExtensionDtype):
type = list
name = "list"
na_value = np.nan
@classmethod
def construct_array_type(cls) -> type_t[ListArray]:
"""
Return the array type associated with this dtype.
Returns
-------
type
"""
return ListArray
class ListArray(ExtensionArray):
dtype = ListDtype()
__array_priority__ = 1000
def __init__(self, values, dtype=None, copy=False) -> None:
if not isinstance(values, np.ndarray):
raise TypeError("Need to pass a numpy array as values")
for val in values:
if not isinstance(val, self.dtype.type) and not pd.isna(val):
raise TypeError("All values must be of type " + str(self.dtype.type))
self.data = values
@classmethod
def _from_sequence(cls, scalars, *, dtype=None, copy=False):
data = np.empty(len(scalars), dtype=object)
data[:] = scalars
return cls(data)
def __getitem__(self, item):
if isinstance(item, numbers.Integral):
return self.data[item]
else:
# slice, list-like, mask
return type(self)(self.data[item])
def __len__(self) -> int:
return len(self.data)
def isna(self):
return np.array(
[not isinstance(x, list) and np.isnan(x) for x in self.data], dtype=bool
)
def take(self, indexer, allow_fill=False, fill_value=None):
# re-implement here, since NumPy has trouble setting
# sized objects like UserDicts into scalar slots of
        # an ndarray.
indexer = np.asarray(indexer)
msg = (
"Index is out of bounds or cannot do a "
"non-empty take from an empty array."
)
if allow_fill:
if fill_value is None:
fill_value = self.dtype.na_value
# bounds check
if (indexer < -1).any():
raise ValueError
try:
output = [
self.data[loc] if loc != -1 else fill_value for loc in indexer
]
except IndexError as err:
raise IndexError(msg) from err
else:
try:
output = [self.data[loc] for loc in indexer]
except IndexError as err:
raise IndexError(msg) from err
return self._from_sequence(output)
def copy(self):
return type(self)(self.data[:])
def astype(self, dtype, copy=True):
if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
if copy:
return self.copy()
return self
elif is_string_dtype(dtype) and not is_object_dtype(dtype):
# numpy has problems with astype(str) for nested elements
return np.array([str(x) for x in self.data], dtype=dtype)
elif not copy:
return np.asarray(self.data, dtype=dtype)
else:
return np.array(self.data, dtype=dtype, copy=copy)
@classmethod
def _concat_same_type(cls, to_concat):
data = np.concatenate([x.data for x in to_concat])
return cls(data)
def make_data():
# TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer
rng = np.random.default_rng(2)
data = np.empty(100, dtype=object)
data[:] = [
[rng.choice(list(string.ascii_letters)) for _ in range(rng.integers(0, 10))]
for _ in range(100)
]
return data
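# Illustrative sketch, not part of the original module: round-tripping
# make_data() through ListArray and exercising the astype(str) branch,
# which converts elementwise because NumPy's astype(str) struggles with
# nested elements.
def _demo_list_array():
    arr = ListArray(make_data())
    assert len(arr) == 100
    as_str = arr.astype(str)  # plain ndarray of strings
    assert as_str.dtype.kind == "U"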

View File

@ -0,0 +1,33 @@
import pytest
import pandas as pd
from pandas.tests.extension.list.array import (
ListArray,
ListDtype,
make_data,
)
@pytest.fixture
def dtype():
return ListDtype()
@pytest.fixture
def data():
"""Length-100 ListArray for semantics test."""
data = make_data()
while len(data[0]) == len(data[1]):
data = make_data()
return ListArray(data)
def test_to_csv(data):
# https://github.com/pandas-dev/pandas/issues/28840
# array with list-likes fail when doing astype(str) on the numpy array
# which was done in get_values_for_csv
df = pd.DataFrame({"a": data})
res = df.to_csv()
assert str(data[0]) in res

File diff suppressed because it is too large

View File

@ -0,0 +1,200 @@
"""
This file contains a minimal set of tests for compliance with the extension
array interface test suite, and should contain no other tests.
The test suite for the full functionality of the array is located in
`pandas/tests/arrays/`.
The tests in this file are inherited from the BaseExtensionTests, and only
minimal tweaks should be applied to get the tests passing (by overwriting a
parent method).
Additional tests should either be added to one of the BaseExtensionTests
classes (if they are relevant for the extension interface for all dtypes), or
be added to the array-specific tests in `pandas/tests/arrays/`.
"""
import string
import numpy as np
import pytest
from pandas._config import using_pyarrow_string_dtype
import pandas as pd
from pandas import Categorical
import pandas._testing as tm
from pandas.api.types import CategoricalDtype
from pandas.tests.extension import base
def make_data():
while True:
values = np.random.default_rng(2).choice(list(string.ascii_letters), size=100)
# ensure we meet the requirements
# 1. first two not null
# 2. first and second are different
if values[0] != values[1]:
break
return values
@pytest.fixture
def dtype():
return CategoricalDtype()
@pytest.fixture
def data():
"""Length-100 array for this type.
* data[0] and data[1] should both be non missing
* data[0] and data[1] should not be equal
"""
return Categorical(make_data())
@pytest.fixture
def data_missing():
"""Length 2 array with [NA, Valid]"""
return Categorical([np.nan, "A"])
@pytest.fixture
def data_for_sorting():
return Categorical(["A", "B", "C"], categories=["C", "A", "B"], ordered=True)
@pytest.fixture
def data_missing_for_sorting():
return Categorical(["A", None, "B"], categories=["B", "A"], ordered=True)
@pytest.fixture
def data_for_grouping():
return Categorical(["a", "a", None, None, "b", "b", "a", "c"])
class TestCategorical(base.ExtensionTests):
@pytest.mark.xfail(reason="Memory usage doesn't match")
def test_memory_usage(self, data):
# TODO: Is this deliberate?
super().test_memory_usage(data)
def test_contains(self, data, data_missing):
# GH-37867
# na value handling in Categorical.__contains__ is deprecated.
        # See base.BaseInterfaceTests.test_contains for more details.
na_value = data.dtype.na_value
# ensure data without missing values
data = data[~data.isna()]
# first elements are non-missing
assert data[0] in data
assert data_missing[0] in data_missing
# check the presence of na_value
assert na_value in data_missing
assert na_value not in data
# Categoricals can contain other nan-likes than na_value
for na_value_obj in tm.NULL_OBJECTS:
if na_value_obj is na_value:
continue
assert na_value_obj not in data
# this section suffers from super method
if not using_pyarrow_string_dtype():
assert na_value_obj in data_missing
def test_empty(self, dtype):
cls = dtype.construct_array_type()
result = cls._empty((4,), dtype=dtype)
assert isinstance(result, cls)
# the dtype we passed is not initialized, so will not match the
# dtype on our result.
assert result.dtype == CategoricalDtype([])
@pytest.mark.skip(reason="Backwards compatibility")
def test_getitem_scalar(self, data):
# CategoricalDtype.type isn't "correct" since it should
# be a parent of the elements (object). But don't want
# to break things by changing.
super().test_getitem_scalar(data)
@pytest.mark.xfail(reason="Unobserved categories included")
def test_value_counts(self, all_data, dropna):
return super().test_value_counts(all_data, dropna)
def test_combine_add(self, data_repeated):
# GH 20825
# When adding categoricals in combine, result is a string
orig_data1, orig_data2 = data_repeated(2)
s1 = pd.Series(orig_data1)
s2 = pd.Series(orig_data2)
result = s1.combine(s2, lambda x1, x2: x1 + x2)
expected = pd.Series(
[a + b for (a, b) in zip(list(orig_data1), list(orig_data2))]
)
tm.assert_series_equal(result, expected)
val = s1.iloc[0]
result = s1.combine(val, lambda x1, x2: x1 + x2)
expected = pd.Series([a + val for a in list(orig_data1)])
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("na_action", [None, "ignore"])
def test_map(self, data, na_action):
result = data.map(lambda x: x, na_action=na_action)
tm.assert_extension_array_equal(result, data)
def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
# frame & scalar
op_name = all_arithmetic_operators
if op_name == "__rmod__":
request.applymarker(
pytest.mark.xfail(
reason="rmod never called when string is first argument"
)
)
super().test_arith_frame_with_scalar(data, op_name)
def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request):
op_name = all_arithmetic_operators
if op_name == "__rmod__":
request.applymarker(
pytest.mark.xfail(
reason="rmod never called when string is first argument"
)
)
super().test_arith_series_with_scalar(data, op_name)
def _compare_other(self, ser: pd.Series, data, op, other):
op_name = f"__{op.__name__}__"
if op_name not in ["__eq__", "__ne__"]:
msg = "Unordered Categoricals can only compare equality or not"
with pytest.raises(TypeError, match=msg):
op(data, other)
else:
return super()._compare_other(ser, data, op, other)
@pytest.mark.xfail(reason="Categorical overrides __repr__")
@pytest.mark.parametrize("size", ["big", "small"])
def test_array_repr(self, data, size):
super().test_array_repr(data, size)
@pytest.mark.xfail(reason="TBD")
@pytest.mark.parametrize("as_index", [True, False])
def test_groupby_extension_agg(self, as_index, data_for_grouping):
super().test_groupby_extension_agg(as_index, data_for_grouping)
class Test2DCompat(base.NDArrayBacked2DTests):
def test_repr_2d(self, data):
# Categorical __repr__ doesn't include "Categorical", so we need
# to special-case
res = repr(data.reshape(1, -1))
assert res.count("\nCategories") == 1
res = repr(data.reshape(-1, 1))
assert res.count("\nCategories") == 1
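# Illustrative sketch, not part of the original module: the NA semantics
# test_contains above exercises, shown on a concrete Categorical. The
# dtype's na_value (np.nan) reports membership only when missing values
# are actually present.
def _demo_categorical_contains():
    assert "a" in Categorical(["a", "b"])
    assert np.nan in Categorical(["a", None])
    assert np.nan not in Categorical(["a", "b"])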

View File

@ -0,0 +1,105 @@
import numpy as np
import pytest
from pandas.core.dtypes import dtypes
from pandas.core.dtypes.common import is_extension_array_dtype
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import ExtensionArray
class DummyDtype(dtypes.ExtensionDtype):
pass
class DummyArray(ExtensionArray):
def __init__(self, data) -> None:
self.data = data
def __array__(self, dtype=None, copy=None):
return self.data
@property
def dtype(self):
return DummyDtype()
def astype(self, dtype, copy=True):
# we don't support anything but a single dtype
if isinstance(dtype, DummyDtype):
if copy:
return type(self)(self.data)
return self
elif not copy:
return np.asarray(self, dtype=dtype)
else:
return np.array(self, dtype=dtype, copy=copy)
class TestExtensionArrayDtype:
@pytest.mark.parametrize(
"values",
[
pd.Categorical([]),
pd.Categorical([]).dtype,
pd.Series(pd.Categorical([])),
DummyDtype(),
DummyArray(np.array([1, 2])),
],
)
def test_is_extension_array_dtype(self, values):
assert is_extension_array_dtype(values)
@pytest.mark.parametrize("values", [np.array([]), pd.Series(np.array([]))])
def test_is_not_extension_array_dtype(self, values):
assert not is_extension_array_dtype(values)
def test_astype():
arr = DummyArray(np.array([1, 2, 3]))
expected = np.array([1, 2, 3], dtype=object)
result = arr.astype(object)
tm.assert_numpy_array_equal(result, expected)
result = arr.astype("object")
tm.assert_numpy_array_equal(result, expected)
def test_astype_no_copy():
arr = DummyArray(np.array([1, 2, 3], dtype=np.int64))
result = arr.astype(arr.dtype, copy=False)
assert arr is result
result = arr.astype(arr.dtype)
assert arr is not result
@pytest.mark.parametrize("dtype", [dtypes.CategoricalDtype(), dtypes.IntervalDtype()])
def test_is_extension_array_dtype(dtype):
assert isinstance(dtype, dtypes.ExtensionDtype)
assert is_extension_array_dtype(dtype)
class CapturingStringArray(pd.arrays.StringArray):
"""Extend StringArray to capture arguments to __getitem__"""
def __getitem__(self, item):
self.last_item_arg = item
return super().__getitem__(item)
def test_ellipsis_index():
# GH#42430 1D slices over extension types turn into N-dimensional slices
# over ExtensionArrays
df = pd.DataFrame(
{"col1": CapturingStringArray(np.array(["hello", "world"], dtype=object))}
)
_ = df.iloc[:1]
# String comparison because there's no native way to compare slices.
# Before the fix for GH#42430, last_item_arg would get set to the 2D slice
# (Ellipsis, slice(None, 1, None))
out = df["col1"].array.last_item_arg
assert str(out) == "slice(None, 1, None)"

View File

@ -0,0 +1,144 @@
"""
This file contains a minimal set of tests for compliance with the extension
array interface test suite, and should contain no other tests.
The test suite for the full functionality of the array is located in
`pandas/tests/arrays/`.
The tests in this file are inherited from the BaseExtensionTests, and only
minimal tweaks should be applied to get the tests passing (by overwriting a
parent method).
Additional tests should either be added to one of the BaseExtensionTests
classes (if they are relevant for the extension interface for all dtypes), or
be added to the array-specific tests in `pandas/tests/arrays/`.
"""
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import DatetimeTZDtype
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import DatetimeArray
from pandas.tests.extension import base
@pytest.fixture(params=["US/Central"])
def dtype(request):
return DatetimeTZDtype(unit="ns", tz=request.param)
@pytest.fixture
def data(dtype):
data = DatetimeArray._from_sequence(
pd.date_range("2000", periods=100, tz=dtype.tz), dtype=dtype
)
return data
@pytest.fixture
def data_missing(dtype):
return DatetimeArray._from_sequence(
np.array(["NaT", "2000-01-01"], dtype="datetime64[ns]"), dtype=dtype
)
@pytest.fixture
def data_for_sorting(dtype):
a = pd.Timestamp("2000-01-01")
b = pd.Timestamp("2000-01-02")
c = pd.Timestamp("2000-01-03")
return DatetimeArray._from_sequence(
np.array([b, c, a], dtype="datetime64[ns]"), dtype=dtype
)
@pytest.fixture
def data_missing_for_sorting(dtype):
a = pd.Timestamp("2000-01-01")
b = pd.Timestamp("2000-01-02")
return DatetimeArray._from_sequence(
np.array([b, "NaT", a], dtype="datetime64[ns]"), dtype=dtype
)
@pytest.fixture
def data_for_grouping(dtype):
"""
Expected to be like [B, B, NA, NA, A, A, B, C]
Where A < B < C and NA is missing
"""
a = pd.Timestamp("2000-01-01")
b = pd.Timestamp("2000-01-02")
c = pd.Timestamp("2000-01-03")
na = "NaT"
return DatetimeArray._from_sequence(
np.array([b, b, na, na, a, a, b, c], dtype="datetime64[ns]"), dtype=dtype
)
@pytest.fixture
def na_cmp():
def cmp(a, b):
return a is pd.NaT and a is b
return cmp
# ----------------------------------------------------------------------------
class TestDatetimeArray(base.ExtensionTests):
def _get_expected_exception(self, op_name, obj, other):
if op_name in ["__sub__", "__rsub__"]:
return None
return super()._get_expected_exception(op_name, obj, other)
def _supports_accumulation(self, ser, op_name: str) -> bool:
return op_name in ["cummin", "cummax"]
def _supports_reduction(self, obj, op_name: str) -> bool:
return op_name in ["min", "max", "median", "mean", "std", "any", "all"]
@pytest.mark.parametrize("skipna", [True, False])
def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna):
meth = all_boolean_reductions
msg = f"'{meth}' with datetime64 dtypes is deprecated and will raise in"
with tm.assert_produces_warning(
FutureWarning, match=msg, check_stacklevel=False
):
super().test_reduce_series_boolean(data, all_boolean_reductions, skipna)
def test_series_constructor(self, data):
# Series construction drops any .freq attr
data = data._with_freq(None)
super().test_series_constructor(data)
@pytest.mark.parametrize("na_action", [None, "ignore"])
def test_map(self, data, na_action):
result = data.map(lambda x: x, na_action=na_action)
tm.assert_extension_array_equal(result, data)
def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
if op_name in ["median", "mean", "std"]:
alt = ser.astype("int64")
res_op = getattr(ser, op_name)
exp_op = getattr(alt, op_name)
result = res_op(skipna=skipna)
expected = exp_op(skipna=skipna)
if op_name in ["mean", "median"]:
# error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype"
# has no attribute "tz"
tz = ser.dtype.tz # type: ignore[union-attr]
expected = pd.Timestamp(expected, tz=tz)
else:
expected = pd.Timedelta(expected)
tm.assert_almost_equal(result, expected)
else:
return super().check_reduce(ser, op_name, skipna)
class Test2DCompat(base.NDArrayBacked2DTests):
pass
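# Illustrative sketch, not part of the original module: the shape of the
# comparison in check_reduce above. "mean"/"median" of a tz-aware Series
# come back as a tz-aware Timestamp, which is why the int64-based
# expectation gets re-wrapped with pd.Timestamp.
def _demo_tz_aware_mean():
    ser = pd.Series(pd.date_range("2000", periods=3, tz="US/Central"))
    result = ser.mean()
    assert isinstance(result, pd.Timestamp)
    assert result.tz is not None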

View File

@ -0,0 +1,26 @@
"""
Tests for behavior if an author does *not* implement EA methods.
"""
import numpy as np
import pytest
from pandas.core.arrays import ExtensionArray
class MyEA(ExtensionArray):
def __init__(self, values) -> None:
self._values = values
@pytest.fixture
def data():
arr = np.arange(10)
return MyEA(arr)
class TestExtensionArray:
def test_errors(self, data, all_arithmetic_operators):
# invalid ops
op_name = all_arithmetic_operators
with pytest.raises(AttributeError):
getattr(data, op_name)
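# Illustrative sketch, not part of the original module: what test_errors
# above relies on. The bare ExtensionArray base class does not define the
# arithmetic dunders (pandas adds those via ops mixins), so mere
# attribute access raises AttributeError.
def _demo_missing_op():
    arr = MyEA(np.arange(10))
    try:
        getattr(arr, "__add__")
    except AttributeError:
        pass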

View File

@ -0,0 +1,98 @@
"""
This file contains a minimal set of tests for compliance with the extension
array interface test suite, and should contain no other tests.
The test suite for the full functionality of the array is located in
`pandas/tests/arrays/`.
The tests in this file are inherited from the BaseExtensionTests, and only
minimal tweaks should be applied to get the tests passing (by overwriting a
parent method).
Additional tests should either be added to one of the BaseExtensionTests
classes (if they are relevant for the extension interface for all dtypes), or
be added to the array-specific tests in `pandas/tests/arrays/`.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import IntervalDtype
from pandas import Interval
from pandas.core.arrays import IntervalArray
from pandas.tests.extension import base
if TYPE_CHECKING:
import pandas as pd
def make_data():
N = 100
left_array = np.random.default_rng(2).uniform(size=N).cumsum()
right_array = left_array + np.random.default_rng(2).uniform(size=N)
return [Interval(left, right) for left, right in zip(left_array, right_array)]
@pytest.fixture
def dtype():
return IntervalDtype()
@pytest.fixture
def data():
"""Length-100 PeriodArray for semantics test."""
return IntervalArray(make_data())
@pytest.fixture
def data_missing():
"""Length 2 array with [NA, Valid]"""
return IntervalArray.from_tuples([None, (0, 1)])
@pytest.fixture
def data_for_twos():
pytest.skip("Interval is not a numeric dtype")
@pytest.fixture
def data_for_sorting():
return IntervalArray.from_tuples([(1, 2), (2, 3), (0, 1)])
@pytest.fixture
def data_missing_for_sorting():
return IntervalArray.from_tuples([(1, 2), None, (0, 1)])
@pytest.fixture
def data_for_grouping():
a = (0, 1)
b = (1, 2)
c = (2, 3)
return IntervalArray.from_tuples([b, b, None, None, a, a, b, c])
class TestIntervalArray(base.ExtensionTests):
divmod_exc = TypeError
def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
return op_name in ["min", "max"]
@pytest.mark.xfail(
reason="Raises with incorrect message bc it disallows *all* listlikes "
"instead of just wrong-length listlikes"
)
def test_fillna_length_mismatch(self, data_missing):
super().test_fillna_length_mismatch(data_missing)
# TODO: either belongs in tests.arrays.interval or move into base tests.
def test_fillna_non_scalar_raises(data_missing):
msg = "can only insert Interval objects and NA into an IntervalArray"
with pytest.raises(TypeError, match=msg):
data_missing.fillna([1, 1])
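# Illustrative sketch, not part of the original module: the reductions
# _supports_reduction above allows. Intervals are ordered, so min and max
# are well-defined even though numeric reductions are not.
def _demo_interval_min_max():
    import pandas as pd

    ser = pd.Series(IntervalArray.from_tuples([(1, 2), (0, 1), (2, 3)]))
    assert ser.min() == Interval(0, 1)
    assert ser.max() == Interval(2, 3)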

View File

@ -0,0 +1,417 @@
"""
This file contains a minimal set of tests for compliance with the extension
array interface test suite, and should contain no other tests.
The test suite for the full functionality of the array is located in
`pandas/tests/arrays/`.
The tests in this file are inherited from the BaseExtensionTests, and only
minimal tweaks should be applied to get the tests passing (by overwriting a
parent method).
Additional tests should either be added to one of the BaseExtensionTests
classes (if they are relevant for the extension interface for all dtypes), or
be added to the array-specific tests in `pandas/tests/arrays/`.
"""
import warnings
import numpy as np
import pytest
from pandas.compat import (
IS64,
is_platform_windows,
)
from pandas.compat.numpy import np_version_gt2
from pandas.core.dtypes.common import (
is_float_dtype,
is_signed_integer_dtype,
is_unsigned_integer_dtype,
)
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.arrays.floating import (
Float32Dtype,
Float64Dtype,
)
from pandas.core.arrays.integer import (
Int8Dtype,
Int16Dtype,
Int32Dtype,
Int64Dtype,
UInt8Dtype,
UInt16Dtype,
UInt32Dtype,
UInt64Dtype,
)
from pandas.tests.extension import base
is_windows_or_32bit = (is_platform_windows() and not np_version_gt2) or not IS64
pytestmark = [
pytest.mark.filterwarnings(
"ignore:invalid value encountered in divide:RuntimeWarning"
),
pytest.mark.filterwarnings("ignore:Mean of empty slice:RuntimeWarning"),
    # overflow is only relevant for the Floating dtype cases
pytest.mark.filterwarnings("ignore:overflow encountered in reduce:RuntimeWarning"),
]
def make_data():
return list(range(1, 9)) + [pd.NA] + list(range(10, 98)) + [pd.NA] + [99, 100]
def make_float_data():
return (
list(np.arange(0.1, 0.9, 0.1))
+ [pd.NA]
+ list(np.arange(1, 9.8, 0.1))
+ [pd.NA]
+ [9.9, 10.0]
)
def make_bool_data():
return [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False]
@pytest.fixture(
params=[
Int8Dtype,
Int16Dtype,
Int32Dtype,
Int64Dtype,
UInt8Dtype,
UInt16Dtype,
UInt32Dtype,
UInt64Dtype,
Float32Dtype,
Float64Dtype,
BooleanDtype,
]
)
def dtype(request):
return request.param()
@pytest.fixture
def data(dtype):
if dtype.kind == "f":
data = make_float_data()
elif dtype.kind == "b":
data = make_bool_data()
else:
data = make_data()
return pd.array(data, dtype=dtype)
@pytest.fixture
def data_for_twos(dtype):
if dtype.kind == "b":
return pd.array(np.ones(100), dtype=dtype)
return pd.array(np.ones(100) * 2, dtype=dtype)
@pytest.fixture
def data_missing(dtype):
if dtype.kind == "f":
return pd.array([pd.NA, 0.1], dtype=dtype)
elif dtype.kind == "b":
return pd.array([np.nan, True], dtype=dtype)
return pd.array([pd.NA, 1], dtype=dtype)
@pytest.fixture
def data_for_sorting(dtype):
if dtype.kind == "f":
return pd.array([0.1, 0.2, 0.0], dtype=dtype)
elif dtype.kind == "b":
return pd.array([True, True, False], dtype=dtype)
return pd.array([1, 2, 0], dtype=dtype)
@pytest.fixture
def data_missing_for_sorting(dtype):
if dtype.kind == "f":
return pd.array([0.1, pd.NA, 0.0], dtype=dtype)
elif dtype.kind == "b":
return pd.array([True, np.nan, False], dtype=dtype)
return pd.array([1, pd.NA, 0], dtype=dtype)
@pytest.fixture
def na_cmp():
    # our NA value is pd.NA
return lambda x, y: x is pd.NA and y is pd.NA
@pytest.fixture
def data_for_grouping(dtype):
if dtype.kind == "f":
b = 0.1
a = 0.0
c = 0.2
elif dtype.kind == "b":
b = True
a = False
c = b
else:
b = 1
a = 0
c = 2
na = pd.NA
return pd.array([b, b, na, na, a, a, b, c], dtype=dtype)
class TestMaskedArrays(base.ExtensionTests):
@pytest.mark.parametrize("na_action", [None, "ignore"])
def test_map(self, data_missing, na_action):
result = data_missing.map(lambda x: x, na_action=na_action)
if data_missing.dtype == Float32Dtype():
# map roundtrips through objects, which converts to float64
expected = data_missing.to_numpy(dtype="float64", na_value=np.nan)
else:
expected = data_missing.to_numpy()
tm.assert_numpy_array_equal(result, expected)
def test_map_na_action_ignore(self, data_missing_for_sorting):
zero = data_missing_for_sorting[2]
result = data_missing_for_sorting.map(lambda x: zero, na_action="ignore")
if data_missing_for_sorting.dtype.kind == "b":
expected = np.array([False, pd.NA, False], dtype=object)
else:
expected = np.array([zero, np.nan, zero])
tm.assert_numpy_array_equal(result, expected)
def _get_expected_exception(self, op_name, obj, other):
try:
dtype = tm.get_dtype(obj)
except AttributeError:
# passed arguments reversed
dtype = tm.get_dtype(other)
if dtype.kind == "b":
if op_name.strip("_").lstrip("r") in ["pow", "truediv", "floordiv"]:
# match behavior with non-masked bool dtype
return NotImplementedError
elif op_name in ["__sub__", "__rsub__"]:
                # exception message would include "numpy boolean subtract"
return TypeError
return None
return None
def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
sdtype = tm.get_dtype(obj)
expected = pointwise_result
if op_name in ("eq", "ne", "le", "ge", "lt", "gt"):
return expected.astype("boolean")
if sdtype.kind in "iu":
if op_name in ("__rtruediv__", "__truediv__", "__div__"):
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
"Downcasting object dtype arrays",
category=FutureWarning,
)
filled = expected.fillna(np.nan)
expected = filled.astype("Float64")
else:
# combine method result in 'biggest' (int64) dtype
expected = expected.astype(sdtype)
elif sdtype.kind == "b":
if op_name in (
"__floordiv__",
"__rfloordiv__",
"__pow__",
"__rpow__",
"__mod__",
"__rmod__",
):
# combine keeps boolean type
expected = expected.astype("Int8")
elif op_name in ("__truediv__", "__rtruediv__"):
# combine with bools does not generate the correct result
# (numpy behaviour for div is to regard the bools as numeric)
op = self.get_op_from_name(op_name)
expected = self._combine(obj.astype(float), other, op)
expected = expected.astype("Float64")
if op_name == "__rpow__":
# for rpow, combine does not propagate NaN
result = getattr(obj, op_name)(other)
expected[result.isna()] = np.nan
else:
# combine method result in 'biggest' (float64) dtype
expected = expected.astype(sdtype)
return expected
def test_divmod_series_array(self, data, data_for_twos, request):
if data.dtype.kind == "b":
mark = pytest.mark.xfail(
reason="Inconsistency between floordiv and divmod; we raise for "
"floordiv but not for divmod. This matches what we do for "
"non-masked bool dtype."
)
request.applymarker(mark)
super().test_divmod_series_array(data, data_for_twos)
def test_combine_le(self, data_repeated):
# TODO: patching self is a bad pattern here
orig_data1, orig_data2 = data_repeated(2)
if orig_data1.dtype.kind == "b":
self._combine_le_expected_dtype = "boolean"
else:
# TODO: can we make this boolean?
self._combine_le_expected_dtype = object
super().test_combine_le(data_repeated)
def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
if op_name in ["any", "all"] and ser.dtype.kind != "b":
pytest.skip(reason="Tested in tests/reductions/test_reductions.py")
return True
def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
# overwrite to ensure pd.NA is tested instead of np.nan
# https://github.com/pandas-dev/pandas/issues/30958
cmp_dtype = "int64"
if ser.dtype.kind == "f":
# Item "dtype[Any]" of "Union[dtype[Any], ExtensionDtype]" has
# no attribute "numpy_dtype"
cmp_dtype = ser.dtype.numpy_dtype # type: ignore[union-attr]
elif ser.dtype.kind == "b":
if op_name in ["min", "max"]:
cmp_dtype = "bool"
# TODO: prod with integer dtypes does *not* match the result we would
        # get if we used object for cmp_dtype. In that case the object result
# is a large integer while the non-object case overflows and returns 0
alt = ser.dropna().astype(cmp_dtype)
if op_name == "count":
result = getattr(ser, op_name)()
expected = getattr(alt, op_name)()
else:
result = getattr(ser, op_name)(skipna=skipna)
expected = getattr(alt, op_name)(skipna=skipna)
if not skipna and ser.isna().any() and op_name not in ["any", "all"]:
expected = pd.NA
tm.assert_almost_equal(result, expected)
def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
if is_float_dtype(arr.dtype):
cmp_dtype = arr.dtype.name
elif op_name in ["mean", "median", "var", "std", "skew"]:
cmp_dtype = "Float64"
elif op_name in ["max", "min"]:
cmp_dtype = arr.dtype.name
elif arr.dtype in ["Int64", "UInt64"]:
cmp_dtype = arr.dtype.name
elif is_signed_integer_dtype(arr.dtype):
            # TODO: Why does the Windows NumPy 2.0 dtype depend on skipna?
cmp_dtype = (
"Int32"
if (is_platform_windows() and (not np_version_gt2 or not skipna))
or not IS64
else "Int64"
)
elif is_unsigned_integer_dtype(arr.dtype):
cmp_dtype = (
"UInt32"
if (is_platform_windows() and (not np_version_gt2 or not skipna))
or not IS64
else "UInt64"
)
elif arr.dtype.kind == "b":
if op_name in ["mean", "median", "var", "std", "skew"]:
cmp_dtype = "Float64"
elif op_name in ["min", "max"]:
cmp_dtype = "boolean"
elif op_name in ["sum", "prod"]:
cmp_dtype = (
"Int32"
if (is_platform_windows() and (not np_version_gt2 or not skipna))
or not IS64
else "Int64"
)
else:
raise TypeError("not supposed to reach this")
else:
raise TypeError("not supposed to reach this")
return cmp_dtype
def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
return True
def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool):
# overwrite to ensure pd.NA is tested instead of np.nan
# https://github.com/pandas-dev/pandas/issues/30958
length = 64
if is_windows_or_32bit:
# Item "ExtensionDtype" of "Union[dtype[Any], ExtensionDtype]" has
# no attribute "itemsize"
if not ser.dtype.itemsize == 8: # type: ignore[union-attr]
length = 32
if ser.dtype.name.startswith("U"):
expected_dtype = f"UInt{length}"
elif ser.dtype.name.startswith("I"):
expected_dtype = f"Int{length}"
elif ser.dtype.name.startswith("F"):
# Incompatible types in assignment (expression has type
# "Union[dtype[Any], ExtensionDtype]", variable has type "str")
expected_dtype = ser.dtype # type: ignore[assignment]
elif ser.dtype.kind == "b":
if op_name in ("cummin", "cummax"):
expected_dtype = "boolean"
else:
expected_dtype = f"Int{length}"
if expected_dtype == "Float32" and op_name == "cumprod" and skipna:
# TODO: xfail?
pytest.skip(
f"Float32 precision lead to large differences with op {op_name} "
f"and skipna={skipna}"
)
if op_name == "cumsum":
result = getattr(ser, op_name)(skipna=skipna)
expected = pd.Series(
pd.array(
getattr(ser.astype("float64"), op_name)(skipna=skipna),
dtype=expected_dtype,
)
)
tm.assert_series_equal(result, expected)
elif op_name in ["cummax", "cummin"]:
result = getattr(ser, op_name)(skipna=skipna)
expected = pd.Series(
pd.array(
getattr(ser.astype("float64"), op_name)(skipna=skipna),
dtype=ser.dtype,
)
)
tm.assert_series_equal(result, expected)
elif op_name == "cumprod":
result = getattr(ser[:12], op_name)(skipna=skipna)
expected = pd.Series(
pd.array(
getattr(ser[:12].astype("float64"), op_name)(skipna=skipna),
dtype=expected_dtype,
)
)
tm.assert_series_equal(result, expected)
else:
raise NotImplementedError(f"{op_name} not supported")
class Test2DCompat(base.Dim2CompatTests):
pass
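# Illustrative sketch, not part of the original module: the
# platform-dependent result dtypes _get_expected_reduction_dtype above
# encodes. Summing a small masked integer dtype in a DataFrame reduction
# upcasts to Int64 on 64-bit platforms (Int32 on Windows/32-bit builds).
def _demo_masked_sum_dtype():
    df = pd.DataFrame({"a": pd.array([1, 2, pd.NA], dtype="Int8")})
    result = df.sum()
    assert str(result.dtype) in ("Int32", "Int64")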

View File

@ -0,0 +1,426 @@
"""
This file contains a minimal set of tests for compliance with the extension
array interface test suite, and should contain no other tests.
The test suite for the full functionality of the array is located in
`pandas/tests/arrays/`.
The tests in this file are inherited from the BaseExtensionTests, and only
minimal tweaks should be applied to get the tests passing (by overwriting a
parent method).
Additional tests should either be added to one of the BaseExtensionTests
classes (if they are relevant for the extension interface for all dtypes), or
be added to the array-specific tests in `pandas/tests/arrays/`.
Note: we do not bother with base.BaseIndexTests because NumpyExtensionArray
will never be held in an Index.
"""
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import NumpyEADtype
import pandas as pd
import pandas._testing as tm
from pandas.api.types import is_object_dtype
from pandas.core.arrays.numpy_ import NumpyExtensionArray
from pandas.tests.extension import base
orig_assert_attr_equal = tm.assert_attr_equal
def _assert_attr_equal(attr: str, left, right, obj: str = "Attributes"):
"""
    patch tm.assert_attr_equal so NumpyEADtype("object") is close enough to
np.dtype("object")
"""
if attr == "dtype":
lattr = getattr(left, "dtype", None)
rattr = getattr(right, "dtype", None)
if isinstance(lattr, NumpyEADtype) and not isinstance(rattr, NumpyEADtype):
left = left.astype(lattr.numpy_dtype)
elif isinstance(rattr, NumpyEADtype) and not isinstance(lattr, NumpyEADtype):
right = right.astype(rattr.numpy_dtype)
orig_assert_attr_equal(attr, left, right, obj)
@pytest.fixture(params=["float", "object"])
def dtype(request):
return NumpyEADtype(np.dtype(request.param))
@pytest.fixture
def allow_in_pandas(monkeypatch):
"""
    A monkeypatch that tells pandas to let us in.
By default, passing a NumpyExtensionArray to an index / series / frame
constructor will unbox that NumpyExtensionArray to an ndarray, and treat
it as a non-EA column. We don't want people using EAs without
reason.
The mechanism for this is a check against ABCNumpyExtensionArray
in each constructor.
But, for testing, we need to allow them in pandas. So we patch
the _typ of NumpyExtensionArray, so that we evade the ABCNumpyExtensionArray
check.
"""
with monkeypatch.context() as m:
m.setattr(NumpyExtensionArray, "_typ", "extension")
m.setattr(tm.asserters, "assert_attr_equal", _assert_attr_equal)
yield
@pytest.fixture
def data(allow_in_pandas, dtype):
if dtype.numpy_dtype == "object":
return pd.Series([(i,) for i in range(100)]).array
return NumpyExtensionArray(np.arange(1, 101, dtype=dtype._dtype))
@pytest.fixture
def data_missing(allow_in_pandas, dtype):
if dtype.numpy_dtype == "object":
return NumpyExtensionArray(np.array([np.nan, (1,)], dtype=object))
return NumpyExtensionArray(np.array([np.nan, 1.0]))
@pytest.fixture
def na_cmp():
def cmp(a, b):
return np.isnan(a) and np.isnan(b)
return cmp
@pytest.fixture
def data_for_sorting(allow_in_pandas, dtype):
"""Length-3 array with a known sort order.
This should be three items [B, C, A] with
A < B < C
"""
if dtype.numpy_dtype == "object":
# Use an empty tuple for first element, then remove,
# to disable np.array's shape inference.
return NumpyExtensionArray(np.array([(), (2,), (3,), (1,)], dtype=object)[1:])
return NumpyExtensionArray(np.array([1, 2, 0]))
@pytest.fixture
def data_missing_for_sorting(allow_in_pandas, dtype):
"""Length-3 array with a known sort order.
This should be three items [B, NA, A] with
A < B and NA missing.
"""
if dtype.numpy_dtype == "object":
return NumpyExtensionArray(np.array([(1,), np.nan, (0,)], dtype=object))
return NumpyExtensionArray(np.array([1, np.nan, 0]))
@pytest.fixture
def data_for_grouping(allow_in_pandas, dtype):
"""Data for factorization, grouping, and unique tests.
Expected to be like [B, B, NA, NA, A, A, B, C]
Where A < B < C and NA is missing
"""
if dtype.numpy_dtype == "object":
a, b, c = (1,), (2,), (3,)
else:
a, b, c = np.arange(3)
return NumpyExtensionArray(
np.array([b, b, np.nan, np.nan, a, a, b, c], dtype=dtype.numpy_dtype)
)
@pytest.fixture
def data_for_twos(dtype):
if dtype.kind == "O":
pytest.skip(f"{dtype} is not a numeric dtype")
arr = np.ones(100) * 2
return NumpyExtensionArray._from_sequence(arr, dtype=dtype)
@pytest.fixture
def skip_numpy_object(dtype, request):
"""
Tests for NumpyExtensionArray with nested data. Users typically won't create
these objects via `pd.array`, but they can show up through `.array`
on a Series with nested data. Many of the base tests fail, as they aren't
appropriate for nested data.
This fixture allows these tests to be skipped when used as a usefixtures
marker to either an individual test or a test class.
"""
if dtype == "object":
mark = pytest.mark.xfail(reason="Fails for object dtype")
request.applymarker(mark)
skip_nested = pytest.mark.usefixtures("skip_numpy_object")
class TestNumpyExtensionArray(base.ExtensionTests):
@pytest.mark.skip(reason="We don't register our dtype")
# We don't want to register. This test should probably be split in two.
def test_from_dtype(self, data):
pass
@skip_nested
def test_series_constructor_scalar_with_index(self, data, dtype):
# ValueError: Length of passed values is 1, index implies 3.
super().test_series_constructor_scalar_with_index(data, dtype)
def test_check_dtype(self, data, request, using_infer_string):
if data.dtype.numpy_dtype == "object":
request.applymarker(
pytest.mark.xfail(
reason=f"NumpyExtensionArray expectedly clashes with a "
f"NumPy name: {data.dtype.numpy_dtype}"
)
)
super().test_check_dtype(data)
def test_is_not_object_type(self, dtype, request):
if dtype.numpy_dtype == "object":
# Different from BaseDtypeTests.test_is_not_object_type
# because NumpyEADtype(object) is an object type
assert is_object_dtype(dtype)
else:
super().test_is_not_object_type(dtype)
@skip_nested
def test_getitem_scalar(self, data):
# AssertionError
super().test_getitem_scalar(data)
@skip_nested
def test_shift_fill_value(self, data):
# np.array shape inference. Shift implementation fails.
super().test_shift_fill_value(data)
@skip_nested
def test_fillna_copy_frame(self, data_missing):
# The "scalar" for this array isn't a scalar.
super().test_fillna_copy_frame(data_missing)
@skip_nested
def test_fillna_copy_series(self, data_missing):
# The "scalar" for this array isn't a scalar.
super().test_fillna_copy_series(data_missing)
@skip_nested
def test_searchsorted(self, data_for_sorting, as_series):
# TODO: NumpyExtensionArray.searchsorted calls ndarray.searchsorted which
# isn't quite what we want in nested data cases. Instead we need to
# adapt something like libindex._bin_search.
super().test_searchsorted(data_for_sorting, as_series)
@pytest.mark.xfail(reason="NumpyExtensionArray.diff may fail on dtype")
def test_diff(self, data, periods):
return super().test_diff(data, periods)
def test_insert(self, data, request):
if data.dtype.numpy_dtype == object:
mark = pytest.mark.xfail(reason="Dimension mismatch in np.concatenate")
request.applymarker(mark)
super().test_insert(data)
@skip_nested
def test_insert_invalid(self, data, invalid_scalar):
# NumpyExtensionArray[object] can hold anything, so skip
super().test_insert_invalid(data, invalid_scalar)
divmod_exc = None
series_scalar_exc = None
frame_scalar_exc = None
series_array_exc = None
def test_divmod(self, data):
divmod_exc = None
if data.dtype.kind == "O":
divmod_exc = TypeError
self.divmod_exc = divmod_exc
super().test_divmod(data)
def test_divmod_series_array(self, data):
ser = pd.Series(data)
exc = None
if data.dtype.kind == "O":
exc = TypeError
self.divmod_exc = exc
self._check_divmod_op(ser, divmod, data)
def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request):
opname = all_arithmetic_operators
series_scalar_exc = None
if data.dtype.numpy_dtype == object:
if opname in ["__mul__", "__rmul__"]:
mark = pytest.mark.xfail(
reason="the Series.combine step raises but not the Series method."
)
request.node.add_marker(mark)
series_scalar_exc = TypeError
self.series_scalar_exc = series_scalar_exc
super().test_arith_series_with_scalar(data, all_arithmetic_operators)
def test_arith_series_with_array(self, data, all_arithmetic_operators):
opname = all_arithmetic_operators
series_array_exc = None
if data.dtype.numpy_dtype == object and opname not in ["__add__", "__radd__"]:
series_array_exc = TypeError
self.series_array_exc = series_array_exc
super().test_arith_series_with_array(data, all_arithmetic_operators)
def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
opname = all_arithmetic_operators
frame_scalar_exc = None
if data.dtype.numpy_dtype == object:
if opname in ["__mul__", "__rmul__"]:
mark = pytest.mark.xfail(
reason="the Series.combine step raises but not the Series method."
)
request.node.add_marker(mark)
frame_scalar_exc = TypeError
self.frame_scalar_exc = frame_scalar_exc
super().test_arith_frame_with_scalar(data, all_arithmetic_operators)
def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
if ser.dtype.kind == "O":
return op_name in ["sum", "min", "max", "any", "all"]
return True
def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
res_op = getattr(ser, op_name)
# avoid coercing int -> float. Just cast to the actual numpy type.
# error: Item "ExtensionDtype" of "dtype[Any] | ExtensionDtype" has
# no attribute "numpy_dtype"
cmp_dtype = ser.dtype.numpy_dtype # type: ignore[union-attr]
alt = ser.astype(cmp_dtype)
exp_op = getattr(alt, op_name)
if op_name == "count":
result = res_op()
expected = exp_op()
else:
result = res_op(skipna=skipna)
expected = exp_op(skipna=skipna)
tm.assert_almost_equal(result, expected)
@pytest.mark.skip("TODO: tests not written yet")
@pytest.mark.parametrize("skipna", [True, False])
def test_reduce_frame(self, data, all_numeric_reductions, skipna):
pass
@skip_nested
def test_fillna_series(self, data_missing):
# Non-scalar "scalar" values.
super().test_fillna_series(data_missing)
@skip_nested
def test_fillna_frame(self, data_missing):
# Non-scalar "scalar" values.
super().test_fillna_frame(data_missing)
@skip_nested
def test_setitem_invalid(self, data, invalid_scalar):
# object dtype can hold anything, so doesn't raise
super().test_setitem_invalid(data, invalid_scalar)
@skip_nested
def test_setitem_sequence_broadcasts(self, data, box_in_series):
# ValueError: cannot set using a list-like indexer with a different
# length than the value
super().test_setitem_sequence_broadcasts(data, box_in_series)
@skip_nested
@pytest.mark.parametrize("setter", ["loc", None])
def test_setitem_mask_broadcast(self, data, setter):
# ValueError: cannot set using a list-like indexer with a different
# length than the value
super().test_setitem_mask_broadcast(data, setter)
@skip_nested
def test_setitem_scalar_key_sequence_raise(self, data):
# Failed: DID NOT RAISE <class 'ValueError'>
super().test_setitem_scalar_key_sequence_raise(data)
# TODO: there is some issue with NumpyExtensionArray, therefore,
# skip the setitem test for now, and fix it later (GH 31446)
@skip_nested
@pytest.mark.parametrize(
"mask",
[
np.array([True, True, True, False, False]),
pd.array([True, True, True, False, False], dtype="boolean"),
],
ids=["numpy-array", "boolean-array"],
)
def test_setitem_mask(self, data, mask, box_in_series):
super().test_setitem_mask(data, mask, box_in_series)
@skip_nested
@pytest.mark.parametrize(
"idx",
[[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
ids=["list", "integer-array", "numpy-array"],
)
def test_setitem_integer_array(self, data, idx, box_in_series):
super().test_setitem_integer_array(data, idx, box_in_series)
@pytest.mark.parametrize(
"idx, box_in_series",
[
([0, 1, 2, pd.NA], False),
pytest.param([0, 1, 2, pd.NA], True, marks=pytest.mark.xfail),
(pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
            (pd.array([0, 1, 2, pd.NA], dtype="Int64"), True),
],
ids=["list-False", "list-True", "integer-array-False", "integer-array-True"],
)
def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series):
super().test_setitem_integer_with_missing_raises(data, idx, box_in_series)
@skip_nested
def test_setitem_slice(self, data, box_in_series):
super().test_setitem_slice(data, box_in_series)
@skip_nested
def test_setitem_loc_iloc_slice(self, data):
super().test_setitem_loc_iloc_slice(data)
def test_setitem_with_expansion_dataframe_column(self, data, full_indexer):
# https://github.com/pandas-dev/pandas/issues/32395
df = expected = pd.DataFrame({"data": pd.Series(data)})
result = pd.DataFrame(index=df.index)
# because result has object dtype, the attempt to do setting inplace
# is successful, and object dtype is retained
key = full_indexer(df)
result.loc[key, "data"] = df["data"]
# base class method has expected = df; NumpyExtensionArray behaves oddly because
# we patch _typ for these tests.
if data.dtype.numpy_dtype != object:
if not isinstance(key, slice) or key != slice(None):
expected = pd.DataFrame({"data": data.to_numpy()})
tm.assert_frame_equal(result, expected, check_column_type=False)
@pytest.mark.xfail(reason="NumpyEADtype is unpacked")
def test_index_from_listlike_with_dtype(self, data):
super().test_index_from_listlike_with_dtype(data)
@skip_nested
@pytest.mark.parametrize("engine", ["c", "python"])
def test_EA_types(self, engine, data, request):
super().test_EA_types(engine, data, request)
class Test2DCompat(base.NDArrayBacked2DTests):
pass
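# Illustrative sketch, not part of the original module: the "unboxing"
# the allow_in_pandas fixture above has to defeat. Without the _typ
# patch, handing a NumpyExtensionArray to a Series constructor yields a
# plain numpy-backed column.
def _demo_unboxing():
    arr = NumpyExtensionArray(np.arange(3.0))
    ser = pd.Series(arr)
    assert ser.dtype == np.dtype("float64")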

View File

@ -0,0 +1,119 @@
"""
This file contains a minimal set of tests for compliance with the extension
array interface test suite, and should contain no other tests.
The test suite for the full functionality of the array is located in
`pandas/tests/arrays/`.
The tests in this file are inherited from the BaseExtensionTests, and only
minimal tweaks should be applied to get the tests passing (by overwriting a
parent method).
Additional tests should either be added to one of the BaseExtensionTests
classes (if they are relevant for the extension interface for all dtypes), or
be added to the array-specific tests in `pandas/tests/arrays/`.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
import numpy as np
import pytest
from pandas._libs import (
Period,
iNaT,
)
from pandas.compat import is_platform_windows
from pandas.compat.numpy import np_version_gte1p24
from pandas.core.dtypes.dtypes import PeriodDtype
import pandas._testing as tm
from pandas.core.arrays import PeriodArray
from pandas.tests.extension import base
if TYPE_CHECKING:
import pandas as pd
@pytest.fixture(params=["D", "2D"])
def dtype(request):
return PeriodDtype(freq=request.param)
@pytest.fixture
def data(dtype):
return PeriodArray(np.arange(1970, 2070), dtype=dtype)
@pytest.fixture
def data_for_sorting(dtype):
return PeriodArray([2018, 2019, 2017], dtype=dtype)
@pytest.fixture
def data_missing(dtype):
return PeriodArray([iNaT, 2017], dtype=dtype)
@pytest.fixture
def data_missing_for_sorting(dtype):
return PeriodArray([2018, iNaT, 2017], dtype=dtype)
@pytest.fixture
def data_for_grouping(dtype):
B = 2018
NA = iNaT
A = 2017
C = 2019
return PeriodArray([B, B, NA, NA, A, A, B, C], dtype=dtype)
class TestPeriodArray(base.ExtensionTests):
def _get_expected_exception(self, op_name, obj, other):
if op_name in ("__sub__", "__rsub__"):
return None
return super()._get_expected_exception(op_name, obj, other)
def _supports_accumulation(self, ser, op_name: str) -> bool:
return op_name in ["cummin", "cummax"]
def _supports_reduction(self, obj, op_name: str) -> bool:
return op_name in ["min", "max", "median"]
def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
if op_name == "median":
res_op = getattr(ser, op_name)
alt = ser.astype("int64")
exp_op = getattr(alt, op_name)
result = res_op(skipna=skipna)
expected = exp_op(skipna=skipna)
# error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has no
# attribute "freq"
freq = ser.dtype.freq # type: ignore[union-attr]
expected = Period._from_ordinal(int(expected), freq=freq)
tm.assert_almost_equal(result, expected)
else:
return super().check_reduce(ser, op_name, skipna)
@pytest.mark.parametrize("periods", [1, -2])
def test_diff(self, data, periods):
if is_platform_windows() and np_version_gte1p24:
with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False):
super().test_diff(data, periods)
else:
super().test_diff(data, periods)
@pytest.mark.parametrize("na_action", [None, "ignore"])
def test_map(self, data, na_action):
result = data.map(lambda x: x, na_action=na_action)
tm.assert_extension_array_equal(result, data)
class Test2DCompat(base.NDArrayBacked2DTests):
pass
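# Illustrative sketch, not part of the original module: the median path
# check_reduce above verifies. The ordinal-based median is wrapped back
# into a Period with the series' freq.
def _demo_period_median():
    import pandas as pd

    arr = PeriodArray(np.arange(5), dtype=PeriodDtype(freq="D"))
    result = pd.Series(arr).median()
    assert isinstance(result, pd.Period)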

View File

@ -0,0 +1,503 @@
"""
This file contains a minimal set of tests for compliance with the extension
array interface test suite, and should contain no other tests.
The test suite for the full functionality of the array is located in
`pandas/tests/arrays/`.
The tests in this file are inherited from the BaseExtensionTests, and only
minimal tweaks should be applied to get the tests passing (by overwriting a
parent method).
Additional tests should either be added to one of the BaseExtensionTests
classes (if they are relevant for the extension interface for all dtypes), or
be added to the array-specific tests in `pandas/tests/arrays/`.
"""
import numpy as np
import pytest
from pandas.errors import PerformanceWarning
import pandas as pd
from pandas import SparseDtype
import pandas._testing as tm
from pandas.arrays import SparseArray
from pandas.tests.extension import base
def make_data(fill_value):
rng = np.random.default_rng(2)
if np.isnan(fill_value):
data = rng.uniform(size=100)
else:
data = rng.integers(1, 100, size=100, dtype=int)
if data[0] == data[1]:
data[0] += 1
data[2::3] = fill_value
return data
@pytest.fixture
def dtype():
return SparseDtype()
@pytest.fixture(params=[0, np.nan])
def data(request):
"""Length-100 PeriodArray for semantics test."""
res = SparseArray(make_data(request.param), fill_value=request.param)
return res
@pytest.fixture
def data_for_twos():
return SparseArray(np.ones(100) * 2)
@pytest.fixture(params=[0, np.nan])
def data_missing(request):
"""Length 2 array with [NA, Valid]"""
return SparseArray([np.nan, 1], fill_value=request.param)
@pytest.fixture(params=[0, np.nan])
def data_repeated(request):
"""Return different versions of data for count times"""
def gen(count):
for _ in range(count):
yield SparseArray(make_data(request.param), fill_value=request.param)
yield gen
@pytest.fixture(params=[0, np.nan])
def data_for_sorting(request):
return SparseArray([2, 3, 1], fill_value=request.param)
@pytest.fixture(params=[0, np.nan])
def data_missing_for_sorting(request):
return SparseArray([2, np.nan, 1], fill_value=request.param)
@pytest.fixture
def na_cmp():
return lambda left, right: pd.isna(left) and pd.isna(right)
@pytest.fixture(params=[0, np.nan])
def data_for_grouping(request):
return SparseArray([1, 1, np.nan, np.nan, 2, 2, 1, 3], fill_value=request.param)
@pytest.fixture(params=[0, np.nan])
def data_for_compare(request):
return SparseArray([0, 0, np.nan, -2, -1, 4, 2, 3, 0, 0], fill_value=request.param)
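# Illustrative sketch, not part of the original module: what varying
# fill_value in the fixtures above means physically. Only values that
# differ from fill_value are stored.
def _demo_sparse_storage():
    arr = SparseArray([0, 0, 1, 2], fill_value=0)
    assert arr.npoints == 2  # two non-fill values physically stored
    assert arr.density == 0.5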
class TestSparseArray(base.ExtensionTests):
def _supports_reduction(self, obj, op_name: str) -> bool:
return True
@pytest.mark.parametrize("skipna", [True, False])
def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, request):
if all_numeric_reductions in [
"prod",
"median",
"var",
"std",
"sem",
"skew",
"kurt",
]:
mark = pytest.mark.xfail(
reason="This should be viable but is not implemented"
)
request.node.add_marker(mark)
elif (
all_numeric_reductions in ["sum", "max", "min", "mean"]
and data.dtype.kind == "f"
and not skipna
):
mark = pytest.mark.xfail(reason="getting a non-nan float")
request.node.add_marker(mark)
super().test_reduce_series_numeric(data, all_numeric_reductions, skipna)
@pytest.mark.parametrize("skipna", [True, False])
def test_reduce_frame(self, data, all_numeric_reductions, skipna, request):
if all_numeric_reductions in [
"prod",
"median",
"var",
"std",
"sem",
"skew",
"kurt",
]:
mark = pytest.mark.xfail(
reason="This should be viable but is not implemented"
)
request.node.add_marker(mark)
elif (
all_numeric_reductions in ["sum", "max", "min", "mean"]
and data.dtype.kind == "f"
and not skipna
):
mark = pytest.mark.xfail(reason="ExtensionArray NA mask are different")
request.node.add_marker(mark)
super().test_reduce_frame(data, all_numeric_reductions, skipna)
def _check_unsupported(self, data):
if data.dtype == SparseDtype(int, 0):
pytest.skip("Can't store nan in int array.")
def test_concat_mixed_dtypes(self, data):
# https://github.com/pandas-dev/pandas/issues/20762
# This should be the same, aside from concat([sparse, float])
df1 = pd.DataFrame({"A": data[:3]})
df2 = pd.DataFrame({"A": [1, 2, 3]})
df3 = pd.DataFrame({"A": ["a", "b", "c"]}).astype("category")
dfs = [df1, df2, df3]
# dataframes
result = pd.concat(dfs)
expected = pd.concat(
[x.apply(lambda s: np.asarray(s).astype(object)) for x in dfs]
)
tm.assert_frame_equal(result, expected)
@pytest.mark.filterwarnings(
"ignore:The previous implementation of stack is deprecated"
)
@pytest.mark.parametrize(
"columns",
[
["A", "B"],
pd.MultiIndex.from_tuples(
[("A", "a"), ("A", "b")], names=["outer", "inner"]
),
],
)
@pytest.mark.parametrize("future_stack", [True, False])
def test_stack(self, data, columns, future_stack):
super().test_stack(data, columns, future_stack)
def test_concat_columns(self, data, na_value):
self._check_unsupported(data)
super().test_concat_columns(data, na_value)
def test_concat_extension_arrays_copy_false(self, data, na_value):
self._check_unsupported(data)
super().test_concat_extension_arrays_copy_false(data, na_value)
def test_align(self, data, na_value):
self._check_unsupported(data)
super().test_align(data, na_value)
def test_align_frame(self, data, na_value):
self._check_unsupported(data)
super().test_align_frame(data, na_value)
def test_align_series_frame(self, data, na_value):
self._check_unsupported(data)
super().test_align_series_frame(data, na_value)
def test_merge(self, data, na_value):
self._check_unsupported(data)
super().test_merge(data, na_value)
def test_get(self, data):
ser = pd.Series(data, index=[2 * i for i in range(len(data))])
if np.isnan(ser.values.fill_value):
assert np.isnan(ser.get(4)) and np.isnan(ser.iloc[2])
else:
assert ser.get(4) == ser.iloc[2]
assert ser.get(2) == ser.iloc[1]
def test_reindex(self, data, na_value):
self._check_unsupported(data)
super().test_reindex(data, na_value)
def test_isna(self, data_missing):
sarr = SparseArray(data_missing)
expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value))
expected = SparseArray([True, False], dtype=expected_dtype)
result = sarr.isna()
tm.assert_sp_array_equal(result, expected)
# test isna for arr without na
sarr = sarr.fillna(0)
expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value))
expected = SparseArray([False, False], fill_value=False, dtype=expected_dtype)
tm.assert_equal(sarr.isna(), expected)
def test_fillna_limit_backfill(self, data_missing):
warns = (PerformanceWarning, FutureWarning)
with tm.assert_produces_warning(warns, check_stacklevel=False):
super().test_fillna_limit_backfill(data_missing)
def test_fillna_no_op_returns_copy(self, data, request):
if np.isnan(data.fill_value):
request.applymarker(
pytest.mark.xfail(reason="returns array with different fill value")
)
super().test_fillna_no_op_returns_copy(data)
@pytest.mark.xfail(reason="Unsupported")
def test_fillna_series(self, data_missing):
# This one looks doable.
# TODO: this fails because we do not pass data_missing through to the base
# test; if we did, the 0-fill case would xpass.
super().test_fillna_series()
def test_fillna_frame(self, data_missing):
# Have to override to specify that fill_value will change.
fill_value = data_missing[1]
result = pd.DataFrame({"A": data_missing, "B": [1, 2]}).fillna(fill_value)
if pd.isna(data_missing.fill_value):
dtype = SparseDtype(data_missing.dtype, fill_value)
else:
dtype = data_missing.dtype
expected = pd.DataFrame(
{
"A": data_missing._from_sequence([fill_value, fill_value], dtype=dtype),
"B": [1, 2],
}
)
tm.assert_frame_equal(result, expected)
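# Illustrative restatement (values assumed for the sketch) of the
# fill_value change noted above: filling a NaN-filled sparse column
# promotes the dtype's fill_value to the filled value, e.g.
#
#     ser = pd.Series(SparseArray([np.nan, 1.0]))
#     ser.fillna(1.0).dtype  # -> Sparse[float64, 1.0], no longer nan-filled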
_combine_le_expected_dtype = "Sparse[bool]"
def test_fillna_copy_frame(self, data_missing, using_copy_on_write):
arr = data_missing.take([1, 1])
df = pd.DataFrame({"A": arr}, copy=False)
filled_val = df.iloc[0, 0]
result = df.fillna(filled_val)
if hasattr(df._mgr, "blocks"):
if using_copy_on_write:
assert df.values.base is result.values.base
else:
assert df.values.base is not result.values.base
assert df.A._values.to_dense() is arr.to_dense()
def test_fillna_copy_series(self, data_missing, using_copy_on_write):
arr = data_missing.take([1, 1])
ser = pd.Series(arr, copy=False)
filled_val = ser[0]
result = ser.fillna(filled_val)
if using_copy_on_write:
assert ser._values is result._values
else:
assert ser._values is not result._values
assert ser._values.to_dense() is arr.to_dense()
@pytest.mark.xfail(reason="Not Applicable")
def test_fillna_length_mismatch(self, data_missing):
super().test_fillna_length_mismatch(data_missing)
def test_where_series(self, data, na_value):
assert data[0] != data[1]
cls = type(data)
a, b = data[:2]
ser = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype))
cond = np.array([True, True, False, False])
result = ser.where(cond)
new_dtype = SparseDtype("float", 0.0)
expected = pd.Series(
cls._from_sequence([a, a, na_value, na_value], dtype=new_dtype)
)
tm.assert_series_equal(result, expected)
other = cls._from_sequence([a, b, a, b], dtype=data.dtype)
cond = np.array([True, False, True, True])
result = ser.where(cond, other)
expected = pd.Series(cls._from_sequence([a, b, b, b], dtype=data.dtype))
tm.assert_series_equal(result, expected)
def test_searchsorted(self, data_for_sorting, as_series):
with tm.assert_produces_warning(PerformanceWarning, check_stacklevel=False):
super().test_searchsorted(data_for_sorting, as_series)
def test_shift_0_periods(self, data):
# GH#33856 shifting with periods=0 should return a copy, not the same object
result = data.shift(0)
data._sparse_values[0] = data._sparse_values[1]
assert result._sparse_values[0] != result._sparse_values[1]
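# Standalone restatement of GH#33856 (illustrative, not part of the
# suite): shift(0) must hand back a copy, so later mutation of the
# original cannot leak into the result:
#
#     arr = SparseArray([1, 2])
#     shifted = arr.shift(0)
#     assert shifted is not arr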
@pytest.mark.parametrize("method", ["argmax", "argmin"])
def test_argmin_argmax_all_na(self, method, data, na_value):
# overriding because Sparse[int64, 0] cannot handle na_value
self._check_unsupported(data)
super().test_argmin_argmax_all_na(method, data, na_value)
@pytest.mark.fails_arm_wheels
@pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame])
def test_equals(self, data, na_value, as_series, box):
self._check_unsupported(data)
super().test_equals(data, na_value, as_series, box)
@pytest.mark.fails_arm_wheels
def test_equals_same_data_different_object(self, data):
super().test_equals_same_data_different_object(data)
@pytest.mark.parametrize(
"func, na_action, expected",
[
(lambda x: x, None, SparseArray([1.0, np.nan])),
(lambda x: x, "ignore", SparseArray([1.0, np.nan])),
(str, None, SparseArray(["1.0", "nan"], fill_value="nan")),
(str, "ignore", SparseArray(["1.0", np.nan])),
],
)
def test_map(self, func, na_action, expected):
# GH52096
data = SparseArray([1, np.nan])
result = data.map(func, na_action=na_action)
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("na_action", [None, "ignore"])
def test_map_raises(self, data, na_action):
# GH52096
msg = "fill value in the sparse values not supported"
with pytest.raises(ValueError, match=msg):
data.map(lambda x: np.nan, na_action=na_action)
@pytest.mark.xfail(raises=TypeError, reason="no sparse StringDtype")
def test_astype_string(self, data, nullable_string_dtype):
# TODO: this fails because we do not pass nullable_string_dtype through to
# the base test; if we did, the 0-cases would xpass.
super().test_astype_string(data)
series_scalar_exc = None
frame_scalar_exc = None
divmod_exc = None
series_array_exc = None
def _skip_if_different_combine(self, data):
if data.fill_value == 0:
# arith ops are applied to dtype.fill_value as well so that sparsity
# is maintained, but Series.combine cannot reproduce that fill_value
# handling, so we cannot construct the expected here; this is tested
# elsewhere.
pytest.skip("Incorrect expected from Series.combine; tested elsewhere")
def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
self._skip_if_different_combine(data)
super().test_arith_series_with_scalar(data, all_arithmetic_operators)
def test_arith_series_with_array(self, data, all_arithmetic_operators):
self._skip_if_different_combine(data)
super().test_arith_series_with_array(data, all_arithmetic_operators)
def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
if data.dtype.fill_value != 0:
pass
elif all_arithmetic_operators.strip("_") not in [
"mul",
"rmul",
"floordiv",
"rfloordiv",
"pow",
"mod",
"rmod",
]:
mark = pytest.mark.xfail(reason="result dtype.fill_value mismatch")
request.applymarker(mark)
super().test_arith_frame_with_scalar(data, all_arithmetic_operators)
def _compare_other(
self, ser: pd.Series, data_for_compare: SparseArray, comparison_op, other
):
op = comparison_op
result = op(data_for_compare, other)
if isinstance(other, pd.Series):
assert isinstance(result, pd.Series)
assert isinstance(result.dtype, SparseDtype)
else:
assert isinstance(result, SparseArray)
assert result.dtype.subtype == np.bool_
if isinstance(other, pd.Series):
fill_value = op(data_for_compare.fill_value, other._values.fill_value)
expected = SparseArray(
op(data_for_compare.to_dense(), np.asarray(other)),
fill_value=fill_value,
dtype=np.bool_,
)
else:
fill_value = np.all(
op(np.asarray(data_for_compare.fill_value), np.asarray(other))
)
expected = SparseArray(
op(data_for_compare.to_dense(), np.asarray(other)),
fill_value=fill_value,
dtype=np.bool_,
)
if isinstance(other, pd.Series):
# error: Incompatible types in assignment
expected = pd.Series(expected) # type: ignore[assignment]
tm.assert_equal(result, expected)
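# Worked example (illustrative) of the expected construction above: the
# comparison is evaluated densely, while the result's fill_value is the
# op applied to the input's fill_value:
#
#     arr = SparseArray([0, 0, 2], fill_value=0)
#     (arr > 1).fill_value   # -> False, i.e. op(0, 1)
#     np.asarray(arr > 1)    # -> array([False, False,  True])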
def test_scalar(self, data_for_compare: SparseArray, comparison_op):
ser = pd.Series(data_for_compare)
self._compare_other(ser, data_for_compare, comparison_op, 0)
self._compare_other(ser, data_for_compare, comparison_op, 1)
self._compare_other(ser, data_for_compare, comparison_op, -1)
self._compare_other(ser, data_for_compare, comparison_op, np.nan)
def test_array(self, data_for_compare: SparseArray, comparison_op, request):
if data_for_compare.dtype.fill_value == 0 and comparison_op.__name__ in [
"eq",
"ge",
"le",
]:
mark = pytest.mark.xfail(reason="Wrong fill_value")
request.applymarker(mark)
arr = np.linspace(-4, 5, 10)
ser = pd.Series(data_for_compare)
self._compare_other(ser, data_for_compare, comparison_op, arr)
def test_sparse_array(self, data_for_compare: SparseArray, comparison_op, request):
if data_for_compare.dtype.fill_value == 0 and comparison_op.__name__ != "gt":
mark = pytest.mark.xfail(reason="Wrong fill_value")
request.applymarker(mark)
ser = pd.Series(data_for_compare)
arr = data_for_compare + 1
self._compare_other(ser, data_for_compare, comparison_op, arr)
arr = data_for_compare * 2
self._compare_other(ser, data_for_compare, comparison_op, arr)
@pytest.mark.xfail(reason="Different repr")
def test_array_repr(self, data, size):
super().test_array_repr(data, size)
@pytest.mark.xfail(reason="result does not match expected")
@pytest.mark.parametrize("as_index", [True, False])
def test_groupby_extension_agg(self, as_index, data_for_grouping):
super().test_groupby_extension_agg(as_index, data_for_grouping)
def test_array_type_with_arg(dtype):
assert dtype.construct_array_type() is SparseArray

View File

@ -0,0 +1,242 @@
"""
This file contains a minimal set of tests for compliance with the extension
array interface test suite, and should contain no other tests.
The test suite for the full functionality of the array is located in
`pandas/tests/arrays/`.
The tests in this file are inherited from the BaseExtensionTests, and only
minimal tweaks should be applied to get the tests passing (by overwriting a
parent method).
Additional tests should either be added to one of the BaseExtensionTests
classes (if they are relevant for the extension interface for all dtypes), or
be added to the array-specific tests in `pandas/tests/arrays/`.
"""
from __future__ import annotations
import string
from typing import cast
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.api.types import is_string_dtype
from pandas.core.arrays import ArrowStringArray
from pandas.core.arrays.string_ import StringDtype
from pandas.tests.extension import base
def maybe_split_array(arr, chunked):
if not chunked:
return arr
elif arr.dtype.storage != "pyarrow":
return arr
pa = pytest.importorskip("pyarrow")
arrow_array = arr._pa_array
split = len(arrow_array) // 2
arrow_array = pa.chunked_array(
[*arrow_array[:split].chunks, *arrow_array[split:].chunks]
)
assert arrow_array.num_chunks == 2
return type(arr)(arrow_array)
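# Hedged usage note for maybe_split_array: "python"-storage arrays pass
# through untouched, while pyarrow-backed arrays are rebuilt as a
# two-chunk ChunkedArray so the suite also exercises chunked data. A
# minimal sketch of the chunking step (assumes pyarrow is installed):
#
#     import pyarrow as pa
#     chunked = pa.chunked_array([pa.array(["a"]), pa.array(["b", "c"])])
#     chunked.num_chunks  # -> 2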
@pytest.fixture(params=[True, False])
def chunked(request):
return request.param
@pytest.fixture
def dtype(string_storage):
return StringDtype(storage=string_storage)
@pytest.fixture
def data(dtype, chunked):
rng = np.random.default_rng(2)
strings = rng.choice(list(string.ascii_letters), size=100)
while strings[0] == strings[1]:
# reseeding identically would loop forever; reuse the same generator instead
strings = rng.choice(list(string.ascii_letters), size=100)
arr = dtype.construct_array_type()._from_sequence(strings, dtype=dtype)
return maybe_split_array(arr, chunked)
@pytest.fixture
def data_missing(dtype, chunked):
"""Length 2 array with [NA, Valid]"""
arr = dtype.construct_array_type()._from_sequence([pd.NA, "A"], dtype=dtype)
return maybe_split_array(arr, chunked)
@pytest.fixture
def data_for_sorting(dtype, chunked):
arr = dtype.construct_array_type()._from_sequence(["B", "C", "A"], dtype=dtype)
return maybe_split_array(arr, chunked)
@pytest.fixture
def data_missing_for_sorting(dtype, chunked):
arr = dtype.construct_array_type()._from_sequence(["B", pd.NA, "A"], dtype=dtype)
return maybe_split_array(arr, chunked)
@pytest.fixture
def data_for_grouping(dtype, chunked):
arr = dtype.construct_array_type()._from_sequence(
["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"], dtype=dtype
)
return maybe_split_array(arr, chunked)
class TestStringArray(base.ExtensionTests):
def test_eq_with_str(self, dtype):
assert dtype == f"string[{dtype.storage}]"
super().test_eq_with_str(dtype)
def test_is_not_string_type(self, dtype):
# Different from BaseDtypeTests.test_is_not_string_type
# because StringDtype is a string type
assert is_string_dtype(dtype)
def test_view(self, data, request, arrow_string_storage):
if data.dtype.storage in arrow_string_storage:
pytest.skip(reason="2D support not implemented for ArrowStringArray")
super().test_view(data)
def test_from_dtype(self, data):
# base test uses string representation of dtype
pass
def test_transpose(self, data, request, arrow_string_storage):
if data.dtype.storage in arrow_string_storage:
pytest.skip(reason="2D support not implemented for ArrowStringArray")
super().test_transpose(data)
def test_setitem_preserves_views(self, data, request, arrow_string_storage):
if data.dtype.storage in arrow_string_storage:
pytest.skip(reason="2D support not implemented for ArrowStringArray")
super().test_setitem_preserves_views(data)
def test_dropna_array(self, data_missing):
result = data_missing.dropna()
expected = data_missing[[1]]
tm.assert_extension_array_equal(result, expected)
def test_fillna_no_op_returns_copy(self, data):
data = data[~data.isna()]
valid = data[0]
result = data.fillna(valid)
assert result is not data
tm.assert_extension_array_equal(result, data)
result = data.fillna(method="backfill")
assert result is not data
tm.assert_extension_array_equal(result, data)
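# Sketch of the no-op contract tested above (plain StringArray example,
# not tied to the fixtures): filling an array that holds no NAs still
# returns a new object with equal contents:
#
#     arr = pd.array(["a", "b"], dtype="string")
#     out = arr.fillna("x")
#     assert out is not arr and list(out) == list(arr)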
def _get_expected_exception(
self, op_name: str, obj, other
) -> type[Exception] | None:
if op_name in ["__divmod__", "__rdivmod__"]:
if isinstance(obj, pd.Series) and cast(
StringDtype, tm.get_dtype(obj)
).storage in [
"pyarrow",
"pyarrow_numpy",
]:
# TODO: re-raise as TypeError?
return NotImplementedError
elif isinstance(other, pd.Series) and cast(
StringDtype, tm.get_dtype(other)
).storage in [
"pyarrow",
"pyarrow_numpy",
]:
# TODO: re-raise as TypeError?
return NotImplementedError
return TypeError
elif op_name in ["__mod__", "__rmod__", "__pow__", "__rpow__"]:
if cast(StringDtype, tm.get_dtype(obj)).storage in [
"pyarrow",
"pyarrow_numpy",
]:
return NotImplementedError
return TypeError
elif op_name in ["__mul__", "__rmul__"]:
# Can only multiply strings by integers
return TypeError
elif op_name in [
"__truediv__",
"__rtruediv__",
"__floordiv__",
"__rfloordiv__",
"__sub__",
"__rsub__",
]:
if cast(StringDtype, tm.get_dtype(obj)).storage in [
"pyarrow",
"pyarrow_numpy",
]:
import pyarrow as pa
# TODO: better to re-raise as TypeError?
return pa.ArrowNotImplementedError
return TypeError
return None
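# Hedged illustration of the mapping above for the default "python"
# storage: repeating a string Series by an integer is well defined,
# while string * string is the TypeError case recorded here:
#
#     s = pd.Series(["ab"], dtype="string")
#     s * 2    # -> ["abab"]
#     s * "x"  # raises TypeError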
def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
return op_name in ["min", "max"] or (
ser.dtype.storage == "pyarrow_numpy"  # type: ignore[union-attr]
and op_name in ("any", "all")
)
def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
dtype = cast(StringDtype, tm.get_dtype(obj))
if op_name in ["__add__", "__radd__"]:
cast_to = dtype
elif dtype.storage == "pyarrow":
cast_to = "boolean[pyarrow]" # type: ignore[assignment]
elif dtype.storage == "pyarrow_numpy":
cast_to = np.bool_ # type: ignore[assignment]
else:
cast_to = "boolean" # type: ignore[assignment]
return pointwise_result.astype(cast_to)
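# Hedged sketch of the casting rule above for the default "python"
# storage: element-wise comparisons come back as the nullable "boolean"
# dtype, which is what the pointwise expected is cast to:
#
#     arr = pd.array(["a", "b", pd.NA], dtype="string")
#     (arr == "a").dtype  # -> BooleanDtype, with NA propagated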
def test_compare_scalar(self, data, comparison_op):
ser = pd.Series(data)
self._compare_other(ser, data, comparison_op, "abc")
@pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning")
def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op)
class Test2DCompat(base.Dim2CompatTests):
@pytest.fixture(autouse=True)
def arrow_not_supported(self, data):
if isinstance(data, ArrowStringArray):
pytest.skip(reason="2D support not implemented for ArrowStringArray")
def test_searchsorted_with_na_raises(data_for_sorting, as_series):
# GH50447
b, c, a = data_for_sorting
arr = data_for_sorting.take([2, 0, 1]) # to get [a, b, c]
arr[-1] = pd.NA
if as_series:
arr = pd.Series(arr)
msg = (
"searchsorted requires array to be sorted, "
"which is impossible with NAs present."
)
with pytest.raises(ValueError, match=msg):
arr.searchsorted(b)