This commit is contained in:
2024-12-04 13:35:57 +05:00
parent d346bf4b2a
commit 73ce681a55
7059 changed files with 1196501 additions and 0 deletions

View File

@ -0,0 +1,139 @@
import operator
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.fixture
def data():
"""Fixture returning boolean array with valid and missing values."""
return pd.array(
[True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False],
dtype="boolean",
)
@pytest.fixture
def left_array():
"""Fixture returning boolean array with valid and missing values."""
return pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
@pytest.fixture
def right_array():
"""Fixture returning boolean array with valid and missing values."""
return pd.array([True, False, None] * 3, dtype="boolean")
# Basic test for the arithmetic array ops
# -----------------------------------------------------------------------------
@pytest.mark.parametrize(
"opname, exp",
[
("add", [True, True, None, True, False, None, None, None, None]),
("mul", [True, False, None, False, False, None, None, None, None]),
],
ids=["add", "mul"],
)
def test_add_mul(left_array, right_array, opname, exp):
op = getattr(operator, opname)
result = op(left_array, right_array)
expected = pd.array(exp, dtype="boolean")
tm.assert_extension_array_equal(result, expected)
def test_sub(left_array, right_array):
msg = (
r"numpy boolean subtract, the `-` operator, is (?:deprecated|not supported), "
r"use the bitwise_xor, the `\^` operator, or the logical_xor function instead\."
)
with pytest.raises(TypeError, match=msg):
left_array - right_array
def test_div(left_array, right_array):
msg = "operator '.*' not implemented for bool dtypes"
with pytest.raises(NotImplementedError, match=msg):
# check that we are matching the non-masked Series behavior
pd.Series(left_array._data) / pd.Series(right_array._data)
with pytest.raises(NotImplementedError, match=msg):
left_array / right_array
@pytest.mark.parametrize(
"opname",
[
"floordiv",
"mod",
"pow",
],
)
def test_op_int8(left_array, right_array, opname):
op = getattr(operator, opname)
if opname != "mod":
msg = "operator '.*' not implemented for bool dtypes"
with pytest.raises(NotImplementedError, match=msg):
result = op(left_array, right_array)
return
result = op(left_array, right_array)
expected = op(left_array.astype("Int8"), right_array.astype("Int8"))
tm.assert_extension_array_equal(result, expected)
# Test generic characteristics / errors
# -----------------------------------------------------------------------------
def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
# invalid ops
if using_infer_string:
import pyarrow as pa
err = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
else:
err = TypeError
op = all_arithmetic_operators
s = pd.Series(data)
ops = getattr(s, op)
# invalid scalars
msg = (
"did not contain a loop with signature matching types|"
"BooleanArray cannot perform the operation|"
"not supported for the input types, and the inputs could not be safely coerced "
"to any supported types according to the casting rule ''safe''"
)
with pytest.raises(TypeError, match=msg):
ops("foo")
msg = "|".join(
[
r"unsupported operand type\(s\) for",
"Concatenation operation is not implemented for NumPy arrays",
"has no kernel",
]
)
with pytest.raises(err, match=msg):
ops(pd.Timestamp("20180101"))
# invalid array-likes
if op not in ("__mul__", "__rmul__"):
# TODO(extension) numpy's mul with object array sees booleans as numbers
msg = "|".join(
[
r"unsupported operand type\(s\) for",
"can only concatenate str",
"not all arguments converted during string formatting",
"has no kernel",
"not implemented",
]
)
with pytest.raises(err, match=msg):
ops(pd.Series("foo", index=s.index))

View File

@ -0,0 +1,53 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
def test_astype():
# with missing values
arr = pd.array([True, False, None], dtype="boolean")
with pytest.raises(ValueError, match="cannot convert NA to integer"):
arr.astype("int64")
with pytest.raises(ValueError, match="cannot convert float NaN to"):
arr.astype("bool")
result = arr.astype("float64")
expected = np.array([1, 0, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
result = arr.astype("str")
expected = np.array(["True", "False", "<NA>"], dtype=f"{tm.ENDIAN}U5")
tm.assert_numpy_array_equal(result, expected)
# no missing values
arr = pd.array([True, False, True], dtype="boolean")
result = arr.astype("int64")
expected = np.array([1, 0, 1], dtype="int64")
tm.assert_numpy_array_equal(result, expected)
result = arr.astype("bool")
expected = np.array([True, False, True], dtype="bool")
tm.assert_numpy_array_equal(result, expected)
def test_astype_to_boolean_array():
# astype to BooleanArray
arr = pd.array([True, False, None], dtype="boolean")
result = arr.astype("boolean")
tm.assert_extension_array_equal(result, arr)
result = arr.astype(pd.BooleanDtype())
tm.assert_extension_array_equal(result, arr)
def test_astype_to_integer_array():
# astype to IntegerArray
arr = pd.array([True, False, None], dtype="boolean")
result = arr.astype("Int64")
expected = pd.array([1, 0, None], dtype="Int64")
tm.assert_extension_array_equal(result, expected)

View File

@ -0,0 +1,60 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.arrays import BooleanArray
from pandas.tests.arrays.masked_shared import ComparisonOps
@pytest.fixture
def data():
"""Fixture returning boolean array with valid and missing data"""
return pd.array(
[True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False],
dtype="boolean",
)
@pytest.fixture
def dtype():
"""Fixture returning BooleanDtype"""
return pd.BooleanDtype()
class TestComparisonOps(ComparisonOps):
def test_compare_scalar(self, data, comparison_op):
self._compare_other(data, comparison_op, True)
def test_compare_array(self, data, comparison_op):
other = pd.array([True] * len(data), dtype="boolean")
self._compare_other(data, comparison_op, other)
other = np.array([True] * len(data))
self._compare_other(data, comparison_op, other)
other = pd.Series([True] * len(data))
self._compare_other(data, comparison_op, other)
@pytest.mark.parametrize("other", [True, False, pd.NA])
def test_scalar(self, other, comparison_op, dtype):
ComparisonOps.test_scalar(self, other, comparison_op, dtype)
def test_array(self, comparison_op):
op = comparison_op
a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
b = pd.array([True, False, None] * 3, dtype="boolean")
result = op(a, b)
values = op(a._data, b._data)
mask = a._mask | b._mask
expected = BooleanArray(values, mask)
tm.assert_extension_array_equal(result, expected)
# ensure we haven't mutated anything inplace
result[0] = None
tm.assert_extension_array_equal(
a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
)
tm.assert_extension_array_equal(
b, pd.array([True, False, None] * 3, dtype="boolean")
)

View File

@ -0,0 +1,325 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.arrays import BooleanArray
from pandas.core.arrays.boolean import coerce_to_array
def test_boolean_array_constructor():
values = np.array([True, False, True, False], dtype="bool")
mask = np.array([False, False, False, True], dtype="bool")
result = BooleanArray(values, mask)
expected = pd.array([True, False, True, None], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
with pytest.raises(TypeError, match="values should be boolean numpy array"):
BooleanArray(values.tolist(), mask)
with pytest.raises(TypeError, match="mask should be boolean numpy array"):
BooleanArray(values, mask.tolist())
with pytest.raises(TypeError, match="values should be boolean numpy array"):
BooleanArray(values.astype(int), mask)
with pytest.raises(TypeError, match="mask should be boolean numpy array"):
BooleanArray(values, None)
with pytest.raises(ValueError, match="values.shape must match mask.shape"):
BooleanArray(values.reshape(1, -1), mask)
with pytest.raises(ValueError, match="values.shape must match mask.shape"):
BooleanArray(values, mask.reshape(1, -1))
def test_boolean_array_constructor_copy():
values = np.array([True, False, True, False], dtype="bool")
mask = np.array([False, False, False, True], dtype="bool")
result = BooleanArray(values, mask)
assert result._data is values
assert result._mask is mask
result = BooleanArray(values, mask, copy=True)
assert result._data is not values
assert result._mask is not mask
def test_to_boolean_array():
expected = BooleanArray(
np.array([True, False, True]), np.array([False, False, False])
)
result = pd.array([True, False, True], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
result = pd.array(np.array([True, False, True]), dtype="boolean")
tm.assert_extension_array_equal(result, expected)
result = pd.array(np.array([True, False, True], dtype=object), dtype="boolean")
tm.assert_extension_array_equal(result, expected)
# with missing values
expected = BooleanArray(
np.array([True, False, True]), np.array([False, False, True])
)
result = pd.array([True, False, None], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
result = pd.array(np.array([True, False, None], dtype=object), dtype="boolean")
tm.assert_extension_array_equal(result, expected)
def test_to_boolean_array_all_none():
expected = BooleanArray(np.array([True, True, True]), np.array([True, True, True]))
result = pd.array([None, None, None], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
result = pd.array(np.array([None, None, None], dtype=object), dtype="boolean")
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize(
"a, b",
[
([True, False, None, np.nan, pd.NA], [True, False, None, None, None]),
([True, np.nan], [True, None]),
([True, pd.NA], [True, None]),
([np.nan, np.nan], [None, None]),
(np.array([np.nan, np.nan], dtype=float), [None, None]),
],
)
def test_to_boolean_array_missing_indicators(a, b):
result = pd.array(a, dtype="boolean")
expected = pd.array(b, dtype="boolean")
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize(
"values",
[
["foo", "bar"],
["1", "2"],
# "foo",
[1, 2],
[1.0, 2.0],
pd.date_range("20130101", periods=2),
np.array(["foo"]),
np.array([1, 2]),
np.array([1.0, 2.0]),
[np.nan, {"a": 1}],
],
)
def test_to_boolean_array_error(values):
# error in converting existing arrays to BooleanArray
msg = "Need to pass bool-like value"
with pytest.raises(TypeError, match=msg):
pd.array(values, dtype="boolean")
def test_to_boolean_array_from_integer_array():
result = pd.array(np.array([1, 0, 1, 0]), dtype="boolean")
expected = pd.array([True, False, True, False], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
# with missing values
result = pd.array(np.array([1, 0, 1, None]), dtype="boolean")
expected = pd.array([True, False, True, None], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
def test_to_boolean_array_from_float_array():
result = pd.array(np.array([1.0, 0.0, 1.0, 0.0]), dtype="boolean")
expected = pd.array([True, False, True, False], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
# with missing values
result = pd.array(np.array([1.0, 0.0, 1.0, np.nan]), dtype="boolean")
expected = pd.array([True, False, True, None], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
def test_to_boolean_array_integer_like():
# integers of 0's and 1's
result = pd.array([1, 0, 1, 0], dtype="boolean")
expected = pd.array([True, False, True, False], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
# with missing values
result = pd.array([1, 0, 1, None], dtype="boolean")
expected = pd.array([True, False, True, None], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
def test_coerce_to_array():
# TODO this is currently not public API
values = np.array([True, False, True, False], dtype="bool")
mask = np.array([False, False, False, True], dtype="bool")
result = BooleanArray(*coerce_to_array(values, mask=mask))
expected = BooleanArray(values, mask)
tm.assert_extension_array_equal(result, expected)
assert result._data is values
assert result._mask is mask
result = BooleanArray(*coerce_to_array(values, mask=mask, copy=True))
expected = BooleanArray(values, mask)
tm.assert_extension_array_equal(result, expected)
assert result._data is not values
assert result._mask is not mask
# mixed missing from values and mask
values = [True, False, None, False]
mask = np.array([False, False, False, True], dtype="bool")
result = BooleanArray(*coerce_to_array(values, mask=mask))
expected = BooleanArray(
np.array([True, False, True, True]), np.array([False, False, True, True])
)
tm.assert_extension_array_equal(result, expected)
result = BooleanArray(*coerce_to_array(np.array(values, dtype=object), mask=mask))
tm.assert_extension_array_equal(result, expected)
result = BooleanArray(*coerce_to_array(values, mask=mask.tolist()))
tm.assert_extension_array_equal(result, expected)
# raise errors for wrong dimension
values = np.array([True, False, True, False], dtype="bool")
mask = np.array([False, False, False, True], dtype="bool")
# passing 2D values is OK as long as no mask
coerce_to_array(values.reshape(1, -1))
with pytest.raises(ValueError, match="values.shape and mask.shape must match"):
coerce_to_array(values.reshape(1, -1), mask=mask)
with pytest.raises(ValueError, match="values.shape and mask.shape must match"):
coerce_to_array(values, mask=mask.reshape(1, -1))
def test_coerce_to_array_from_boolean_array():
# passing BooleanArray to coerce_to_array
values = np.array([True, False, True, False], dtype="bool")
mask = np.array([False, False, False, True], dtype="bool")
arr = BooleanArray(values, mask)
result = BooleanArray(*coerce_to_array(arr))
tm.assert_extension_array_equal(result, arr)
# no copy
assert result._data is arr._data
assert result._mask is arr._mask
result = BooleanArray(*coerce_to_array(arr), copy=True)
tm.assert_extension_array_equal(result, arr)
assert result._data is not arr._data
assert result._mask is not arr._mask
with pytest.raises(ValueError, match="cannot pass mask for BooleanArray input"):
coerce_to_array(arr, mask=mask)
def test_coerce_to_numpy_array():
# with missing values -> object dtype
arr = pd.array([True, False, None], dtype="boolean")
result = np.array(arr)
expected = np.array([True, False, pd.NA], dtype="object")
tm.assert_numpy_array_equal(result, expected)
# also with no missing values -> object dtype
arr = pd.array([True, False, True], dtype="boolean")
result = np.array(arr)
expected = np.array([True, False, True], dtype="bool")
tm.assert_numpy_array_equal(result, expected)
# force bool dtype
result = np.array(arr, dtype="bool")
expected = np.array([True, False, True], dtype="bool")
tm.assert_numpy_array_equal(result, expected)
# with missing values will raise error
arr = pd.array([True, False, None], dtype="boolean")
msg = (
"cannot convert to 'bool'-dtype NumPy array with missing values. "
"Specify an appropriate 'na_value' for this dtype."
)
with pytest.raises(ValueError, match=msg):
np.array(arr, dtype="bool")
def test_to_boolean_array_from_strings():
result = BooleanArray._from_sequence_of_strings(
np.array(["True", "False", "1", "1.0", "0", "0.0", np.nan], dtype=object),
dtype="boolean",
)
expected = BooleanArray(
np.array([True, False, True, True, False, False, False]),
np.array([False, False, False, False, False, False, True]),
)
tm.assert_extension_array_equal(result, expected)
def test_to_boolean_array_from_strings_invalid_string():
with pytest.raises(ValueError, match="cannot be cast"):
BooleanArray._from_sequence_of_strings(["donkey"], dtype="boolean")
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy(box):
con = pd.Series if box else pd.array
# default (with or without missing values) -> object dtype
arr = con([True, False, True], dtype="boolean")
result = arr.to_numpy()
expected = np.array([True, False, True], dtype="bool")
tm.assert_numpy_array_equal(result, expected)
arr = con([True, False, None], dtype="boolean")
result = arr.to_numpy()
expected = np.array([True, False, pd.NA], dtype="object")
tm.assert_numpy_array_equal(result, expected)
arr = con([True, False, None], dtype="boolean")
result = arr.to_numpy(dtype="str")
expected = np.array([True, False, pd.NA], dtype=f"{tm.ENDIAN}U5")
tm.assert_numpy_array_equal(result, expected)
# no missing values -> can convert to bool, otherwise raises
arr = con([True, False, True], dtype="boolean")
result = arr.to_numpy(dtype="bool")
expected = np.array([True, False, True], dtype="bool")
tm.assert_numpy_array_equal(result, expected)
arr = con([True, False, None], dtype="boolean")
with pytest.raises(ValueError, match="cannot convert to 'bool'-dtype"):
result = arr.to_numpy(dtype="bool")
# specify dtype and na_value
arr = con([True, False, None], dtype="boolean")
result = arr.to_numpy(dtype=object, na_value=None)
expected = np.array([True, False, None], dtype="object")
tm.assert_numpy_array_equal(result, expected)
result = arr.to_numpy(dtype=bool, na_value=False)
expected = np.array([True, False, False], dtype="bool")
tm.assert_numpy_array_equal(result, expected)
result = arr.to_numpy(dtype="int64", na_value=-99)
expected = np.array([1, 0, -99], dtype="int64")
tm.assert_numpy_array_equal(result, expected)
result = arr.to_numpy(dtype="float64", na_value=np.nan)
expected = np.array([1, 0, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
# converting to int or float without specifying na_value raises
with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"):
arr.to_numpy(dtype="int64")
def test_to_numpy_copy():
# to_numpy can be zero-copy if no missing values
arr = pd.array([True, False, True], dtype="boolean")
result = arr.to_numpy(dtype=bool)
result[0] = False
tm.assert_extension_array_equal(
arr, pd.array([False, False, True], dtype="boolean")
)
arr = pd.array([True, False, True], dtype="boolean")
result = arr.to_numpy(dtype=bool, copy=True)
result[0] = False
tm.assert_extension_array_equal(arr, pd.array([True, False, True], dtype="boolean"))

View File

@ -0,0 +1,126 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize(
"ufunc", [np.add, np.logical_or, np.logical_and, np.logical_xor]
)
def test_ufuncs_binary(ufunc):
# two BooleanArrays
a = pd.array([True, False, None], dtype="boolean")
result = ufunc(a, a)
expected = pd.array(ufunc(a._data, a._data), dtype="boolean")
expected[a._mask] = np.nan
tm.assert_extension_array_equal(result, expected)
s = pd.Series(a)
result = ufunc(s, a)
expected = pd.Series(ufunc(a._data, a._data), dtype="boolean")
expected[a._mask] = np.nan
tm.assert_series_equal(result, expected)
# Boolean with numpy array
arr = np.array([True, True, False])
result = ufunc(a, arr)
expected = pd.array(ufunc(a._data, arr), dtype="boolean")
expected[a._mask] = np.nan
tm.assert_extension_array_equal(result, expected)
result = ufunc(arr, a)
expected = pd.array(ufunc(arr, a._data), dtype="boolean")
expected[a._mask] = np.nan
tm.assert_extension_array_equal(result, expected)
# BooleanArray with scalar
result = ufunc(a, True)
expected = pd.array(ufunc(a._data, True), dtype="boolean")
expected[a._mask] = np.nan
tm.assert_extension_array_equal(result, expected)
result = ufunc(True, a)
expected = pd.array(ufunc(True, a._data), dtype="boolean")
expected[a._mask] = np.nan
tm.assert_extension_array_equal(result, expected)
# not handled types
msg = r"operand type\(s\) all returned NotImplemented from __array_ufunc__"
with pytest.raises(TypeError, match=msg):
ufunc(a, "test")
@pytest.mark.parametrize("ufunc", [np.logical_not])
def test_ufuncs_unary(ufunc):
a = pd.array([True, False, None], dtype="boolean")
result = ufunc(a)
expected = pd.array(ufunc(a._data), dtype="boolean")
expected[a._mask] = np.nan
tm.assert_extension_array_equal(result, expected)
ser = pd.Series(a)
result = ufunc(ser)
expected = pd.Series(ufunc(a._data), dtype="boolean")
expected[a._mask] = np.nan
tm.assert_series_equal(result, expected)
def test_ufunc_numeric():
# np.sqrt on np.bool_ returns float16, which we upcast to Float32
# bc we do not have Float16
arr = pd.array([True, False, None], dtype="boolean")
res = np.sqrt(arr)
expected = pd.array([1, 0, None], dtype="Float32")
tm.assert_extension_array_equal(res, expected)
@pytest.mark.parametrize("values", [[True, False], [True, None]])
def test_ufunc_reduce_raises(values):
arr = pd.array(values, dtype="boolean")
res = np.add.reduce(arr)
if arr[-1] is pd.NA:
expected = pd.NA
else:
expected = arr._data.sum()
tm.assert_almost_equal(res, expected)
def test_value_counts_na():
arr = pd.array([True, False, pd.NA], dtype="boolean")
result = arr.value_counts(dropna=False)
expected = pd.Series([1, 1, 1], index=arr, dtype="Int64", name="count")
assert expected.index.dtype == arr.dtype
tm.assert_series_equal(result, expected)
result = arr.value_counts(dropna=True)
expected = pd.Series([1, 1], index=arr[:-1], dtype="Int64", name="count")
assert expected.index.dtype == arr.dtype
tm.assert_series_equal(result, expected)
def test_value_counts_with_normalize():
ser = pd.Series([True, False, pd.NA], dtype="boolean")
result = ser.value_counts(normalize=True)
expected = pd.Series([1, 1], index=ser[:-1], dtype="Float64", name="proportion") / 2
assert expected.index.dtype == "boolean"
tm.assert_series_equal(result, expected)
def test_diff():
a = pd.array(
[True, True, False, False, True, None, True, None, False], dtype="boolean"
)
result = pd.core.algorithms.diff(a, 1)
expected = pd.array(
[None, False, True, False, True, None, None, None, None], dtype="boolean"
)
tm.assert_extension_array_equal(result, expected)
ser = pd.Series(a)
result = ser.diff()
expected = pd.Series(expected)
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,13 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize("na", [None, np.nan, pd.NA])
def test_setitem_missing_values(na):
arr = pd.array([True, False, None], dtype="boolean")
expected = pd.array([True, None, None], dtype="boolean")
arr[1] = na
tm.assert_extension_array_equal(arr, expected)

View File

@ -0,0 +1,254 @@
import operator
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.arrays import BooleanArray
from pandas.core.ops.mask_ops import (
kleene_and,
kleene_or,
kleene_xor,
)
from pandas.tests.extension.base import BaseOpsUtil
class TestLogicalOps(BaseOpsUtil):
def test_numpy_scalars_ok(self, all_logical_operators):
a = pd.array([True, False, None], dtype="boolean")
op = getattr(a, all_logical_operators)
tm.assert_extension_array_equal(op(True), op(np.bool_(True)))
tm.assert_extension_array_equal(op(False), op(np.bool_(False)))
def get_op_from_name(self, op_name):
short_opname = op_name.strip("_")
short_opname = short_opname if "xor" in short_opname else short_opname + "_"
try:
op = getattr(operator, short_opname)
except AttributeError:
# Assume it is the reverse operator
rop = getattr(operator, short_opname[1:])
op = lambda x, y: rop(y, x)
return op
def test_empty_ok(self, all_logical_operators):
a = pd.array([], dtype="boolean")
op_name = all_logical_operators
result = getattr(a, op_name)(True)
tm.assert_extension_array_equal(a, result)
result = getattr(a, op_name)(False)
tm.assert_extension_array_equal(a, result)
result = getattr(a, op_name)(pd.NA)
tm.assert_extension_array_equal(a, result)
@pytest.mark.parametrize(
"other", ["a", pd.Timestamp(2017, 1, 1, 12), np.timedelta64(4)]
)
def test_eq_mismatched_type(self, other):
# GH-44499
arr = pd.array([True, False])
result = arr == other
expected = pd.array([False, False])
tm.assert_extension_array_equal(result, expected)
result = arr != other
expected = pd.array([True, True])
tm.assert_extension_array_equal(result, expected)
def test_logical_length_mismatch_raises(self, all_logical_operators):
op_name = all_logical_operators
a = pd.array([True, False, None], dtype="boolean")
msg = "Lengths must match"
with pytest.raises(ValueError, match=msg):
getattr(a, op_name)([True, False])
with pytest.raises(ValueError, match=msg):
getattr(a, op_name)(np.array([True, False]))
with pytest.raises(ValueError, match=msg):
getattr(a, op_name)(pd.array([True, False], dtype="boolean"))
def test_logical_nan_raises(self, all_logical_operators):
op_name = all_logical_operators
a = pd.array([True, False, None], dtype="boolean")
msg = "Got float instead"
with pytest.raises(TypeError, match=msg):
getattr(a, op_name)(np.nan)
@pytest.mark.parametrize("other", ["a", 1])
def test_non_bool_or_na_other_raises(self, other, all_logical_operators):
a = pd.array([True, False], dtype="boolean")
with pytest.raises(TypeError, match=str(type(other).__name__)):
getattr(a, all_logical_operators)(other)
def test_kleene_or(self):
# A clear test of behavior.
a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
b = pd.array([True, False, None] * 3, dtype="boolean")
result = a | b
expected = pd.array(
[True, True, True, True, False, None, True, None, None], dtype="boolean"
)
tm.assert_extension_array_equal(result, expected)
result = b | a
tm.assert_extension_array_equal(result, expected)
# ensure we haven't mutated anything inplace
tm.assert_extension_array_equal(
a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
)
tm.assert_extension_array_equal(
b, pd.array([True, False, None] * 3, dtype="boolean")
)
@pytest.mark.parametrize(
"other, expected",
[
(pd.NA, [True, None, None]),
(True, [True, True, True]),
(np.bool_(True), [True, True, True]),
(False, [True, False, None]),
(np.bool_(False), [True, False, None]),
],
)
def test_kleene_or_scalar(self, other, expected):
# TODO: test True & False
a = pd.array([True, False, None], dtype="boolean")
result = a | other
expected = pd.array(expected, dtype="boolean")
tm.assert_extension_array_equal(result, expected)
result = other | a
tm.assert_extension_array_equal(result, expected)
# ensure we haven't mutated anything inplace
tm.assert_extension_array_equal(
a, pd.array([True, False, None], dtype="boolean")
)
def test_kleene_and(self):
# A clear test of behavior.
a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
b = pd.array([True, False, None] * 3, dtype="boolean")
result = a & b
expected = pd.array(
[True, False, None, False, False, False, None, False, None], dtype="boolean"
)
tm.assert_extension_array_equal(result, expected)
result = b & a
tm.assert_extension_array_equal(result, expected)
# ensure we haven't mutated anything inplace
tm.assert_extension_array_equal(
a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
)
tm.assert_extension_array_equal(
b, pd.array([True, False, None] * 3, dtype="boolean")
)
@pytest.mark.parametrize(
"other, expected",
[
(pd.NA, [None, False, None]),
(True, [True, False, None]),
(False, [False, False, False]),
(np.bool_(True), [True, False, None]),
(np.bool_(False), [False, False, False]),
],
)
def test_kleene_and_scalar(self, other, expected):
a = pd.array([True, False, None], dtype="boolean")
result = a & other
expected = pd.array(expected, dtype="boolean")
tm.assert_extension_array_equal(result, expected)
result = other & a
tm.assert_extension_array_equal(result, expected)
# ensure we haven't mutated anything inplace
tm.assert_extension_array_equal(
a, pd.array([True, False, None], dtype="boolean")
)
def test_kleene_xor(self):
a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
b = pd.array([True, False, None] * 3, dtype="boolean")
result = a ^ b
expected = pd.array(
[False, True, None, True, False, None, None, None, None], dtype="boolean"
)
tm.assert_extension_array_equal(result, expected)
result = b ^ a
tm.assert_extension_array_equal(result, expected)
# ensure we haven't mutated anything inplace
tm.assert_extension_array_equal(
a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
)
tm.assert_extension_array_equal(
b, pd.array([True, False, None] * 3, dtype="boolean")
)
@pytest.mark.parametrize(
"other, expected",
[
(pd.NA, [None, None, None]),
(True, [False, True, None]),
(np.bool_(True), [False, True, None]),
(np.bool_(False), [True, False, None]),
],
)
def test_kleene_xor_scalar(self, other, expected):
a = pd.array([True, False, None], dtype="boolean")
result = a ^ other
expected = pd.array(expected, dtype="boolean")
tm.assert_extension_array_equal(result, expected)
result = other ^ a
tm.assert_extension_array_equal(result, expected)
# ensure we haven't mutated anything inplace
tm.assert_extension_array_equal(
a, pd.array([True, False, None], dtype="boolean")
)
@pytest.mark.parametrize("other", [True, False, pd.NA, [True, False, None] * 3])
def test_no_masked_assumptions(self, other, all_logical_operators):
# The logical operations should not assume that masked values are False!
a = pd.arrays.BooleanArray(
np.array([True, True, True, False, False, False, True, False, True]),
np.array([False] * 6 + [True, True, True]),
)
b = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
if isinstance(other, list):
other = pd.array(other, dtype="boolean")
result = getattr(a, all_logical_operators)(other)
expected = getattr(b, all_logical_operators)(other)
tm.assert_extension_array_equal(result, expected)
if isinstance(other, BooleanArray):
other._data[other._mask] = True
a._data[a._mask] = False
result = getattr(a, all_logical_operators)(other)
expected = getattr(b, all_logical_operators)(other)
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("operation", [kleene_or, kleene_xor, kleene_and])
def test_error_both_scalar(operation):
msg = r"Either `left` or `right` need to be a np\.ndarray."
with pytest.raises(TypeError, match=msg):
# masks need to be non-None, otherwise it ends up in an infinite recursion
operation(True, True, np.zeros(1), np.zeros(1))

View File

@ -0,0 +1,27 @@
import pandas as pd
import pandas._testing as tm
class TestUnaryOps:
def test_invert(self):
a = pd.array([True, False, None], dtype="boolean")
expected = pd.array([False, True, None], dtype="boolean")
tm.assert_extension_array_equal(~a, expected)
expected = pd.Series(expected, index=["a", "b", "c"], name="name")
result = ~pd.Series(a, index=["a", "b", "c"], name="name")
tm.assert_series_equal(result, expected)
df = pd.DataFrame({"A": a, "B": [True, False, False]}, index=["a", "b", "c"])
result = ~df
expected = pd.DataFrame(
{"A": expected, "B": [False, True, True]}, index=["a", "b", "c"]
)
tm.assert_frame_equal(result, expected)
def test_abs(self):
# matching numpy behavior, abs is the identity function
arr = pd.array([True, False, None], dtype="boolean")
result = abs(arr)
tm.assert_extension_array_equal(result, arr)

View File

@ -0,0 +1,62 @@
import numpy as np
import pytest
import pandas as pd
@pytest.fixture
def data():
"""Fixture returning boolean array, with valid and missing values."""
return pd.array(
[True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False],
dtype="boolean",
)
@pytest.mark.parametrize(
"values, exp_any, exp_all, exp_any_noskip, exp_all_noskip",
[
([True, pd.NA], True, True, True, pd.NA),
([False, pd.NA], False, False, pd.NA, False),
([pd.NA], False, True, pd.NA, pd.NA),
([], False, True, False, True),
# GH-33253: all True / all False values buggy with skipna=False
([True, True], True, True, True, True),
([False, False], False, False, False, False),
],
)
def test_any_all(values, exp_any, exp_all, exp_any_noskip, exp_all_noskip):
# the methods return numpy scalars
exp_any = pd.NA if exp_any is pd.NA else np.bool_(exp_any)
exp_all = pd.NA if exp_all is pd.NA else np.bool_(exp_all)
exp_any_noskip = pd.NA if exp_any_noskip is pd.NA else np.bool_(exp_any_noskip)
exp_all_noskip = pd.NA if exp_all_noskip is pd.NA else np.bool_(exp_all_noskip)
for con in [pd.array, pd.Series]:
a = con(values, dtype="boolean")
assert a.any() is exp_any
assert a.all() is exp_all
assert a.any(skipna=False) is exp_any_noskip
assert a.all(skipna=False) is exp_all_noskip
assert np.any(a.any()) is exp_any
assert np.all(a.all()) is exp_all
@pytest.mark.parametrize("dropna", [True, False])
def test_reductions_return_types(dropna, data, all_numeric_reductions):
op = all_numeric_reductions
s = pd.Series(data)
if dropna:
s = s.dropna()
if op in ("sum", "prod"):
assert isinstance(getattr(s, op)(), np.int_)
elif op == "count":
# Oddly on the 32 bit build (but not Windows), this is intc (!= intp)
assert isinstance(getattr(s, op)(), np.integer)
elif op in ("min", "max"):
assert isinstance(getattr(s, op)(), np.bool_)
else:
# "mean", "std", "var", "median", "kurt", "skew"
assert isinstance(getattr(s, op)(), np.float64)

View File

@ -0,0 +1,13 @@
import pandas as pd
def test_repr():
df = pd.DataFrame({"A": pd.array([True, False, None], dtype="boolean")})
expected = " A\n0 True\n1 False\n2 <NA>"
assert repr(df) == expected
expected = "0 True\n1 False\n2 <NA>\nName: A, dtype: boolean"
assert repr(df.A) == expected
expected = "<BooleanArray>\n[True, False, <NA>]\nLength: 3, dtype: boolean"
assert repr(df.A.array) == expected

View File

@ -0,0 +1,89 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize("ordered", [True, False])
@pytest.mark.parametrize("categories", [["b", "a", "c"], ["a", "b", "c", "d"]])
def test_factorize(categories, ordered):
cat = pd.Categorical(
["b", "b", "a", "c", None], categories=categories, ordered=ordered
)
codes, uniques = pd.factorize(cat)
expected_codes = np.array([0, 0, 1, 2, -1], dtype=np.intp)
expected_uniques = pd.Categorical(
["b", "a", "c"], categories=categories, ordered=ordered
)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_categorical_equal(uniques, expected_uniques)
def test_factorized_sort():
cat = pd.Categorical(["b", "b", None, "a"])
codes, uniques = pd.factorize(cat, sort=True)
expected_codes = np.array([1, 1, -1, 0], dtype=np.intp)
expected_uniques = pd.Categorical(["a", "b"])
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_categorical_equal(uniques, expected_uniques)
def test_factorized_sort_ordered():
cat = pd.Categorical(
["b", "b", None, "a"], categories=["c", "b", "a"], ordered=True
)
codes, uniques = pd.factorize(cat, sort=True)
expected_codes = np.array([0, 0, -1, 1], dtype=np.intp)
expected_uniques = pd.Categorical(
["b", "a"], categories=["c", "b", "a"], ordered=True
)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_categorical_equal(uniques, expected_uniques)
def test_isin_cats():
# GH2003
cat = pd.Categorical(["a", "b", np.nan])
result = cat.isin(["a", np.nan])
expected = np.array([True, False, True], dtype=bool)
tm.assert_numpy_array_equal(expected, result)
result = cat.isin(["a", "c"])
expected = np.array([True, False, False], dtype=bool)
tm.assert_numpy_array_equal(expected, result)
@pytest.mark.parametrize("value", [[""], [None, ""], [pd.NaT, ""]])
def test_isin_cats_corner_cases(value):
# GH36550
cat = pd.Categorical([""])
result = cat.isin(value)
expected = np.array([True], dtype=bool)
tm.assert_numpy_array_equal(expected, result)
@pytest.mark.parametrize("empty", [[], pd.Series(dtype=object), np.array([])])
def test_isin_empty(empty):
s = pd.Categorical(["a", "b"])
expected = np.array([False, False], dtype=bool)
result = s.isin(empty)
tm.assert_numpy_array_equal(expected, result)
def test_diff():
ser = pd.Series([1, 2, 3], dtype="category")
msg = "Convert to a suitable dtype"
with pytest.raises(TypeError, match=msg):
ser.diff()
df = ser.to_frame(name="A")
with pytest.raises(TypeError, match=msg):
df.diff()

View File

@ -0,0 +1,349 @@
import re
import sys
import numpy as np
import pytest
from pandas.compat import PYPY
from pandas import (
Categorical,
CategoricalDtype,
DataFrame,
Index,
NaT,
Series,
date_range,
)
import pandas._testing as tm
from pandas.api.types import is_scalar
class TestCategoricalAnalytics:
@pytest.mark.parametrize("aggregation", ["min", "max"])
def test_min_max_not_ordered_raises(self, aggregation):
# unordered cats have no min/max
cat = Categorical(["a", "b", "c", "d"], ordered=False)
msg = f"Categorical is not ordered for operation {aggregation}"
agg_func = getattr(cat, aggregation)
with pytest.raises(TypeError, match=msg):
agg_func()
ufunc = np.minimum if aggregation == "min" else np.maximum
with pytest.raises(TypeError, match=msg):
ufunc.reduce(cat)
def test_min_max_ordered(self, index_or_series_or_array):
cat = Categorical(["a", "b", "c", "d"], ordered=True)
obj = index_or_series_or_array(cat)
_min = obj.min()
_max = obj.max()
assert _min == "a"
assert _max == "d"
assert np.minimum.reduce(obj) == "a"
assert np.maximum.reduce(obj) == "d"
# TODO: raises if we pass axis=0 (on Index and Categorical, not Series)
cat = Categorical(
["a", "b", "c", "d"], categories=["d", "c", "b", "a"], ordered=True
)
obj = index_or_series_or_array(cat)
_min = obj.min()
_max = obj.max()
assert _min == "d"
assert _max == "a"
assert np.minimum.reduce(obj) == "d"
assert np.maximum.reduce(obj) == "a"
def test_min_max_reduce(self):
# GH52788
cat = Categorical(["a", "b", "c", "d"], ordered=True)
df = DataFrame(cat)
result_max = df.agg("max")
expected_max = Series(Categorical(["d"], dtype=cat.dtype))
tm.assert_series_equal(result_max, expected_max)
result_min = df.agg("min")
expected_min = Series(Categorical(["a"], dtype=cat.dtype))
tm.assert_series_equal(result_min, expected_min)
@pytest.mark.parametrize(
"categories,expected",
[
(list("ABC"), np.nan),
([1, 2, 3], np.nan),
pytest.param(
Series(date_range("2020-01-01", periods=3), dtype="category"),
NaT,
marks=pytest.mark.xfail(
reason="https://github.com/pandas-dev/pandas/issues/29962"
),
),
],
)
@pytest.mark.parametrize("aggregation", ["min", "max"])
def test_min_max_ordered_empty(self, categories, expected, aggregation):
# GH 30227
cat = Categorical([], categories=categories, ordered=True)
agg_func = getattr(cat, aggregation)
result = agg_func()
assert result is expected
@pytest.mark.parametrize(
"values, categories",
[(["a", "b", "c", np.nan], list("cba")), ([1, 2, 3, np.nan], [3, 2, 1])],
)
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("function", ["min", "max"])
def test_min_max_with_nan(self, values, categories, function, skipna):
# GH 25303
cat = Categorical(values, categories=categories, ordered=True)
result = getattr(cat, function)(skipna=skipna)
if skipna is False:
assert result is np.nan
else:
expected = categories[0] if function == "min" else categories[2]
assert result == expected
@pytest.mark.parametrize("function", ["min", "max"])
@pytest.mark.parametrize("skipna", [True, False])
def test_min_max_only_nan(self, function, skipna):
# https://github.com/pandas-dev/pandas/issues/33450
cat = Categorical([np.nan], categories=[1, 2], ordered=True)
result = getattr(cat, function)(skipna=skipna)
assert result is np.nan
@pytest.mark.parametrize("method", ["min", "max"])
def test_numeric_only_min_max_raises(self, method):
# GH 25303
cat = Categorical(
[np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True
)
with pytest.raises(TypeError, match=".* got an unexpected keyword"):
getattr(cat, method)(numeric_only=True)
@pytest.mark.parametrize("method", ["min", "max"])
def test_numpy_min_max_raises(self, method):
cat = Categorical(["a", "b", "c", "b"], ordered=False)
msg = (
f"Categorical is not ordered for operation {method}\n"
"you can use .as_ordered() to change the Categorical to an ordered one"
)
method = getattr(np, method)
with pytest.raises(TypeError, match=re.escape(msg)):
method(cat)
@pytest.mark.parametrize("kwarg", ["axis", "out", "keepdims"])
@pytest.mark.parametrize("method", ["min", "max"])
def test_numpy_min_max_unsupported_kwargs_raises(self, method, kwarg):
cat = Categorical(["a", "b", "c", "b"], ordered=True)
msg = (
f"the '{kwarg}' parameter is not supported in the pandas implementation "
f"of {method}"
)
if kwarg == "axis":
msg = r"`axis` must be fewer than the number of dimensions \(1\)"
kwargs = {kwarg: 42}
method = getattr(np, method)
with pytest.raises(ValueError, match=msg):
method(cat, **kwargs)
@pytest.mark.parametrize("method, expected", [("min", "a"), ("max", "c")])
def test_numpy_min_max_axis_equals_none(self, method, expected):
cat = Categorical(["a", "b", "c", "b"], ordered=True)
method = getattr(np, method)
result = method(cat, axis=None)
assert result == expected
@pytest.mark.parametrize(
"values,categories,exp_mode",
[
([1, 1, 2, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5]),
([1, 1, 1, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5, 1]),
([1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [5, 4, 3, 2, 1]),
([np.nan, np.nan, np.nan, 4, 5], [5, 4, 3, 2, 1], [5, 4]),
([np.nan, np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]),
([np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]),
],
)
def test_mode(self, values, categories, exp_mode):
cat = Categorical(values, categories=categories, ordered=True)
res = Series(cat).mode()._values
exp = Categorical(exp_mode, categories=categories, ordered=True)
tm.assert_categorical_equal(res, exp)
def test_searchsorted(self, ordered):
# https://github.com/pandas-dev/pandas/issues/8420
# https://github.com/pandas-dev/pandas/issues/14522
cat = Categorical(
["cheese", "milk", "apple", "bread", "bread"],
categories=["cheese", "milk", "apple", "bread"],
ordered=ordered,
)
ser = Series(cat)
# Searching for single item argument, side='left' (default)
res_cat = cat.searchsorted("apple")
assert res_cat == 2
assert is_scalar(res_cat)
res_ser = ser.searchsorted("apple")
assert res_ser == 2
assert is_scalar(res_ser)
# Searching for single item array, side='left' (default)
res_cat = cat.searchsorted(["bread"])
res_ser = ser.searchsorted(["bread"])
exp = np.array([3], dtype=np.intp)
tm.assert_numpy_array_equal(res_cat, exp)
tm.assert_numpy_array_equal(res_ser, exp)
# Searching for several items array, side='right'
res_cat = cat.searchsorted(["apple", "bread"], side="right")
res_ser = ser.searchsorted(["apple", "bread"], side="right")
exp = np.array([3, 5], dtype=np.intp)
tm.assert_numpy_array_equal(res_cat, exp)
tm.assert_numpy_array_equal(res_ser, exp)
# Searching for a single value that is not from the Categorical
with pytest.raises(TypeError, match="cucumber"):
cat.searchsorted("cucumber")
with pytest.raises(TypeError, match="cucumber"):
ser.searchsorted("cucumber")
# Searching for multiple values one of each is not from the Categorical
msg = (
"Cannot setitem on a Categorical with a new category, "
"set the categories first"
)
with pytest.raises(TypeError, match=msg):
cat.searchsorted(["bread", "cucumber"])
with pytest.raises(TypeError, match=msg):
ser.searchsorted(["bread", "cucumber"])
def test_unique(self, ordered):
# GH38140
dtype = CategoricalDtype(["a", "b", "c"], ordered=ordered)
# categories are reordered based on value when ordered=False
cat = Categorical(["a", "b", "c"], dtype=dtype)
res = cat.unique()
tm.assert_categorical_equal(res, cat)
cat = Categorical(["a", "b", "a", "a"], dtype=dtype)
res = cat.unique()
tm.assert_categorical_equal(res, Categorical(["a", "b"], dtype=dtype))
cat = Categorical(["c", "a", "b", "a", "a"], dtype=dtype)
res = cat.unique()
exp_cat = Categorical(["c", "a", "b"], dtype=dtype)
tm.assert_categorical_equal(res, exp_cat)
# nan must be removed
cat = Categorical(["b", np.nan, "b", np.nan, "a"], dtype=dtype)
res = cat.unique()
exp_cat = Categorical(["b", np.nan, "a"], dtype=dtype)
tm.assert_categorical_equal(res, exp_cat)
def test_unique_index_series(self, ordered):
# GH38140
dtype = CategoricalDtype([3, 2, 1], ordered=ordered)
c = Categorical([3, 1, 2, 2, 1], dtype=dtype)
# Categorical.unique sorts categories by appearance order
# if ordered=False
exp = Categorical([3, 1, 2], dtype=dtype)
tm.assert_categorical_equal(c.unique(), exp)
tm.assert_index_equal(Index(c).unique(), Index(exp))
tm.assert_categorical_equal(Series(c).unique(), exp)
c = Categorical([1, 1, 2, 2], dtype=dtype)
exp = Categorical([1, 2], dtype=dtype)
tm.assert_categorical_equal(c.unique(), exp)
tm.assert_index_equal(Index(c).unique(), Index(exp))
tm.assert_categorical_equal(Series(c).unique(), exp)
def test_shift(self):
# GH 9416
cat = Categorical(["a", "b", "c", "d", "a"])
# shift forward
sp1 = cat.shift(1)
xp1 = Categorical([np.nan, "a", "b", "c", "d"])
tm.assert_categorical_equal(sp1, xp1)
tm.assert_categorical_equal(cat[:-1], sp1[1:])
# shift back
sn2 = cat.shift(-2)
xp2 = Categorical(
["c", "d", "a", np.nan, np.nan], categories=["a", "b", "c", "d"]
)
tm.assert_categorical_equal(sn2, xp2)
tm.assert_categorical_equal(cat[2:], sn2[:-2])
# shift by zero
tm.assert_categorical_equal(cat, cat.shift(0))
def test_nbytes(self):
cat = Categorical([1, 2, 3])
exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories
assert cat.nbytes == exp
def test_memory_usage(self):
cat = Categorical([1, 2, 3])
# .categories is an index, so we include the hashtable
assert 0 < cat.nbytes <= cat.memory_usage()
assert 0 < cat.nbytes <= cat.memory_usage(deep=True)
cat = Categorical(["foo", "foo", "bar"])
assert cat.memory_usage(deep=True) > cat.nbytes
if not PYPY:
# sys.getsizeof will call the .memory_usage with
# deep=True, and add on some GC overhead
diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
assert abs(diff) < 100
def test_map(self):
c = Categorical(list("ABABC"), categories=list("CBA"), ordered=True)
result = c.map(lambda x: x.lower(), na_action=None)
exp = Categorical(list("ababc"), categories=list("cba"), ordered=True)
tm.assert_categorical_equal(result, exp)
c = Categorical(list("ABABC"), categories=list("ABC"), ordered=False)
result = c.map(lambda x: x.lower(), na_action=None)
exp = Categorical(list("ababc"), categories=list("abc"), ordered=False)
tm.assert_categorical_equal(result, exp)
result = c.map(lambda x: 1, na_action=None)
# GH 12766: Return an index not an array
tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64)))
@pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0])
def test_validate_inplace_raises(self, value):
cat = Categorical(["A", "B", "B", "C", "A"])
msg = (
'For argument "inplace" expected type bool, '
f"received type {type(value).__name__}"
)
with pytest.raises(ValueError, match=msg):
cat.sort_values(inplace=value)
def test_quantile_empty(self):
# make sure we have correct itemsize on resulting codes
cat = Categorical(["A", "B"])
idx = Index([0.0, 0.5])
result = cat[:0]._quantile(idx, interpolation="linear")
assert result._codes.dtype == np.int8
expected = cat.take([-1, -1], allow_fill=True)
tm.assert_extension_array_equal(result, expected)

View File

@ -0,0 +1,501 @@
import re
import numpy as np
import pytest
from pandas.compat import PY311
from pandas import (
Categorical,
CategoricalIndex,
DataFrame,
Index,
Series,
StringDtype,
)
import pandas._testing as tm
from pandas.core.arrays.categorical import recode_for_categories
class TestCategoricalAPI:
def test_to_list_deprecated(self):
# GH#51254
cat1 = Categorical(list("acb"), ordered=False)
msg = "Categorical.to_list is deprecated and will be removed"
with tm.assert_produces_warning(FutureWarning, match=msg):
cat1.to_list()
def test_ordered_api(self):
# GH 9347
cat1 = Categorical(list("acb"), ordered=False)
tm.assert_index_equal(cat1.categories, Index(["a", "b", "c"]))
assert not cat1.ordered
cat2 = Categorical(list("acb"), categories=list("bca"), ordered=False)
tm.assert_index_equal(cat2.categories, Index(["b", "c", "a"]))
assert not cat2.ordered
cat3 = Categorical(list("acb"), ordered=True)
tm.assert_index_equal(cat3.categories, Index(["a", "b", "c"]))
assert cat3.ordered
cat4 = Categorical(list("acb"), categories=list("bca"), ordered=True)
tm.assert_index_equal(cat4.categories, Index(["b", "c", "a"]))
assert cat4.ordered
def test_set_ordered(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
cat2 = cat.as_unordered()
assert not cat2.ordered
cat2 = cat.as_ordered()
assert cat2.ordered
assert cat2.set_ordered(True).ordered
assert not cat2.set_ordered(False).ordered
# removed in 0.19.0
msg = (
"property 'ordered' of 'Categorical' object has no setter"
if PY311
else "can't set attribute"
)
with pytest.raises(AttributeError, match=msg):
cat.ordered = True
with pytest.raises(AttributeError, match=msg):
cat.ordered = False
def test_rename_categories(self):
cat = Categorical(["a", "b", "c", "a"])
# inplace=False: the old one must not be changed
res = cat.rename_categories([1, 2, 3])
tm.assert_numpy_array_equal(
res.__array__(), np.array([1, 2, 3, 1], dtype=np.int64)
)
tm.assert_index_equal(res.categories, Index([1, 2, 3]))
exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
tm.assert_numpy_array_equal(cat.__array__(), exp_cat)
exp_cat = Index(["a", "b", "c"])
tm.assert_index_equal(cat.categories, exp_cat)
# GH18862 (let rename_categories take callables)
result = cat.rename_categories(lambda x: x.upper())
expected = Categorical(["A", "B", "C", "A"])
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("new_categories", [[1, 2, 3, 4], [1, 2]])
def test_rename_categories_wrong_length_raises(self, new_categories):
cat = Categorical(["a", "b", "c", "a"])
msg = (
"new categories need to have the same number of items as the "
"old categories!"
)
with pytest.raises(ValueError, match=msg):
cat.rename_categories(new_categories)
def test_rename_categories_series(self):
# https://github.com/pandas-dev/pandas/issues/17981
c = Categorical(["a", "b"])
result = c.rename_categories(Series([0, 1], index=["a", "b"]))
expected = Categorical([0, 1])
tm.assert_categorical_equal(result, expected)
def test_rename_categories_dict(self):
# GH 17336
cat = Categorical(["a", "b", "c", "d"])
res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1})
expected = Index([4, 3, 2, 1])
tm.assert_index_equal(res.categories, expected)
# Test for dicts of smaller length
cat = Categorical(["a", "b", "c", "d"])
res = cat.rename_categories({"a": 1, "c": 3})
expected = Index([1, "b", 3, "d"])
tm.assert_index_equal(res.categories, expected)
# Test for dicts with bigger length
cat = Categorical(["a", "b", "c", "d"])
res = cat.rename_categories({"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6})
expected = Index([1, 2, 3, 4])
tm.assert_index_equal(res.categories, expected)
# Test for dicts with no items from old categories
cat = Categorical(["a", "b", "c", "d"])
res = cat.rename_categories({"f": 1, "g": 3})
expected = Index(["a", "b", "c", "d"])
tm.assert_index_equal(res.categories, expected)
def test_reorder_categories(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
old = cat.copy()
new = Categorical(
["a", "b", "c", "a"], categories=["c", "b", "a"], ordered=True
)
res = cat.reorder_categories(["c", "b", "a"])
# cat must be the same as before
tm.assert_categorical_equal(cat, old)
# only res is changed
tm.assert_categorical_equal(res, new)
@pytest.mark.parametrize(
"new_categories",
[
["a"], # not all "old" included in "new"
["a", "b", "d"], # still not all "old" in "new"
["a", "b", "c", "d"], # all "old" included in "new", but too long
],
)
def test_reorder_categories_raises(self, new_categories):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
msg = "items in new_categories are not the same as in old categories"
with pytest.raises(ValueError, match=msg):
cat.reorder_categories(new_categories)
def test_add_categories(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
old = cat.copy()
new = Categorical(
["a", "b", "c", "a"], categories=["a", "b", "c", "d"], ordered=True
)
res = cat.add_categories("d")
tm.assert_categorical_equal(cat, old)
tm.assert_categorical_equal(res, new)
res = cat.add_categories(["d"])
tm.assert_categorical_equal(cat, old)
tm.assert_categorical_equal(res, new)
# GH 9927
cat = Categorical(list("abc"), ordered=True)
expected = Categorical(list("abc"), categories=list("abcde"), ordered=True)
# test with Series, np.array, index, list
res = cat.add_categories(Series(["d", "e"]))
tm.assert_categorical_equal(res, expected)
res = cat.add_categories(np.array(["d", "e"]))
tm.assert_categorical_equal(res, expected)
res = cat.add_categories(Index(["d", "e"]))
tm.assert_categorical_equal(res, expected)
res = cat.add_categories(["d", "e"])
tm.assert_categorical_equal(res, expected)
def test_add_categories_existing_raises(self):
# new is in old categories
cat = Categorical(["a", "b", "c", "d"], ordered=True)
msg = re.escape("new categories must not include old categories: {'d'}")
with pytest.raises(ValueError, match=msg):
cat.add_categories(["d"])
def test_add_categories_losing_dtype_information(self):
# GH#48812
cat = Categorical(Series([1, 2], dtype="Int64"))
ser = Series([4], dtype="Int64")
result = cat.add_categories(ser)
expected = Categorical(
Series([1, 2], dtype="Int64"), categories=Series([1, 2, 4], dtype="Int64")
)
tm.assert_categorical_equal(result, expected)
cat = Categorical(Series(["a", "b", "a"], dtype=StringDtype()))
ser = Series(["d"], dtype=StringDtype())
result = cat.add_categories(ser)
expected = Categorical(
Series(["a", "b", "a"], dtype=StringDtype()),
categories=Series(["a", "b", "d"], dtype=StringDtype()),
)
tm.assert_categorical_equal(result, expected)
def test_set_categories(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
exp_categories = Index(["c", "b", "a"])
exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_)
cat = cat.set_categories(["c", "b", "a"])
res = cat.set_categories(["a", "b", "c"])
# cat must be the same as before
tm.assert_index_equal(cat.categories, exp_categories)
tm.assert_numpy_array_equal(cat.__array__(), exp_values)
# only res is changed
exp_categories_back = Index(["a", "b", "c"])
tm.assert_index_equal(res.categories, exp_categories_back)
tm.assert_numpy_array_equal(res.__array__(), exp_values)
# not all "old" included in "new" -> all not included ones are now
# np.nan
cat = Categorical(["a", "b", "c", "a"], ordered=True)
res = cat.set_categories(["a"])
tm.assert_numpy_array_equal(res.codes, np.array([0, -1, -1, 0], dtype=np.int8))
# still not all "old" in "new"
res = cat.set_categories(["a", "b", "d"])
tm.assert_numpy_array_equal(res.codes, np.array([0, 1, -1, 0], dtype=np.int8))
tm.assert_index_equal(res.categories, Index(["a", "b", "d"]))
# all "old" included in "new"
cat = cat.set_categories(["a", "b", "c", "d"])
exp_categories = Index(["a", "b", "c", "d"])
tm.assert_index_equal(cat.categories, exp_categories)
# internals...
c = Categorical([1, 2, 3, 4, 1], categories=[1, 2, 3, 4], ordered=True)
tm.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 3, 0], dtype=np.int8))
tm.assert_index_equal(c.categories, Index([1, 2, 3, 4]))
exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
tm.assert_numpy_array_equal(np.asarray(c), exp)
# all "pointers" to '4' must be changed from 3 to 0,...
c = c.set_categories([4, 3, 2, 1])
# positions are changed
tm.assert_numpy_array_equal(c._codes, np.array([3, 2, 1, 0, 3], dtype=np.int8))
# categories are now in new order
tm.assert_index_equal(c.categories, Index([4, 3, 2, 1]))
# output is the same
exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
tm.assert_numpy_array_equal(np.asarray(c), exp)
assert c.min() == 4
assert c.max() == 1
# set_categories should set the ordering if specified
c2 = c.set_categories([4, 3, 2, 1], ordered=False)
assert not c2.ordered
tm.assert_numpy_array_equal(np.asarray(c), np.asarray(c2))
# set_categories should pass thru the ordering
c2 = c.set_ordered(False).set_categories([4, 3, 2, 1])
assert not c2.ordered
tm.assert_numpy_array_equal(np.asarray(c), np.asarray(c2))
@pytest.mark.parametrize(
"values, categories, new_categories",
[
# No NaNs, same cats, same order
(["a", "b", "a"], ["a", "b"], ["a", "b"]),
# No NaNs, same cats, different order
(["a", "b", "a"], ["a", "b"], ["b", "a"]),
# Same, unsorted
(["b", "a", "a"], ["a", "b"], ["a", "b"]),
# No NaNs, same cats, different order
(["b", "a", "a"], ["a", "b"], ["b", "a"]),
# NaNs
(["a", "b", "c"], ["a", "b"], ["a", "b"]),
(["a", "b", "c"], ["a", "b"], ["b", "a"]),
(["b", "a", "c"], ["a", "b"], ["a", "b"]),
(["b", "a", "c"], ["a", "b"], ["a", "b"]),
# Introduce NaNs
(["a", "b", "c"], ["a", "b"], ["a"]),
(["a", "b", "c"], ["a", "b"], ["b"]),
(["b", "a", "c"], ["a", "b"], ["a"]),
(["b", "a", "c"], ["a", "b"], ["a"]),
# No overlap
(["a", "b", "c"], ["a", "b"], ["d", "e"]),
],
)
@pytest.mark.parametrize("ordered", [True, False])
def test_set_categories_many(self, values, categories, new_categories, ordered):
c = Categorical(values, categories)
expected = Categorical(values, new_categories, ordered)
result = c.set_categories(new_categories, ordered=ordered)
tm.assert_categorical_equal(result, expected)
def test_set_categories_rename_less(self):
# GH 24675
cat = Categorical(["A", "B"])
result = cat.set_categories(["A"], rename=True)
expected = Categorical(["A", np.nan])
tm.assert_categorical_equal(result, expected)
def test_set_categories_private(self):
cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"])
cat._set_categories(["a", "c", "d", "e"])
expected = Categorical(["a", "c", "d"], categories=list("acde"))
tm.assert_categorical_equal(cat, expected)
# fastpath
cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"])
cat._set_categories(["a", "c", "d", "e"], fastpath=True)
expected = Categorical(["a", "c", "d"], categories=list("acde"))
tm.assert_categorical_equal(cat, expected)
def test_remove_categories(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
old = cat.copy()
new = Categorical(["a", "b", np.nan, "a"], categories=["a", "b"], ordered=True)
res = cat.remove_categories("c")
tm.assert_categorical_equal(cat, old)
tm.assert_categorical_equal(res, new)
res = cat.remove_categories(["c"])
tm.assert_categorical_equal(cat, old)
tm.assert_categorical_equal(res, new)
@pytest.mark.parametrize("removals", [["c"], ["c", np.nan], "c", ["c", "c"]])
def test_remove_categories_raises(self, removals):
cat = Categorical(["a", "b", "a"])
message = re.escape("removals must all be in old categories: {'c'}")
with pytest.raises(ValueError, match=message):
cat.remove_categories(removals)
def test_remove_unused_categories(self):
c = Categorical(["a", "b", "c", "d", "a"], categories=["a", "b", "c", "d", "e"])
exp_categories_all = Index(["a", "b", "c", "d", "e"])
exp_categories_dropped = Index(["a", "b", "c", "d"])
tm.assert_index_equal(c.categories, exp_categories_all)
res = c.remove_unused_categories()
tm.assert_index_equal(res.categories, exp_categories_dropped)
tm.assert_index_equal(c.categories, exp_categories_all)
# with NaN values (GH11599)
c = Categorical(["a", "b", "c", np.nan], categories=["a", "b", "c", "d", "e"])
res = c.remove_unused_categories()
tm.assert_index_equal(res.categories, Index(np.array(["a", "b", "c"])))
exp_codes = np.array([0, 1, 2, -1], dtype=np.int8)
tm.assert_numpy_array_equal(res.codes, exp_codes)
tm.assert_index_equal(c.categories, exp_categories_all)
val = ["F", np.nan, "D", "B", "D", "F", np.nan]
cat = Categorical(values=val, categories=list("ABCDEFG"))
out = cat.remove_unused_categories()
tm.assert_index_equal(out.categories, Index(["B", "D", "F"]))
exp_codes = np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8)
tm.assert_numpy_array_equal(out.codes, exp_codes)
assert out.tolist() == val
alpha = list("abcdefghijklmnopqrstuvwxyz")
val = np.random.default_rng(2).choice(alpha[::2], 10000).astype("object")
val[np.random.default_rng(2).choice(len(val), 100)] = np.nan
cat = Categorical(values=val, categories=alpha)
out = cat.remove_unused_categories()
assert out.tolist() == val.tolist()
class TestCategoricalAPIWithFactor:
def test_describe(self):
factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True)
# string type
desc = factor.describe()
assert factor.ordered
exp_index = CategoricalIndex(
["a", "b", "c"], name="categories", ordered=factor.ordered
)
expected = DataFrame(
{"counts": [3, 2, 3], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0]}, index=exp_index
)
tm.assert_frame_equal(desc, expected)
# check unused categories
cat = factor.copy()
cat = cat.set_categories(["a", "b", "c", "d"])
desc = cat.describe()
exp_index = CategoricalIndex(
list("abcd"), ordered=factor.ordered, name="categories"
)
expected = DataFrame(
{"counts": [3, 2, 3, 0], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0, 0]},
index=exp_index,
)
tm.assert_frame_equal(desc, expected)
# check an integer one
cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1])
desc = cat.describe()
exp_index = CategoricalIndex([1, 2, 3], ordered=cat.ordered, name="categories")
expected = DataFrame(
{"counts": [5, 3, 3], "freqs": [5 / 11.0, 3 / 11.0, 3 / 11.0]},
index=exp_index,
)
tm.assert_frame_equal(desc, expected)
# https://github.com/pandas-dev/pandas/issues/3678
# describe should work with NaN
cat = Categorical([np.nan, 1, 2, 2])
desc = cat.describe()
expected = DataFrame(
{"counts": [1, 2, 1], "freqs": [1 / 4.0, 2 / 4.0, 1 / 4.0]},
index=CategoricalIndex(
[1, 2, np.nan], categories=[1, 2], name="categories"
),
)
tm.assert_frame_equal(desc, expected)
class TestPrivateCategoricalAPI:
def test_codes_immutable(self):
# Codes should be read only
c = Categorical(["a", "b", "c", "a", np.nan])
exp = np.array([0, 1, 2, 0, -1], dtype="int8")
tm.assert_numpy_array_equal(c.codes, exp)
# Assignments to codes should raise
msg = (
"property 'codes' of 'Categorical' object has no setter"
if PY311
else "can't set attribute"
)
with pytest.raises(AttributeError, match=msg):
c.codes = np.array([0, 1, 2, 0, 1], dtype="int8")
# changes in the codes array should raise
codes = c.codes
with pytest.raises(ValueError, match="assignment destination is read-only"):
codes[4] = 1
# But even after getting the codes, the original array should still be
# writeable!
c[4] = "a"
exp = np.array([0, 1, 2, 0, 0], dtype="int8")
tm.assert_numpy_array_equal(c.codes, exp)
c._codes[4] = 2
exp = np.array([0, 1, 2, 0, 2], dtype="int8")
tm.assert_numpy_array_equal(c.codes, exp)
@pytest.mark.parametrize(
"codes, old, new, expected",
[
([0, 1], ["a", "b"], ["a", "b"], [0, 1]),
([0, 1], ["b", "a"], ["b", "a"], [0, 1]),
([0, 1], ["a", "b"], ["b", "a"], [1, 0]),
([0, 1], ["b", "a"], ["a", "b"], [1, 0]),
([0, 1, 0, 1], ["a", "b"], ["a", "b", "c"], [0, 1, 0, 1]),
([0, 1, 2, 2], ["a", "b", "c"], ["a", "b"], [0, 1, -1, -1]),
([0, 1, -1], ["a", "b", "c"], ["a", "b", "c"], [0, 1, -1]),
([0, 1, -1], ["a", "b", "c"], ["b"], [-1, 0, -1]),
([0, 1, -1], ["a", "b", "c"], ["d"], [-1, -1, -1]),
([0, 1, -1], ["a", "b", "c"], [], [-1, -1, -1]),
([-1, -1], [], ["a", "b"], [-1, -1]),
([1, 0], ["b", "a"], ["a", "b"], [0, 1]),
],
)
def test_recode_to_categories(self, codes, old, new, expected):
codes = np.asanyarray(codes, dtype=np.int8)
expected = np.asanyarray(expected, dtype=np.int8)
old = Index(old)
new = Index(new)
result = recode_for_categories(codes, old, new)
tm.assert_numpy_array_equal(result, expected)
def test_recode_to_categories_large(self):
N = 1000
codes = np.arange(N)
old = Index(codes)
expected = np.arange(N - 1, -1, -1, dtype=np.int16)
new = Index(expected)
result = recode_for_categories(codes, old, new)
tm.assert_numpy_array_equal(result, expected)

View File

@ -0,0 +1,155 @@
import numpy as np
import pytest
from pandas import (
Categorical,
CategoricalDtype,
CategoricalIndex,
DatetimeIndex,
Interval,
NaT,
Period,
Timestamp,
array,
to_datetime,
)
import pandas._testing as tm
class TestAstype:
@pytest.mark.parametrize("cls", [Categorical, CategoricalIndex])
@pytest.mark.parametrize("values", [[1, np.nan], [Timestamp("2000"), NaT]])
def test_astype_nan_to_int(self, cls, values):
# GH#28406
obj = cls(values)
msg = "Cannot (cast|convert)"
with pytest.raises((ValueError, TypeError), match=msg):
obj.astype(int)
@pytest.mark.parametrize(
"expected",
[
array(["2019", "2020"], dtype="datetime64[ns, UTC]"),
array([0, 0], dtype="timedelta64[ns]"),
array([Period("2019"), Period("2020")], dtype="period[Y-DEC]"),
array([Interval(0, 1), Interval(1, 2)], dtype="interval"),
array([1, np.nan], dtype="Int64"),
],
)
def test_astype_category_to_extension_dtype(self, expected):
# GH#28668
result = expected.astype("category").astype(expected.dtype)
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize(
"dtype, expected",
[
(
"datetime64[ns]",
np.array(["2015-01-01T00:00:00.000000000"], dtype="datetime64[ns]"),
),
(
"datetime64[ns, MET]",
DatetimeIndex([Timestamp("2015-01-01 00:00:00+0100", tz="MET")]).array,
),
],
)
def test_astype_to_datetime64(self, dtype, expected):
# GH#28448
result = Categorical(["2015-01-01"]).astype(dtype)
assert result == expected
def test_astype_str_int_categories_to_nullable_int(self):
# GH#39616
dtype = CategoricalDtype([str(i) for i in range(5)])
codes = np.random.default_rng(2).integers(5, size=20)
arr = Categorical.from_codes(codes, dtype=dtype)
res = arr.astype("Int64")
expected = array(codes, dtype="Int64")
tm.assert_extension_array_equal(res, expected)
def test_astype_str_int_categories_to_nullable_float(self):
# GH#39616
dtype = CategoricalDtype([str(i / 2) for i in range(5)])
codes = np.random.default_rng(2).integers(5, size=20)
arr = Categorical.from_codes(codes, dtype=dtype)
res = arr.astype("Float64")
expected = array(codes, dtype="Float64") / 2
tm.assert_extension_array_equal(res, expected)
@pytest.mark.parametrize("ordered", [True, False])
def test_astype(self, ordered):
# string
cat = Categorical(list("abbaaccc"), ordered=ordered)
result = cat.astype(object)
expected = np.array(cat)
tm.assert_numpy_array_equal(result, expected)
msg = r"Cannot cast object|string dtype to float64"
with pytest.raises(ValueError, match=msg):
cat.astype(float)
# numeric
cat = Categorical([0, 1, 2, 2, 1, 0, 1, 0, 2], ordered=ordered)
result = cat.astype(object)
expected = np.array(cat, dtype=object)
tm.assert_numpy_array_equal(result, expected)
result = cat.astype(int)
expected = np.array(cat, dtype="int")
tm.assert_numpy_array_equal(result, expected)
result = cat.astype(float)
expected = np.array(cat, dtype=float)
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("dtype_ordered", [True, False])
@pytest.mark.parametrize("cat_ordered", [True, False])
def test_astype_category(self, dtype_ordered, cat_ordered):
# GH#10696/GH#18593
data = list("abcaacbab")
cat = Categorical(data, categories=list("bac"), ordered=cat_ordered)
# standard categories
dtype = CategoricalDtype(ordered=dtype_ordered)
result = cat.astype(dtype)
expected = Categorical(data, categories=cat.categories, ordered=dtype_ordered)
tm.assert_categorical_equal(result, expected)
# non-standard categories
dtype = CategoricalDtype(list("adc"), dtype_ordered)
result = cat.astype(dtype)
expected = Categorical(data, dtype=dtype)
tm.assert_categorical_equal(result, expected)
if dtype_ordered is False:
# dtype='category' can't specify ordered, so only test once
result = cat.astype("category")
expected = cat
tm.assert_categorical_equal(result, expected)
def test_astype_object_datetime_categories(self):
# GH#40754
cat = Categorical(to_datetime(["2021-03-27", NaT]))
result = cat.astype(object)
expected = np.array([Timestamp("2021-03-27 00:00:00"), NaT], dtype="object")
tm.assert_numpy_array_equal(result, expected)
def test_astype_object_timestamp_categories(self):
# GH#18024
cat = Categorical([Timestamp("2014-01-01")])
result = cat.astype(object)
expected = np.array([Timestamp("2014-01-01 00:00:00")], dtype="object")
tm.assert_numpy_array_equal(result, expected)
def test_astype_category_readonly_mask_values(self):
# GH#53658
arr = array([0, 1, 2], dtype="Int64")
arr._mask.flags["WRITEABLE"] = False
result = arr.astype("category")
expected = array([0, 1, 2], dtype="Int64").astype("category")
tm.assert_extension_array_equal(result, expected)

View File

@ -0,0 +1,783 @@
from datetime import (
date,
datetime,
)
import numpy as np
import pytest
from pandas._config import using_pyarrow_string_dtype
from pandas.core.dtypes.common import (
is_float_dtype,
is_integer_dtype,
)
from pandas.core.dtypes.dtypes import CategoricalDtype
import pandas as pd
from pandas import (
Categorical,
CategoricalIndex,
DatetimeIndex,
Index,
Interval,
IntervalIndex,
MultiIndex,
NaT,
Series,
Timestamp,
date_range,
period_range,
timedelta_range,
)
import pandas._testing as tm
class TestCategoricalConstructors:
def test_fastpath_deprecated(self):
codes = np.array([1, 2, 3])
dtype = CategoricalDtype(categories=["a", "b", "c", "d"], ordered=False)
msg = "The 'fastpath' keyword in Categorical is deprecated"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
Categorical(codes, dtype=dtype, fastpath=True)
def test_categorical_from_cat_and_dtype_str_preserve_ordered(self):
# GH#49309 we should preserve orderedness in `res`
cat = Categorical([3, 1], categories=[3, 2, 1], ordered=True)
res = Categorical(cat, dtype="category")
assert res.dtype.ordered
def test_categorical_disallows_scalar(self):
# GH#38433
with pytest.raises(TypeError, match="Categorical input must be list-like"):
Categorical("A", categories=["A", "B"])
def test_categorical_1d_only(self):
# ndim > 1
msg = "> 1 ndim Categorical are not supported at this time"
with pytest.raises(NotImplementedError, match=msg):
Categorical(np.array([list("abcd")]))
def test_validate_ordered(self):
# see gh-14058
exp_msg = "'ordered' must either be 'True' or 'False'"
exp_err = TypeError
# This should be a boolean.
ordered = np.array([0, 1, 2])
with pytest.raises(exp_err, match=exp_msg):
Categorical([1, 2, 3], ordered=ordered)
with pytest.raises(exp_err, match=exp_msg):
Categorical.from_codes(
[0, 0, 1], categories=["a", "b", "c"], ordered=ordered
)
def test_constructor_empty(self):
# GH 17248
c = Categorical([])
expected = Index([])
tm.assert_index_equal(c.categories, expected)
c = Categorical([], categories=[1, 2, 3])
expected = Index([1, 2, 3], dtype=np.int64)
tm.assert_index_equal(c.categories, expected)
def test_constructor_empty_boolean(self):
# see gh-22702
cat = Categorical([], categories=[True, False])
categories = sorted(cat.categories.tolist())
assert categories == [False, True]
def test_constructor_tuples(self):
values = np.array([(1,), (1, 2), (1,), (1, 2)], dtype=object)
result = Categorical(values)
expected = Index([(1,), (1, 2)], tupleize_cols=False)
tm.assert_index_equal(result.categories, expected)
assert result.ordered is False
def test_constructor_tuples_datetimes(self):
# numpy will auto reshape when all of the tuples are the
# same len, so add an extra one with 2 items and slice it off
values = np.array(
[
(Timestamp("2010-01-01"),),
(Timestamp("2010-01-02"),),
(Timestamp("2010-01-01"),),
(Timestamp("2010-01-02"),),
("a", "b"),
],
dtype=object,
)[:-1]
result = Categorical(values)
expected = Index(
[(Timestamp("2010-01-01"),), (Timestamp("2010-01-02"),)],
tupleize_cols=False,
)
tm.assert_index_equal(result.categories, expected)
def test_constructor_unsortable(self):
# it works!
arr = np.array([1, 2, 3, datetime.now()], dtype="O")
factor = Categorical(arr, ordered=False)
assert not factor.ordered
# this however will raise as cannot be sorted
msg = (
"'values' is not ordered, please explicitly specify the "
"categories order by passing in a categories argument."
)
with pytest.raises(TypeError, match=msg):
Categorical(arr, ordered=True)
def test_constructor_interval(self):
result = Categorical(
[Interval(1, 2), Interval(2, 3), Interval(3, 6)], ordered=True
)
ii = IntervalIndex([Interval(1, 2), Interval(2, 3), Interval(3, 6)])
exp = Categorical(ii, ordered=True)
tm.assert_categorical_equal(result, exp)
tm.assert_index_equal(result.categories, ii)
def test_constructor(self):
exp_arr = np.array(["a", "b", "c", "a", "b", "c"], dtype=np.object_)
c1 = Categorical(exp_arr)
tm.assert_numpy_array_equal(c1.__array__(), exp_arr)
c2 = Categorical(exp_arr, categories=["a", "b", "c"])
tm.assert_numpy_array_equal(c2.__array__(), exp_arr)
c2 = Categorical(exp_arr, categories=["c", "b", "a"])
tm.assert_numpy_array_equal(c2.__array__(), exp_arr)
# categories must be unique
msg = "Categorical categories must be unique"
with pytest.raises(ValueError, match=msg):
Categorical([1, 2], [1, 2, 2])
with pytest.raises(ValueError, match=msg):
Categorical(["a", "b"], ["a", "b", "b"])
# The default should be unordered
c1 = Categorical(["a", "b", "c", "a"])
assert not c1.ordered
# Categorical as input
c1 = Categorical(["a", "b", "c", "a"])
c2 = Categorical(c1)
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
c2 = Categorical(c1)
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
c2 = Categorical(c1)
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
c2 = Categorical(c1, categories=["a", "b", "c"])
tm.assert_numpy_array_equal(c1.__array__(), c2.__array__())
tm.assert_index_equal(c2.categories, Index(["a", "b", "c"]))
# Series of dtype category
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
c2 = Categorical(Series(c1))
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
c2 = Categorical(Series(c1))
tm.assert_categorical_equal(c1, c2)
# Series
c1 = Categorical(["a", "b", "c", "a"])
c2 = Categorical(Series(["a", "b", "c", "a"]))
tm.assert_categorical_equal(c1, c2)
c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
c2 = Categorical(Series(["a", "b", "c", "a"]), categories=["a", "b", "c", "d"])
tm.assert_categorical_equal(c1, c2)
# This should result in integer categories, not float!
cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
assert is_integer_dtype(cat.categories)
# https://github.com/pandas-dev/pandas/issues/3678
cat = Categorical([np.nan, 1, 2, 3])
assert is_integer_dtype(cat.categories)
# this should result in floats
cat = Categorical([np.nan, 1, 2.0, 3])
assert is_float_dtype(cat.categories)
cat = Categorical([np.nan, 1.0, 2.0, 3.0])
assert is_float_dtype(cat.categories)
# This doesn't work -> this would probably need some kind of "remember
# the original type" feature to try to cast the array interface result
# to...
# vals = np.asarray(cat[cat.notna()])
# assert is_integer_dtype(vals)
# corner cases
cat = Categorical([1])
assert len(cat.categories) == 1
assert cat.categories[0] == 1
assert len(cat.codes) == 1
assert cat.codes[0] == 0
cat = Categorical(["a"])
assert len(cat.categories) == 1
assert cat.categories[0] == "a"
assert len(cat.codes) == 1
assert cat.codes[0] == 0
# two arrays
# - when the first is an integer dtype and the second is not
# - when the resulting codes are all -1/NaN
with tm.assert_produces_warning(None):
Categorical([0, 1, 2, 0, 1, 2], categories=["a", "b", "c"])
with tm.assert_produces_warning(None):
Categorical([0, 1, 2, 0, 1, 2], categories=[3, 4, 5])
# the next one are from the old docs
with tm.assert_produces_warning(None):
Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3])
cat = Categorical([1, 2], categories=[1, 2, 3])
# this is a legitimate constructor
with tm.assert_produces_warning(None):
Categorical(np.array([], dtype="int64"), categories=[3, 2, 1], ordered=True)
def test_constructor_with_existing_categories(self):
# GH25318: constructing with pd.Series used to bogusly skip recoding
# categories
c0 = Categorical(["a", "b", "c", "a"])
c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"])
c2 = Categorical(c0, categories=c1.categories)
tm.assert_categorical_equal(c1, c2)
c3 = Categorical(Series(c0), categories=c1.categories)
tm.assert_categorical_equal(c1, c3)
def test_constructor_not_sequence(self):
# https://github.com/pandas-dev/pandas/issues/16022
msg = r"^Parameter 'categories' must be list-like, was"
with pytest.raises(TypeError, match=msg):
Categorical(["a", "b"], categories="a")
def test_constructor_with_null(self):
# Cannot have NaN in categories
msg = "Categorical categories cannot be null"
with pytest.raises(ValueError, match=msg):
Categorical([np.nan, "a", "b", "c"], categories=[np.nan, "a", "b", "c"])
with pytest.raises(ValueError, match=msg):
Categorical([None, "a", "b", "c"], categories=[None, "a", "b", "c"])
with pytest.raises(ValueError, match=msg):
Categorical(
DatetimeIndex(["nat", "20160101"]),
categories=[NaT, Timestamp("20160101")],
)
def test_constructor_with_index(self):
ci = CategoricalIndex(list("aabbca"), categories=list("cab"))
tm.assert_categorical_equal(ci.values, Categorical(ci))
ci = CategoricalIndex(list("aabbca"), categories=list("cab"))
tm.assert_categorical_equal(
ci.values, Categorical(ci.astype(object), categories=ci.categories)
)
def test_constructor_with_generator(self):
# This was raising an Error in isna(single_val).any() because isna
# returned a scalar for a generator
exp = Categorical([0, 1, 2])
cat = Categorical(x for x in [0, 1, 2])
tm.assert_categorical_equal(cat, exp)
cat = Categorical(range(3))
tm.assert_categorical_equal(cat, exp)
MultiIndex.from_product([range(5), ["a", "b", "c"]])
# check that categories accept generators and sequences
cat = Categorical([0, 1, 2], categories=(x for x in [0, 1, 2]))
tm.assert_categorical_equal(cat, exp)
cat = Categorical([0, 1, 2], categories=range(3))
tm.assert_categorical_equal(cat, exp)
def test_constructor_with_rangeindex(self):
# RangeIndex is preserved in Categories
rng = Index(range(3))
cat = Categorical(rng)
tm.assert_index_equal(cat.categories, rng, exact=True)
cat = Categorical([1, 2, 0], categories=rng)
tm.assert_index_equal(cat.categories, rng, exact=True)
@pytest.mark.parametrize(
"dtl",
[
date_range("1995-01-01 00:00:00", periods=5, freq="s"),
date_range("1995-01-01 00:00:00", periods=5, freq="s", tz="US/Eastern"),
timedelta_range("1 day", periods=5, freq="s"),
],
)
def test_constructor_with_datetimelike(self, dtl):
# see gh-12077
# constructor with a datetimelike and NaT
s = Series(dtl)
c = Categorical(s)
expected = type(dtl)(s)
expected._data.freq = None
tm.assert_index_equal(c.categories, expected)
tm.assert_numpy_array_equal(c.codes, np.arange(5, dtype="int8"))
# with NaT
s2 = s.copy()
s2.iloc[-1] = NaT
c = Categorical(s2)
expected = type(dtl)(s2.dropna())
expected._data.freq = None
tm.assert_index_equal(c.categories, expected)
exp = np.array([0, 1, 2, 3, -1], dtype=np.int8)
tm.assert_numpy_array_equal(c.codes, exp)
result = repr(c)
assert "NaT" in result
def test_constructor_from_index_series_datetimetz(self):
idx = date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern")
idx = idx._with_freq(None) # freq not preserved in result.categories
result = Categorical(idx)
tm.assert_index_equal(result.categories, idx)
result = Categorical(Series(idx))
tm.assert_index_equal(result.categories, idx)
def test_constructor_date_objects(self):
# we dont cast date objects to timestamps, matching Index constructor
v = date.today()
cat = Categorical([v, v])
assert cat.categories.dtype == object
assert type(cat.categories[0]) is date
def test_constructor_from_index_series_timedelta(self):
idx = timedelta_range("1 days", freq="D", periods=3)
idx = idx._with_freq(None) # freq not preserved in result.categories
result = Categorical(idx)
tm.assert_index_equal(result.categories, idx)
result = Categorical(Series(idx))
tm.assert_index_equal(result.categories, idx)
def test_constructor_from_index_series_period(self):
idx = period_range("2015-01-01", freq="D", periods=3)
result = Categorical(idx)
tm.assert_index_equal(result.categories, idx)
result = Categorical(Series(idx))
tm.assert_index_equal(result.categories, idx)
@pytest.mark.parametrize(
"values",
[
np.array([1.0, 1.2, 1.8, np.nan]),
np.array([1, 2, 3], dtype="int64"),
["a", "b", "c", np.nan],
[pd.Period("2014-01"), pd.Period("2014-02"), NaT],
[Timestamp("2014-01-01"), Timestamp("2014-01-02"), NaT],
[
Timestamp("2014-01-01", tz="US/Eastern"),
Timestamp("2014-01-02", tz="US/Eastern"),
NaT,
],
],
)
def test_constructor_invariant(self, values):
# GH 14190
c = Categorical(values)
c2 = Categorical(c)
tm.assert_categorical_equal(c, c2)
@pytest.mark.parametrize("ordered", [True, False])
def test_constructor_with_dtype(self, ordered):
categories = ["b", "a", "c"]
dtype = CategoricalDtype(categories, ordered=ordered)
result = Categorical(["a", "b", "a", "c"], dtype=dtype)
expected = Categorical(
["a", "b", "a", "c"], categories=categories, ordered=ordered
)
tm.assert_categorical_equal(result, expected)
assert result.ordered is ordered
def test_constructor_dtype_and_others_raises(self):
dtype = CategoricalDtype(["a", "b"], ordered=True)
msg = "Cannot specify `categories` or `ordered` together with `dtype`."
with pytest.raises(ValueError, match=msg):
Categorical(["a", "b"], categories=["a", "b"], dtype=dtype)
with pytest.raises(ValueError, match=msg):
Categorical(["a", "b"], ordered=True, dtype=dtype)
with pytest.raises(ValueError, match=msg):
Categorical(["a", "b"], ordered=False, dtype=dtype)
@pytest.mark.parametrize("categories", [None, ["a", "b"], ["a", "c"]])
@pytest.mark.parametrize("ordered", [True, False])
def test_constructor_str_category(self, categories, ordered):
result = Categorical(
["a", "b"], categories=categories, ordered=ordered, dtype="category"
)
expected = Categorical(["a", "b"], categories=categories, ordered=ordered)
tm.assert_categorical_equal(result, expected)
def test_constructor_str_unknown(self):
with pytest.raises(ValueError, match="Unknown dtype"):
Categorical([1, 2], dtype="foo")
@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="Can't be NumPy strings")
def test_constructor_np_strs(self):
# GH#31499 Hashtable.map_locations needs to work on np.str_ objects
cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")])
assert all(isinstance(x, np.str_) for x in cat.categories)
def test_constructor_from_categorical_with_dtype(self):
dtype = CategoricalDtype(["a", "b", "c"], ordered=True)
values = Categorical(["a", "b", "d"])
result = Categorical(values, dtype=dtype)
# We use dtype.categories, not values.categories
expected = Categorical(
["a", "b", "d"], categories=["a", "b", "c"], ordered=True
)
tm.assert_categorical_equal(result, expected)
def test_constructor_from_categorical_with_unknown_dtype(self):
dtype = CategoricalDtype(None, ordered=True)
values = Categorical(["a", "b", "d"])
result = Categorical(values, dtype=dtype)
# We use values.categories, not dtype.categories
expected = Categorical(
["a", "b", "d"], categories=["a", "b", "d"], ordered=True
)
tm.assert_categorical_equal(result, expected)
def test_constructor_from_categorical_string(self):
values = Categorical(["a", "b", "d"])
# use categories, ordered
result = Categorical(
values, categories=["a", "b", "c"], ordered=True, dtype="category"
)
expected = Categorical(
["a", "b", "d"], categories=["a", "b", "c"], ordered=True
)
tm.assert_categorical_equal(result, expected)
# No string
result = Categorical(values, categories=["a", "b", "c"], ordered=True)
tm.assert_categorical_equal(result, expected)
def test_constructor_with_categorical_categories(self):
# GH17884
expected = Categorical(["a", "b"], categories=["a", "b", "c"])
result = Categorical(["a", "b"], categories=Categorical(["a", "b", "c"]))
tm.assert_categorical_equal(result, expected)
result = Categorical(["a", "b"], categories=CategoricalIndex(["a", "b", "c"]))
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("klass", [lambda x: np.array(x, dtype=object), list])
def test_construction_with_null(self, klass, nulls_fixture):
# https://github.com/pandas-dev/pandas/issues/31927
values = klass(["a", nulls_fixture, "b"])
result = Categorical(values)
dtype = CategoricalDtype(["a", "b"])
codes = [0, -1, 1]
expected = Categorical.from_codes(codes=codes, dtype=dtype)
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("validate", [True, False])
def test_from_codes_nullable_int_categories(self, any_numeric_ea_dtype, validate):
# GH#39649
cats = pd.array(range(5), dtype=any_numeric_ea_dtype)
codes = np.random.default_rng(2).integers(5, size=3)
dtype = CategoricalDtype(cats)
arr = Categorical.from_codes(codes, dtype=dtype, validate=validate)
assert arr.categories.dtype == cats.dtype
tm.assert_index_equal(arr.categories, Index(cats))
def test_from_codes_empty(self):
cat = ["a", "b", "c"]
result = Categorical.from_codes([], categories=cat)
expected = Categorical([], categories=cat)
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("validate", [True, False])
def test_from_codes_validate(self, validate):
# GH53122
dtype = CategoricalDtype(["a", "b"])
if validate:
with pytest.raises(ValueError, match="codes need to be between "):
Categorical.from_codes([4, 5], dtype=dtype, validate=validate)
else:
# passes, though has incorrect codes, but that's the user responsibility
Categorical.from_codes([4, 5], dtype=dtype, validate=validate)
def test_from_codes_too_few_categories(self):
dtype = CategoricalDtype(categories=[1, 2])
msg = "codes need to be between "
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([1, 2], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([1, 2], dtype=dtype)
def test_from_codes_non_int_codes(self):
dtype = CategoricalDtype(categories=[1, 2])
msg = "codes need to be array-like integers"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(["a"], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(["a"], dtype=dtype)
def test_from_codes_non_unique_categories(self):
with pytest.raises(ValueError, match="Categorical categories must be unique"):
Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"])
def test_from_codes_nan_cat_included(self):
with pytest.raises(ValueError, match="Categorical categories cannot be null"):
Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan])
def test_from_codes_too_negative(self):
dtype = CategoricalDtype(categories=["a", "b", "c"])
msg = r"codes need to be between -1 and len\(categories\)-1"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([-2, 1, 2], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([-2, 1, 2], dtype=dtype)
def test_from_codes(self):
dtype = CategoricalDtype(categories=["a", "b", "c"])
exp = Categorical(["a", "b", "c"], ordered=False)
res = Categorical.from_codes([0, 1, 2], categories=dtype.categories)
tm.assert_categorical_equal(exp, res)
res = Categorical.from_codes([0, 1, 2], dtype=dtype)
tm.assert_categorical_equal(exp, res)
@pytest.mark.parametrize("klass", [Categorical, CategoricalIndex])
def test_from_codes_with_categorical_categories(self, klass):
# GH17884
expected = Categorical(["a", "b"], categories=["a", "b", "c"])
result = Categorical.from_codes([0, 1], categories=klass(["a", "b", "c"]))
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("klass", [Categorical, CategoricalIndex])
def test_from_codes_with_non_unique_categorical_categories(self, klass):
with pytest.raises(ValueError, match="Categorical categories must be unique"):
Categorical.from_codes([0, 1], klass(["a", "b", "a"]))
def test_from_codes_with_nan_code(self):
# GH21767
codes = [1, 2, np.nan]
dtype = CategoricalDtype(categories=["a", "b", "c"])
with pytest.raises(ValueError, match="codes need to be array-like integers"):
Categorical.from_codes(codes, categories=dtype.categories)
with pytest.raises(ValueError, match="codes need to be array-like integers"):
Categorical.from_codes(codes, dtype=dtype)
@pytest.mark.parametrize("codes", [[1.0, 2.0, 0], [1.1, 2.0, 0]])
def test_from_codes_with_float(self, codes):
# GH21767
# float codes should raise even if values are equal to integers
dtype = CategoricalDtype(categories=["a", "b", "c"])
msg = "codes need to be array-like integers"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(codes, dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(codes, dtype=dtype)
def test_from_codes_with_dtype_raises(self):
msg = "Cannot specify"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(
[0, 1], categories=["a", "b"], dtype=CategoricalDtype(["a", "b"])
)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(
[0, 1], ordered=True, dtype=CategoricalDtype(["a", "b"])
)
def test_from_codes_neither(self):
msg = "Both were None"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([0, 1])
def test_from_codes_with_nullable_int(self):
codes = pd.array([0, 1], dtype="Int64")
categories = ["a", "b"]
result = Categorical.from_codes(codes, categories=categories)
expected = Categorical.from_codes(codes.to_numpy(int), categories=categories)
tm.assert_categorical_equal(result, expected)
def test_from_codes_with_nullable_int_na_raises(self):
codes = pd.array([0, None], dtype="Int64")
categories = ["a", "b"]
msg = "codes cannot contain NA values"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(codes, categories=categories)
@pytest.mark.parametrize("dtype", [None, "category"])
def test_from_inferred_categories(self, dtype):
cats = ["a", "b"]
codes = np.array([0, 0, 1, 1], dtype="i8")
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical.from_codes(codes, cats)
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("dtype", [None, "category"])
def test_from_inferred_categories_sorts(self, dtype):
cats = ["b", "a"]
codes = np.array([0, 1, 1, 1], dtype="i8")
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical.from_codes([1, 0, 0, 0], ["a", "b"])
tm.assert_categorical_equal(result, expected)
def test_from_inferred_categories_dtype(self):
cats = ["a", "b", "d"]
codes = np.array([0, 1, 0, 2], dtype="i8")
dtype = CategoricalDtype(["c", "b", "a"], ordered=True)
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical(
["a", "b", "a", "d"], categories=["c", "b", "a"], ordered=True
)
tm.assert_categorical_equal(result, expected)
def test_from_inferred_categories_coerces(self):
cats = ["1", "2", "bad"]
codes = np.array([0, 0, 1, 2], dtype="i8")
dtype = CategoricalDtype([1, 2])
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical([1, 1, 2, np.nan])
tm.assert_categorical_equal(result, expected)
@pytest.mark.parametrize("ordered", [None, True, False])
def test_construction_with_ordered(self, ordered):
# GH 9347, 9190
cat = Categorical([0, 1, 2], ordered=ordered)
assert cat.ordered == bool(ordered)
def test_constructor_imaginary(self):
values = [1, 2, 3 + 1j]
c1 = Categorical(values)
tm.assert_index_equal(c1.categories, Index(values))
tm.assert_numpy_array_equal(np.array(c1), np.array(values))
def test_constructor_string_and_tuples(self):
# GH 21416
c = Categorical(np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object))
expected_index = Index([("a", "b"), ("b", "a"), "c"])
assert c.categories.equals(expected_index)
def test_interval(self):
idx = pd.interval_range(0, 10, periods=10)
cat = Categorical(idx, categories=idx)
expected_codes = np.arange(10, dtype="int8")
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
# infer categories
cat = Categorical(idx)
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
# list values
cat = Categorical(list(idx))
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
# list values, categories
cat = Categorical(list(idx), categories=list(idx))
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
# shuffled
values = idx.take([1, 2, 0])
cat = Categorical(values, categories=idx)
tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype="int8"))
tm.assert_index_equal(cat.categories, idx)
# extra
values = pd.interval_range(8, 11, periods=3)
cat = Categorical(values, categories=idx)
expected_codes = np.array([8, 9, -1], dtype="int8")
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
# overlapping
idx = IntervalIndex([Interval(0, 2), Interval(0, 1)])
cat = Categorical(idx, categories=idx)
expected_codes = np.array([0, 1], dtype="int8")
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
def test_categorical_extension_array_nullable(self, nulls_fixture):
# GH:
arr = pd.arrays.StringArray._from_sequence(
[nulls_fixture] * 2, dtype=pd.StringDtype()
)
result = Categorical(arr)
assert arr.dtype == result.categories.dtype
expected = Categorical(Series([pd.NA, pd.NA], dtype=arr.dtype))
tm.assert_categorical_equal(result, expected)
def test_from_sequence_copy(self):
cat = Categorical(np.arange(5).repeat(2))
result = Categorical._from_sequence(cat, dtype=cat.dtype, copy=False)
# more generally, we'd be OK with a view
assert result._codes is cat._codes
result = Categorical._from_sequence(cat, dtype=cat.dtype, copy=True)
assert not tm.shares_memory(result, cat)
def test_constructor_datetime64_non_nano(self):
categories = np.arange(10).view("M8[D]")
values = categories[::2].copy()
cat = Categorical(values, categories=categories)
assert (cat == values).all()
def test_constructor_preserves_freq(self):
# GH33830 freq retention in categorical
dti = date_range("2016-01-01", periods=5)
expected = dti.freq
cat = Categorical(dti)
result = cat.categories.freq
assert expected == result

View File

@ -0,0 +1,139 @@
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas import (
Categorical,
CategoricalIndex,
Index,
IntervalIndex,
Series,
Timestamp,
)
import pandas._testing as tm
class TestCategoricalDtypes:
def test_categories_match_up_to_permutation(self):
# test dtype comparisons between cats
c1 = Categorical(list("aabca"), categories=list("abc"), ordered=False)
c2 = Categorical(list("aabca"), categories=list("cab"), ordered=False)
c3 = Categorical(list("aabca"), categories=list("cab"), ordered=True)
assert c1._categories_match_up_to_permutation(c1)
assert c2._categories_match_up_to_permutation(c2)
assert c3._categories_match_up_to_permutation(c3)
assert c1._categories_match_up_to_permutation(c2)
assert not c1._categories_match_up_to_permutation(c3)
assert not c1._categories_match_up_to_permutation(Index(list("aabca")))
assert not c1._categories_match_up_to_permutation(c1.astype(object))
assert c1._categories_match_up_to_permutation(CategoricalIndex(c1))
assert c1._categories_match_up_to_permutation(
CategoricalIndex(c1, categories=list("cab"))
)
assert not c1._categories_match_up_to_permutation(
CategoricalIndex(c1, ordered=True)
)
# GH 16659
s1 = Series(c1)
s2 = Series(c2)
s3 = Series(c3)
assert c1._categories_match_up_to_permutation(s1)
assert c2._categories_match_up_to_permutation(s2)
assert c3._categories_match_up_to_permutation(s3)
assert c1._categories_match_up_to_permutation(s2)
assert not c1._categories_match_up_to_permutation(s3)
assert not c1._categories_match_up_to_permutation(s1.astype(object))
def test_set_dtype_same(self):
c = Categorical(["a", "b", "c"])
result = c._set_dtype(CategoricalDtype(["a", "b", "c"]))
tm.assert_categorical_equal(result, c)
def test_set_dtype_new_categories(self):
c = Categorical(["a", "b", "c"])
result = c._set_dtype(CategoricalDtype(list("abcd")))
tm.assert_numpy_array_equal(result.codes, c.codes)
tm.assert_index_equal(result.dtype.categories, Index(list("abcd")))
@pytest.mark.parametrize(
"values, categories, new_categories",
[
# No NaNs, same cats, same order
(["a", "b", "a"], ["a", "b"], ["a", "b"]),
# No NaNs, same cats, different order
(["a", "b", "a"], ["a", "b"], ["b", "a"]),
# Same, unsorted
(["b", "a", "a"], ["a", "b"], ["a", "b"]),
# No NaNs, same cats, different order
(["b", "a", "a"], ["a", "b"], ["b", "a"]),
# NaNs
(["a", "b", "c"], ["a", "b"], ["a", "b"]),
(["a", "b", "c"], ["a", "b"], ["b", "a"]),
(["b", "a", "c"], ["a", "b"], ["a", "b"]),
(["b", "a", "c"], ["a", "b"], ["a", "b"]),
# Introduce NaNs
(["a", "b", "c"], ["a", "b"], ["a"]),
(["a", "b", "c"], ["a", "b"], ["b"]),
(["b", "a", "c"], ["a", "b"], ["a"]),
(["b", "a", "c"], ["a", "b"], ["a"]),
# No overlap
(["a", "b", "c"], ["a", "b"], ["d", "e"]),
],
)
@pytest.mark.parametrize("ordered", [True, False])
def test_set_dtype_many(self, values, categories, new_categories, ordered):
c = Categorical(values, categories)
expected = Categorical(values, new_categories, ordered)
result = c._set_dtype(expected.dtype)
tm.assert_categorical_equal(result, expected)
def test_set_dtype_no_overlap(self):
c = Categorical(["a", "b", "c"], ["d", "e"])
result = c._set_dtype(CategoricalDtype(["a", "b"]))
expected = Categorical([None, None, None], categories=["a", "b"])
tm.assert_categorical_equal(result, expected)
def test_codes_dtypes(self):
# GH 8453
result = Categorical(["foo", "bar", "baz"])
assert result.codes.dtype == "int8"
result = Categorical([f"foo{i:05d}" for i in range(400)])
assert result.codes.dtype == "int16"
result = Categorical([f"foo{i:05d}" for i in range(40000)])
assert result.codes.dtype == "int32"
# adding cats
result = Categorical(["foo", "bar", "baz"])
assert result.codes.dtype == "int8"
result = result.add_categories([f"foo{i:05d}" for i in range(400)])
assert result.codes.dtype == "int16"
# removing cats
result = result.remove_categories([f"foo{i:05d}" for i in range(300)])
assert result.codes.dtype == "int8"
def test_iter_python_types(self):
# GH-19909
cat = Categorical([1, 2])
assert isinstance(next(iter(cat)), int)
assert isinstance(cat.tolist()[0], int)
def test_iter_python_types_datetime(self):
cat = Categorical([Timestamp("2017-01-01"), Timestamp("2017-01-02")])
assert isinstance(next(iter(cat)), Timestamp)
assert isinstance(cat.tolist()[0], Timestamp)
def test_interval_index_category(self):
# GH 38316
index = IntervalIndex.from_breaks(np.arange(3, dtype="uint64"))
result = CategoricalIndex(index).dtype.categories
expected = IntervalIndex.from_arrays(
[0, 1], [1, 2], dtype="interval[uint64, right]"
)
tm.assert_index_equal(result, expected)

View File

@ -0,0 +1,388 @@
import math
import numpy as np
import pytest
from pandas import (
NA,
Categorical,
CategoricalIndex,
Index,
Interval,
IntervalIndex,
NaT,
PeriodIndex,
Series,
Timedelta,
Timestamp,
)
import pandas._testing as tm
import pandas.core.common as com
class TestCategoricalIndexingWithFactor:
def test_getitem(self):
factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True)
assert factor[0] == "a"
assert factor[-1] == "c"
subf = factor[[0, 1, 2]]
tm.assert_numpy_array_equal(subf._codes, np.array([0, 1, 1], dtype=np.int8))
subf = factor[np.asarray(factor) == "c"]
tm.assert_numpy_array_equal(subf._codes, np.array([2, 2, 2], dtype=np.int8))
def test_setitem(self):
factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True)
# int/positional
c = factor.copy()
c[0] = "b"
assert c[0] == "b"
c[-1] = "a"
assert c[-1] == "a"
# boolean
c = factor.copy()
indexer = np.zeros(len(c), dtype="bool")
indexer[0] = True
indexer[-1] = True
c[indexer] = "c"
expected = Categorical(["c", "b", "b", "a", "a", "c", "c", "c"], ordered=True)
tm.assert_categorical_equal(c, expected)
@pytest.mark.parametrize(
"other",
[Categorical(["b", "a"]), Categorical(["b", "a"], categories=["b", "a"])],
)
def test_setitem_same_but_unordered(self, other):
# GH-24142
target = Categorical(["a", "b"], categories=["a", "b"])
mask = np.array([True, False])
target[mask] = other[mask]
expected = Categorical(["b", "b"], categories=["a", "b"])
tm.assert_categorical_equal(target, expected)
@pytest.mark.parametrize(
"other",
[
Categorical(["b", "a"], categories=["b", "a", "c"]),
Categorical(["b", "a"], categories=["a", "b", "c"]),
Categorical(["a", "a"], categories=["a"]),
Categorical(["b", "b"], categories=["b"]),
],
)
def test_setitem_different_unordered_raises(self, other):
# GH-24142
target = Categorical(["a", "b"], categories=["a", "b"])
mask = np.array([True, False])
msg = "Cannot set a Categorical with another, without identical categories"
with pytest.raises(TypeError, match=msg):
target[mask] = other[mask]
@pytest.mark.parametrize(
"other",
[
Categorical(["b", "a"]),
Categorical(["b", "a"], categories=["b", "a"], ordered=True),
Categorical(["b", "a"], categories=["a", "b", "c"], ordered=True),
],
)
def test_setitem_same_ordered_raises(self, other):
# Gh-24142
target = Categorical(["a", "b"], categories=["a", "b"], ordered=True)
mask = np.array([True, False])
msg = "Cannot set a Categorical with another, without identical categories"
with pytest.raises(TypeError, match=msg):
target[mask] = other[mask]
def test_setitem_tuple(self):
# GH#20439
cat = Categorical([(0, 1), (0, 2), (0, 1)])
# This should not raise
cat[1] = cat[0]
assert cat[1] == (0, 1)
def test_setitem_listlike(self):
# GH#9469
# properly coerce the input indexers
cat = Categorical(
np.random.default_rng(2).integers(0, 5, size=150000).astype(np.int8)
).add_categories([-1000])
indexer = np.array([100000]).astype(np.int64)
cat[indexer] = -1000
# we are asserting the code result here
# which maps to the -1000 category
result = cat.codes[np.array([100000]).astype(np.int64)]
tm.assert_numpy_array_equal(result, np.array([5], dtype="int8"))
class TestCategoricalIndexing:
def test_getitem_slice(self):
cat = Categorical(["a", "b", "c", "d", "a", "b", "c"])
sliced = cat[3]
assert sliced == "d"
sliced = cat[3:5]
expected = Categorical(["d", "a"], categories=["a", "b", "c", "d"])
tm.assert_categorical_equal(sliced, expected)
def test_getitem_listlike(self):
# GH 9469
# properly coerce the input indexers
c = Categorical(
np.random.default_rng(2).integers(0, 5, size=150000).astype(np.int8)
)
result = c.codes[np.array([100000]).astype(np.int64)]
expected = c[np.array([100000]).astype(np.int64)].codes
tm.assert_numpy_array_equal(result, expected)
def test_periodindex(self):
idx1 = PeriodIndex(
["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"],
freq="M",
)
cat1 = Categorical(idx1)
str(cat1)
exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.int8)
exp_idx = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M")
tm.assert_numpy_array_equal(cat1._codes, exp_arr)
tm.assert_index_equal(cat1.categories, exp_idx)
idx2 = PeriodIndex(
["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"],
freq="M",
)
cat2 = Categorical(idx2, ordered=True)
str(cat2)
exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.int8)
exp_idx2 = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M")
tm.assert_numpy_array_equal(cat2._codes, exp_arr)
tm.assert_index_equal(cat2.categories, exp_idx2)
idx3 = PeriodIndex(
[
"2013-12",
"2013-11",
"2013-10",
"2013-09",
"2013-08",
"2013-07",
"2013-05",
],
freq="M",
)
cat3 = Categorical(idx3, ordered=True)
exp_arr = np.array([6, 5, 4, 3, 2, 1, 0], dtype=np.int8)
exp_idx = PeriodIndex(
[
"2013-05",
"2013-07",
"2013-08",
"2013-09",
"2013-10",
"2013-11",
"2013-12",
],
freq="M",
)
tm.assert_numpy_array_equal(cat3._codes, exp_arr)
tm.assert_index_equal(cat3.categories, exp_idx)
@pytest.mark.parametrize(
"null_val",
[None, np.nan, NaT, NA, math.nan, "NaT", "nat", "NAT", "nan", "NaN", "NAN"],
)
def test_periodindex_on_null_types(self, null_val):
# GH 46673
result = PeriodIndex(["2022-04-06", "2022-04-07", null_val], freq="D")
expected = PeriodIndex(["2022-04-06", "2022-04-07", "NaT"], dtype="period[D]")
assert result[2] is NaT
tm.assert_index_equal(result, expected)
@pytest.mark.parametrize("new_categories", [[1, 2, 3, 4], [1, 2]])
def test_categories_assignments_wrong_length_raises(self, new_categories):
cat = Categorical(["a", "b", "c", "a"])
msg = (
"new categories need to have the same number of items "
"as the old categories!"
)
with pytest.raises(ValueError, match=msg):
cat.rename_categories(new_categories)
# Combinations of sorted/unique:
@pytest.mark.parametrize(
"idx_values", [[1, 2, 3, 4], [1, 3, 2, 4], [1, 3, 3, 4], [1, 2, 2, 4]]
)
# Combinations of missing/unique
@pytest.mark.parametrize("key_values", [[1, 2], [1, 5], [1, 1], [5, 5]])
@pytest.mark.parametrize("key_class", [Categorical, CategoricalIndex])
@pytest.mark.parametrize("dtype", [None, "category", "key"])
def test_get_indexer_non_unique(self, idx_values, key_values, key_class, dtype):
# GH 21448
key = key_class(key_values, categories=range(1, 5))
if dtype == "key":
dtype = key.dtype
# Test for flat index and CategoricalIndex with same/different cats:
idx = Index(idx_values, dtype=dtype)
expected, exp_miss = idx.get_indexer_non_unique(key_values)
result, res_miss = idx.get_indexer_non_unique(key)
tm.assert_numpy_array_equal(expected, result)
tm.assert_numpy_array_equal(exp_miss, res_miss)
exp_unique = idx.unique().get_indexer(key_values)
res_unique = idx.unique().get_indexer(key)
tm.assert_numpy_array_equal(res_unique, exp_unique)
def test_where_unobserved_nan(self):
ser = Series(Categorical(["a", "b"]))
result = ser.where([True, False])
expected = Series(Categorical(["a", None], categories=["a", "b"]))
tm.assert_series_equal(result, expected)
# all NA
ser = Series(Categorical(["a", "b"]))
result = ser.where([False, False])
expected = Series(Categorical([None, None], categories=["a", "b"]))
tm.assert_series_equal(result, expected)
def test_where_unobserved_categories(self):
ser = Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"]))
result = ser.where([True, True, False], other="b")
expected = Series(Categorical(["a", "b", "b"], categories=ser.cat.categories))
tm.assert_series_equal(result, expected)
def test_where_other_categorical(self):
ser = Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"]))
other = Categorical(["b", "c", "a"], categories=["a", "c", "b", "d"])
result = ser.where([True, False, True], other)
expected = Series(Categorical(["a", "c", "c"], dtype=ser.dtype))
tm.assert_series_equal(result, expected)
def test_where_new_category_raises(self):
ser = Series(Categorical(["a", "b", "c"]))
msg = "Cannot setitem on a Categorical with a new category"
with pytest.raises(TypeError, match=msg):
ser.where([True, False, True], "d")
def test_where_ordered_differs_rasies(self):
ser = Series(
Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"], ordered=True)
)
other = Categorical(
["b", "c", "a"], categories=["a", "c", "b", "d"], ordered=True
)
with pytest.raises(TypeError, match="without identical categories"):
ser.where([True, False, True], other)
class TestContains:
def test_contains(self):
# GH#21508
cat = Categorical(list("aabbca"), categories=list("cab"))
assert "b" in cat
assert "z" not in cat
assert np.nan not in cat
with pytest.raises(TypeError, match="unhashable type: 'list'"):
assert [1] in cat
# assert codes NOT in index
assert 0 not in cat
assert 1 not in cat
cat = Categorical(list("aabbca") + [np.nan], categories=list("cab"))
assert np.nan in cat
@pytest.mark.parametrize(
"item, expected",
[
(Interval(0, 1), True),
(1.5, True),
(Interval(0.5, 1.5), False),
("a", False),
(Timestamp(1), False),
(Timedelta(1), False),
],
ids=str,
)
def test_contains_interval(self, item, expected):
# GH#23705
cat = Categorical(IntervalIndex.from_breaks(range(3)))
result = item in cat
assert result is expected
def test_contains_list(self):
# GH#21729
cat = Categorical([1, 2, 3])
assert "a" not in cat
with pytest.raises(TypeError, match="unhashable type"):
["a"] in cat
with pytest.raises(TypeError, match="unhashable type"):
["a", "b"] in cat
@pytest.mark.parametrize("index", [True, False])
def test_mask_with_boolean(index):
ser = Series(range(3))
idx = Categorical([True, False, True])
if index:
idx = CategoricalIndex(idx)
assert com.is_bool_indexer(idx)
result = ser[idx]
expected = ser[idx.astype("object")]
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("index", [True, False])
def test_mask_with_boolean_na_treated_as_false(index):
# https://github.com/pandas-dev/pandas/issues/31503
ser = Series(range(3))
idx = Categorical([True, False, None])
if index:
idx = CategoricalIndex(idx)
result = ser[idx]
expected = ser[idx.fillna(False)]
tm.assert_series_equal(result, expected)
@pytest.fixture
def non_coercible_categorical(monkeypatch):
"""
Monkeypatch Categorical.__array__ to ensure no implicit conversion.
Raises
------
ValueError
When Categorical.__array__ is called.
"""
# TODO(Categorical): identify other places where this may be
# useful and move to a conftest.py
def array(self, dtype=None):
raise ValueError("I cannot be converted.")
with monkeypatch.context() as m:
m.setattr(Categorical, "__array__", array)
yield
def test_series_at():
arr = Categorical(["a", "b", "c"])
ser = Series(arr)
result = ser.at[0]
assert result == "a"

View File

@ -0,0 +1,154 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
Categorical,
Index,
Series,
)
import pandas._testing as tm
@pytest.fixture(params=[None, "ignore"])
def na_action(request):
return request.param
@pytest.mark.parametrize(
"data, categories",
[
(list("abcbca"), list("cab")),
(pd.interval_range(0, 3).repeat(3), pd.interval_range(0, 3)),
],
ids=["string", "interval"],
)
def test_map_str(data, categories, ordered, na_action):
# GH 31202 - override base class since we want to maintain categorical/ordered
cat = Categorical(data, categories=categories, ordered=ordered)
result = cat.map(str, na_action=na_action)
expected = Categorical(
map(str, data), categories=map(str, categories), ordered=ordered
)
tm.assert_categorical_equal(result, expected)
def test_map(na_action):
cat = Categorical(list("ABABC"), categories=list("CBA"), ordered=True)
result = cat.map(lambda x: x.lower(), na_action=na_action)
exp = Categorical(list("ababc"), categories=list("cba"), ordered=True)
tm.assert_categorical_equal(result, exp)
cat = Categorical(list("ABABC"), categories=list("BAC"), ordered=False)
result = cat.map(lambda x: x.lower(), na_action=na_action)
exp = Categorical(list("ababc"), categories=list("bac"), ordered=False)
tm.assert_categorical_equal(result, exp)
# GH 12766: Return an index not an array
result = cat.map(lambda x: 1, na_action=na_action)
exp = Index(np.array([1] * 5, dtype=np.int64))
tm.assert_index_equal(result, exp)
# change categories dtype
cat = Categorical(list("ABABC"), categories=list("BAC"), ordered=False)
def f(x):
return {"A": 10, "B": 20, "C": 30}.get(x)
result = cat.map(f, na_action=na_action)
exp = Categorical([10, 20, 10, 20, 30], categories=[20, 10, 30], ordered=False)
tm.assert_categorical_equal(result, exp)
mapper = Series([10, 20, 30], index=["A", "B", "C"])
result = cat.map(mapper, na_action=na_action)
tm.assert_categorical_equal(result, exp)
result = cat.map({"A": 10, "B": 20, "C": 30}, na_action=na_action)
tm.assert_categorical_equal(result, exp)
@pytest.mark.parametrize(
("data", "f", "expected"),
(
([1, 1, np.nan], pd.isna, Index([False, False, True])),
([1, 2, np.nan], pd.isna, Index([False, False, True])),
([1, 1, np.nan], {1: False}, Categorical([False, False, np.nan])),
([1, 2, np.nan], {1: False, 2: False}, Index([False, False, np.nan])),
(
[1, 1, np.nan],
Series([False, False]),
Categorical([False, False, np.nan]),
),
(
[1, 2, np.nan],
Series([False] * 3),
Index([False, False, np.nan]),
),
),
)
def test_map_with_nan_none(data, f, expected): # GH 24241
values = Categorical(data)
result = values.map(f, na_action=None)
if isinstance(expected, Categorical):
tm.assert_categorical_equal(result, expected)
else:
tm.assert_index_equal(result, expected)
@pytest.mark.parametrize(
("data", "f", "expected"),
(
([1, 1, np.nan], pd.isna, Categorical([False, False, np.nan])),
([1, 2, np.nan], pd.isna, Index([False, False, np.nan])),
([1, 1, np.nan], {1: False}, Categorical([False, False, np.nan])),
([1, 2, np.nan], {1: False, 2: False}, Index([False, False, np.nan])),
(
[1, 1, np.nan],
Series([False, False]),
Categorical([False, False, np.nan]),
),
(
[1, 2, np.nan],
Series([False, False, False]),
Index([False, False, np.nan]),
),
),
)
def test_map_with_nan_ignore(data, f, expected): # GH 24241
values = Categorical(data)
result = values.map(f, na_action="ignore")
if data[1] == 1:
tm.assert_categorical_equal(result, expected)
else:
tm.assert_index_equal(result, expected)
def test_map_with_dict_or_series(na_action):
orig_values = ["a", "B", 1, "a"]
new_values = ["one", 2, 3.0, "one"]
cat = Categorical(orig_values)
mapper = Series(new_values[:-1], index=orig_values[:-1])
result = cat.map(mapper, na_action=na_action)
# Order of categories in result can be different
expected = Categorical(new_values, categories=[3.0, 2, "one"])
tm.assert_categorical_equal(result, expected)
mapper = dict(zip(orig_values[:-1], new_values[:-1]))
result = cat.map(mapper, na_action=na_action)
# Order of categories in result can be different
tm.assert_categorical_equal(result, expected)
def test_map_na_action_no_default_deprecated():
# GH51645
cat = Categorical(["a", "b", "c"])
msg = (
"The default value of 'ignore' for the `na_action` parameter in "
"pandas.Categorical.map is deprecated and will be "
"changed to 'None' in a future version. Please set na_action to the "
"desired value to avoid seeing this warning"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
cat.map(lambda x: x)

View File

@ -0,0 +1,216 @@
import collections
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import CategoricalDtype
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Index,
Series,
isna,
)
import pandas._testing as tm
class TestCategoricalMissing:
def test_isna(self):
exp = np.array([False, False, True])
cat = Categorical(["a", "b", np.nan])
res = cat.isna()
tm.assert_numpy_array_equal(res, exp)
def test_na_flags_int_categories(self):
# #1457
categories = list(range(10))
labels = np.random.default_rng(2).integers(0, 10, 20)
labels[::5] = -1
cat = Categorical(labels, categories)
repr(cat)
tm.assert_numpy_array_equal(isna(cat), labels == -1)
def test_nan_handling(self):
# Nans are represented as -1 in codes
c = Categorical(["a", "b", np.nan, "a"])
tm.assert_index_equal(c.categories, Index(["a", "b"]))
tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8))
c[1] = np.nan
tm.assert_index_equal(c.categories, Index(["a", "b"]))
tm.assert_numpy_array_equal(c._codes, np.array([0, -1, -1, 0], dtype=np.int8))
# Adding nan to categories should make assigned nan point to the
# category!
c = Categorical(["a", "b", np.nan, "a"])
tm.assert_index_equal(c.categories, Index(["a", "b"]))
tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8))
def test_set_dtype_nans(self):
c = Categorical(["a", "b", np.nan])
result = c._set_dtype(CategoricalDtype(["a", "c"]))
tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1], dtype="int8"))
def test_set_item_nan(self):
cat = Categorical([1, 2, 3])
cat[1] = np.nan
exp = Categorical([1, np.nan, 3], categories=[1, 2, 3])
tm.assert_categorical_equal(cat, exp)
@pytest.mark.parametrize(
"fillna_kwargs, msg",
[
(
{"value": 1, "method": "ffill"},
"Cannot specify both 'value' and 'method'.",
),
({}, "Must specify a fill 'value' or 'method'."),
({"method": "bad"}, "Invalid fill method. Expecting .* bad"),
(
{"value": Series([1, 2, 3, 4, "a"])},
"Cannot setitem on a Categorical with a new category",
),
],
)
def test_fillna_raises(self, fillna_kwargs, msg):
# https://github.com/pandas-dev/pandas/issues/19682
# https://github.com/pandas-dev/pandas/issues/13628
cat = Categorical([1, 2, 3, None, None])
if len(fillna_kwargs) == 1 and "value" in fillna_kwargs:
err = TypeError
else:
err = ValueError
with pytest.raises(err, match=msg):
cat.fillna(**fillna_kwargs)
@pytest.mark.parametrize("named", [True, False])
def test_fillna_iterable_category(self, named):
# https://github.com/pandas-dev/pandas/issues/21097
if named:
Point = collections.namedtuple("Point", "x y")
else:
Point = lambda *args: args # tuple
cat = Categorical(np.array([Point(0, 0), Point(0, 1), None], dtype=object))
result = cat.fillna(Point(0, 0))
expected = Categorical([Point(0, 0), Point(0, 1), Point(0, 0)])
tm.assert_categorical_equal(result, expected)
# Case where the Point is not among our categories; we want ValueError,
# not NotImplementedError GH#41914
cat = Categorical(np.array([Point(1, 0), Point(0, 1), None], dtype=object))
msg = "Cannot setitem on a Categorical with a new category"
with pytest.raises(TypeError, match=msg):
cat.fillna(Point(0, 0))
def test_fillna_array(self):
# accept Categorical or ndarray value if it holds appropriate values
cat = Categorical(["A", "B", "C", None, None])
other = cat.fillna("C")
result = cat.fillna(other)
tm.assert_categorical_equal(result, other)
assert isna(cat[-1]) # didn't modify original inplace
other = np.array(["A", "B", "C", "B", "A"])
result = cat.fillna(other)
expected = Categorical(["A", "B", "C", "B", "A"], dtype=cat.dtype)
tm.assert_categorical_equal(result, expected)
assert isna(cat[-1]) # didn't modify original inplace
@pytest.mark.parametrize(
"values, expected",
[
([1, 2, 3], np.array([False, False, False])),
([1, 2, np.nan], np.array([False, False, True])),
([1, 2, np.inf], np.array([False, False, True])),
([1, 2, pd.NA], np.array([False, False, True])),
],
)
def test_use_inf_as_na(self, values, expected):
# https://github.com/pandas-dev/pandas/issues/33594
msg = "use_inf_as_na option is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
with pd.option_context("mode.use_inf_as_na", True):
cat = Categorical(values)
result = cat.isna()
tm.assert_numpy_array_equal(result, expected)
result = Series(cat).isna()
expected = Series(expected)
tm.assert_series_equal(result, expected)
result = DataFrame(cat).isna()
expected = DataFrame(expected)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"values, expected",
[
([1, 2, 3], np.array([False, False, False])),
([1, 2, np.nan], np.array([False, False, True])),
([1, 2, np.inf], np.array([False, False, True])),
([1, 2, pd.NA], np.array([False, False, True])),
],
)
def test_use_inf_as_na_outside_context(self, values, expected):
# https://github.com/pandas-dev/pandas/issues/33594
# Using isna directly for Categorical will fail in general here
cat = Categorical(values)
msg = "use_inf_as_na option is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
with pd.option_context("mode.use_inf_as_na", True):
result = isna(cat)
tm.assert_numpy_array_equal(result, expected)
result = isna(Series(cat))
expected = Series(expected)
tm.assert_series_equal(result, expected)
result = isna(DataFrame(cat))
expected = DataFrame(expected)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"a1, a2, categories",
[
(["a", "b", "c"], [np.nan, "a", "b"], ["a", "b", "c"]),
([1, 2, 3], [np.nan, 1, 2], [1, 2, 3]),
],
)
def test_compare_categorical_with_missing(self, a1, a2, categories):
# GH 28384
cat_type = CategoricalDtype(categories)
# !=
result = Series(a1, dtype=cat_type) != Series(a2, dtype=cat_type)
expected = Series(a1) != Series(a2)
tm.assert_series_equal(result, expected)
# ==
result = Series(a1, dtype=cat_type) == Series(a2, dtype=cat_type)
expected = Series(a1) == Series(a2)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"na_value, dtype",
[
(pd.NaT, "datetime64[ns]"),
(None, "float64"),
(np.nan, "float64"),
(pd.NA, "float64"),
],
)
def test_categorical_only_missing_values_no_cast(self, na_value, dtype):
# GH#44900
result = Categorical([na_value, na_value])
tm.assert_index_equal(result.categories, Index([], dtype=dtype))

View File

@ -0,0 +1,414 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Series,
Timestamp,
date_range,
)
import pandas._testing as tm
class TestCategoricalOpsWithFactor:
def test_categories_none_comparisons(self):
factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True)
tm.assert_categorical_equal(factor, factor)
def test_comparisons(self):
factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True)
result = factor[factor == "a"]
expected = factor[np.asarray(factor) == "a"]
tm.assert_categorical_equal(result, expected)
result = factor[factor != "a"]
expected = factor[np.asarray(factor) != "a"]
tm.assert_categorical_equal(result, expected)
result = factor[factor < "c"]
expected = factor[np.asarray(factor) < "c"]
tm.assert_categorical_equal(result, expected)
result = factor[factor > "a"]
expected = factor[np.asarray(factor) > "a"]
tm.assert_categorical_equal(result, expected)
result = factor[factor >= "b"]
expected = factor[np.asarray(factor) >= "b"]
tm.assert_categorical_equal(result, expected)
result = factor[factor <= "b"]
expected = factor[np.asarray(factor) <= "b"]
tm.assert_categorical_equal(result, expected)
n = len(factor)
other = factor[np.random.default_rng(2).permutation(n)]
result = factor == other
expected = np.asarray(factor) == np.asarray(other)
tm.assert_numpy_array_equal(result, expected)
result = factor == "d"
expected = np.zeros(len(factor), dtype=bool)
tm.assert_numpy_array_equal(result, expected)
# comparisons with categoricals
cat_rev = Categorical(["a", "b", "c"], categories=["c", "b", "a"], ordered=True)
cat_rev_base = Categorical(
["b", "b", "b"], categories=["c", "b", "a"], ordered=True
)
cat = Categorical(["a", "b", "c"], ordered=True)
cat_base = Categorical(["b", "b", "b"], categories=cat.categories, ordered=True)
# comparisons need to take categories ordering into account
res_rev = cat_rev > cat_rev_base
exp_rev = np.array([True, False, False])
tm.assert_numpy_array_equal(res_rev, exp_rev)
res_rev = cat_rev < cat_rev_base
exp_rev = np.array([False, False, True])
tm.assert_numpy_array_equal(res_rev, exp_rev)
res = cat > cat_base
exp = np.array([False, False, True])
tm.assert_numpy_array_equal(res, exp)
# Only categories with same categories can be compared
msg = "Categoricals can only be compared if 'categories' are the same"
with pytest.raises(TypeError, match=msg):
cat > cat_rev
cat_rev_base2 = Categorical(["b", "b", "b"], categories=["c", "b", "a", "d"])
with pytest.raises(TypeError, match=msg):
cat_rev > cat_rev_base2
# Only categories with same ordering information can be compared
cat_unordered = cat.set_ordered(False)
assert not (cat > cat).any()
with pytest.raises(TypeError, match=msg):
cat > cat_unordered
# comparison (in both directions) with Series will raise
s = Series(["b", "b", "b"], dtype=object)
msg = (
"Cannot compare a Categorical for op __gt__ with type "
r"<class 'numpy\.ndarray'>"
)
with pytest.raises(TypeError, match=msg):
cat > s
with pytest.raises(TypeError, match=msg):
cat_rev > s
with pytest.raises(TypeError, match=msg):
s < cat
with pytest.raises(TypeError, match=msg):
s < cat_rev
# comparison with numpy.array will raise in both direction, but only on
# newer numpy versions
a = np.array(["b", "b", "b"], dtype=object)
with pytest.raises(TypeError, match=msg):
cat > a
with pytest.raises(TypeError, match=msg):
cat_rev > a
# Make sure that unequal comparison take the categories order in
# account
cat_rev = Categorical(list("abc"), categories=list("cba"), ordered=True)
exp = np.array([True, False, False])
res = cat_rev > "b"
tm.assert_numpy_array_equal(res, exp)
# check that zero-dim array gets unboxed
res = cat_rev > np.array("b")
tm.assert_numpy_array_equal(res, exp)
class TestCategoricalOps:
@pytest.mark.parametrize(
"categories",
[["a", "b"], [0, 1], [Timestamp("2019"), Timestamp("2020")]],
)
def test_not_equal_with_na(self, categories):
# https://github.com/pandas-dev/pandas/issues/32276
c1 = Categorical.from_codes([-1, 0], categories=categories)
c2 = Categorical.from_codes([0, 1], categories=categories)
result = c1 != c2
assert result.all()
def test_compare_frame(self):
# GH#24282 check that Categorical.__cmp__(DataFrame) defers to frame
data = ["a", "b", 2, "a"]
cat = Categorical(data)
df = DataFrame(cat)
result = cat == df.T
expected = DataFrame([[True, True, True, True]])
tm.assert_frame_equal(result, expected)
result = cat[::-1] != df.T
expected = DataFrame([[False, True, True, False]])
tm.assert_frame_equal(result, expected)
def test_compare_frame_raises(self, comparison_op):
# alignment raises unless we transpose
op = comparison_op
cat = Categorical(["a", "b", 2, "a"])
df = DataFrame(cat)
msg = "Unable to coerce to Series, length must be 1: given 4"
with pytest.raises(ValueError, match=msg):
op(cat, df)
def test_datetime_categorical_comparison(self):
dt_cat = Categorical(date_range("2014-01-01", periods=3), ordered=True)
tm.assert_numpy_array_equal(dt_cat > dt_cat[0], np.array([False, True, True]))
tm.assert_numpy_array_equal(dt_cat[0] < dt_cat, np.array([False, True, True]))
def test_reflected_comparison_with_scalars(self):
# GH8658
cat = Categorical([1, 2, 3], ordered=True)
tm.assert_numpy_array_equal(cat > cat[0], np.array([False, True, True]))
tm.assert_numpy_array_equal(cat[0] < cat, np.array([False, True, True]))
def test_comparison_with_unknown_scalars(self):
# https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057
# and following comparisons with scalars not in categories should raise
# for unequal comps, but not for equal/not equal
cat = Categorical([1, 2, 3], ordered=True)
msg = "Invalid comparison between dtype=category and int"
with pytest.raises(TypeError, match=msg):
cat < 4
with pytest.raises(TypeError, match=msg):
cat > 4
with pytest.raises(TypeError, match=msg):
4 < cat
with pytest.raises(TypeError, match=msg):
4 > cat
tm.assert_numpy_array_equal(cat == 4, np.array([False, False, False]))
tm.assert_numpy_array_equal(cat != 4, np.array([True, True, True]))
def test_comparison_with_tuple(self):
cat = Categorical(np.array(["foo", (0, 1), 3, (0, 1)], dtype=object))
result = cat == "foo"
expected = np.array([True, False, False, False], dtype=bool)
tm.assert_numpy_array_equal(result, expected)
result = cat == (0, 1)
expected = np.array([False, True, False, True], dtype=bool)
tm.assert_numpy_array_equal(result, expected)
result = cat != (0, 1)
tm.assert_numpy_array_equal(result, ~expected)
@pytest.mark.filterwarnings("ignore::RuntimeWarning")
def test_comparison_of_ordered_categorical_with_nan_to_scalar(
self, compare_operators_no_eq_ne
):
# https://github.com/pandas-dev/pandas/issues/26504
# BUG: fix ordered categorical comparison with missing values (#26504 )
# and following comparisons with scalars in categories with missing
# values should be evaluated as False
cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True)
scalar = 2
expected = getattr(np.array(cat), compare_operators_no_eq_ne)(scalar)
actual = getattr(cat, compare_operators_no_eq_ne)(scalar)
tm.assert_numpy_array_equal(actual, expected)
@pytest.mark.filterwarnings("ignore::RuntimeWarning")
def test_comparison_of_ordered_categorical_with_nan_to_listlike(
self, compare_operators_no_eq_ne
):
# https://github.com/pandas-dev/pandas/issues/26504
# and following comparisons of missing values in ordered Categorical
# with listlike should be evaluated as False
cat = Categorical([1, 2, 3, None], categories=[1, 2, 3], ordered=True)
other = Categorical([2, 2, 2, 2], categories=[1, 2, 3], ordered=True)
expected = getattr(np.array(cat), compare_operators_no_eq_ne)(2)
actual = getattr(cat, compare_operators_no_eq_ne)(other)
tm.assert_numpy_array_equal(actual, expected)
@pytest.mark.parametrize(
"data,reverse,base",
[(list("abc"), list("cba"), list("bbb")), ([1, 2, 3], [3, 2, 1], [2, 2, 2])],
)
def test_comparisons(self, data, reverse, base):
cat_rev = Series(Categorical(data, categories=reverse, ordered=True))
cat_rev_base = Series(Categorical(base, categories=reverse, ordered=True))
cat = Series(Categorical(data, ordered=True))
cat_base = Series(
Categorical(base, categories=cat.cat.categories, ordered=True)
)
s = Series(base, dtype=object if base == list("bbb") else None)
a = np.array(base)
# comparisons need to take categories ordering into account
res_rev = cat_rev > cat_rev_base
exp_rev = Series([True, False, False])
tm.assert_series_equal(res_rev, exp_rev)
res_rev = cat_rev < cat_rev_base
exp_rev = Series([False, False, True])
tm.assert_series_equal(res_rev, exp_rev)
res = cat > cat_base
exp = Series([False, False, True])
tm.assert_series_equal(res, exp)
scalar = base[1]
res = cat > scalar
exp = Series([False, False, True])
exp2 = cat.values > scalar
tm.assert_series_equal(res, exp)
tm.assert_numpy_array_equal(res.values, exp2)
res_rev = cat_rev > scalar
exp_rev = Series([True, False, False])
exp_rev2 = cat_rev.values > scalar
tm.assert_series_equal(res_rev, exp_rev)
tm.assert_numpy_array_equal(res_rev.values, exp_rev2)
# Only categories with same categories can be compared
msg = "Categoricals can only be compared if 'categories' are the same"
with pytest.raises(TypeError, match=msg):
cat > cat_rev
# categorical cannot be compared to Series or numpy array, and also
# not the other way around
msg = (
"Cannot compare a Categorical for op __gt__ with type "
r"<class 'numpy\.ndarray'>"
)
with pytest.raises(TypeError, match=msg):
cat > s
with pytest.raises(TypeError, match=msg):
cat_rev > s
with pytest.raises(TypeError, match=msg):
cat > a
with pytest.raises(TypeError, match=msg):
cat_rev > a
with pytest.raises(TypeError, match=msg):
s < cat
with pytest.raises(TypeError, match=msg):
s < cat_rev
with pytest.raises(TypeError, match=msg):
a < cat
with pytest.raises(TypeError, match=msg):
a < cat_rev
@pytest.mark.parametrize(
"ctor",
[
lambda *args, **kwargs: Categorical(*args, **kwargs),
lambda *args, **kwargs: Series(Categorical(*args, **kwargs)),
],
)
def test_unordered_different_order_equal(self, ctor):
# https://github.com/pandas-dev/pandas/issues/16014
c1 = ctor(["a", "b"], categories=["a", "b"], ordered=False)
c2 = ctor(["a", "b"], categories=["b", "a"], ordered=False)
assert (c1 == c2).all()
c1 = ctor(["a", "b"], categories=["a", "b"], ordered=False)
c2 = ctor(["b", "a"], categories=["b", "a"], ordered=False)
assert (c1 != c2).all()
c1 = ctor(["a", "a"], categories=["a", "b"], ordered=False)
c2 = ctor(["b", "b"], categories=["b", "a"], ordered=False)
assert (c1 != c2).all()
c1 = ctor(["a", "a"], categories=["a", "b"], ordered=False)
c2 = ctor(["a", "b"], categories=["b", "a"], ordered=False)
result = c1 == c2
tm.assert_numpy_array_equal(np.array(result), np.array([True, False]))
def test_unordered_different_categories_raises(self):
c1 = Categorical(["a", "b"], categories=["a", "b"], ordered=False)
c2 = Categorical(["a", "c"], categories=["c", "a"], ordered=False)
with pytest.raises(TypeError, match=("Categoricals can only be compared")):
c1 == c2
def test_compare_different_lengths(self):
c1 = Categorical([], categories=["a", "b"])
c2 = Categorical([], categories=["a"])
msg = "Categoricals can only be compared if 'categories' are the same."
with pytest.raises(TypeError, match=msg):
c1 == c2
def test_compare_unordered_different_order(self):
# https://github.com/pandas-dev/pandas/issues/16603#issuecomment-
# 349290078
a = Categorical(["a"], categories=["a", "b"])
b = Categorical(["b"], categories=["b", "a"])
assert not a.equals(b)
def test_numeric_like_ops(self):
df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 100)})
labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)]
cat_labels = Categorical(labels, labels)
df = df.sort_values(by=["value"], ascending=True)
df["value_group"] = pd.cut(
df.value, range(0, 10500, 500), right=False, labels=cat_labels
)
# numeric ops should not succeed
for op, str_rep in [
("__add__", r"\+"),
("__sub__", "-"),
("__mul__", r"\*"),
("__truediv__", "/"),
]:
msg = f"Series cannot perform the operation {str_rep}|unsupported operand"
with pytest.raises(TypeError, match=msg):
getattr(df, op)(df)
# reduction ops should not succeed (unless specifically defined, e.g.
# min/max)
s = df["value_group"]
for op in ["kurt", "skew", "var", "std", "mean", "sum", "median"]:
msg = f"does not support reduction '{op}'"
with pytest.raises(TypeError, match=msg):
getattr(s, op)(numeric_only=False)
def test_numeric_like_ops_series(self):
# numpy ops
s = Series(Categorical([1, 2, 3, 4]))
with pytest.raises(TypeError, match="does not support reduction 'sum'"):
np.sum(s)
@pytest.mark.parametrize(
"op, str_rep",
[
("__add__", r"\+"),
("__sub__", "-"),
("__mul__", r"\*"),
("__truediv__", "/"),
],
)
def test_numeric_like_ops_series_arith(self, op, str_rep):
# numeric ops on a Series
s = Series(Categorical([1, 2, 3, 4]))
msg = f"Series cannot perform the operation {str_rep}|unsupported operand"
with pytest.raises(TypeError, match=msg):
getattr(s, op)(2)
def test_numeric_like_ops_series_invalid(self):
# invalid ufunc
s = Series(Categorical([1, 2, 3, 4]))
msg = "Object with dtype category cannot perform the numpy op log"
with pytest.raises(TypeError, match=msg):
np.log(s)

View File

@ -0,0 +1,111 @@
import pytest
import pandas as pd
from pandas import Categorical
import pandas._testing as tm
@pytest.mark.parametrize(
"to_replace,value,expected,flip_categories",
[
# one-to-one
(1, 2, [2, 2, 3], False),
(1, 4, [4, 2, 3], False),
(4, 1, [1, 2, 3], False),
(5, 6, [1, 2, 3], False),
# many-to-one
([1], 2, [2, 2, 3], False),
([1, 2], 3, [3, 3, 3], False),
([1, 2], 4, [4, 4, 3], False),
((1, 2, 4), 5, [5, 5, 3], False),
((5, 6), 2, [1, 2, 3], False),
([1], [2], [2, 2, 3], False),
([1, 4], [5, 2], [5, 2, 3], False),
# GH49404: overlap between to_replace and value
([1, 2, 3], [2, 3, 4], [2, 3, 4], False),
# GH50872, GH46884: replace with null
(1, None, [None, 2, 3], False),
(1, pd.NA, [None, 2, 3], False),
# check_categorical sorts categories, which crashes on mixed dtypes
(3, "4", [1, 2, "4"], False),
([1, 2, "3"], "5", ["5", "5", 3], True),
],
)
@pytest.mark.filterwarnings(
"ignore:.*with CategoricalDtype is deprecated:FutureWarning"
)
def test_replace_categorical_series(to_replace, value, expected, flip_categories):
# GH 31720
ser = pd.Series([1, 2, 3], dtype="category")
result = ser.replace(to_replace, value)
expected = pd.Series(expected, dtype="category")
ser.replace(to_replace, value, inplace=True)
if flip_categories:
expected = expected.cat.set_categories(expected.cat.categories[::-1])
tm.assert_series_equal(expected, result, check_category_order=False)
tm.assert_series_equal(expected, ser, check_category_order=False)
@pytest.mark.parametrize(
"to_replace, value, result, expected_error_msg",
[
("b", "c", ["a", "c"], "Categorical.categories are different"),
("c", "d", ["a", "b"], None),
# https://github.com/pandas-dev/pandas/issues/33288
("a", "a", ["a", "b"], None),
("b", None, ["a", None], "Categorical.categories length are different"),
],
)
def test_replace_categorical(to_replace, value, result, expected_error_msg):
# GH#26988
cat = Categorical(["a", "b"])
expected = Categorical(result)
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
warn = FutureWarning if expected_error_msg is not None else None
with tm.assert_produces_warning(warn, match=msg):
result = pd.Series(cat, copy=False).replace(to_replace, value)._values
tm.assert_categorical_equal(result, expected)
if to_replace == "b": # the "c" test is supposed to be unchanged
with pytest.raises(AssertionError, match=expected_error_msg):
# ensure non-inplace call does not affect original
tm.assert_categorical_equal(cat, expected)
ser = pd.Series(cat, copy=False)
with tm.assert_produces_warning(warn, match=msg):
ser.replace(to_replace, value, inplace=True)
tm.assert_categorical_equal(cat, expected)
def test_replace_categorical_ea_dtype():
# GH49404
cat = Categorical(pd.array(["a", "b"], dtype="string"))
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values
expected = Categorical(pd.array(["c", pd.NA], dtype="string"))
tm.assert_categorical_equal(result, expected)
def test_replace_maintain_ordering():
# GH51016
dtype = pd.CategoricalDtype([0, 1, 2], ordered=True)
ser = pd.Series([0, 1, 2], dtype=dtype)
msg = (
r"The behavior of Series\.replace \(and DataFrame.replace\) "
"with CategoricalDtype"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
result = ser.replace(0, 2)
expected_dtype = pd.CategoricalDtype([1, 2], ordered=True)
expected = pd.Series([2, 1, 2], dtype=expected_dtype)
tm.assert_series_equal(expected, result, check_category_order=True)

View File

@ -0,0 +1,550 @@
import numpy as np
import pytest
from pandas._config import using_pyarrow_string_dtype
from pandas import (
Categorical,
CategoricalDtype,
CategoricalIndex,
Index,
Series,
date_range,
option_context,
period_range,
timedelta_range,
)
class TestCategoricalReprWithFactor:
def test_print(self, using_infer_string):
factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True)
if using_infer_string:
expected = [
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
"Categories (3, string): [a < b < c]",
]
else:
expected = [
"['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
"Categories (3, object): ['a' < 'b' < 'c']",
]
expected = "\n".join(expected)
actual = repr(factor)
assert actual == expected
class TestCategoricalRepr:
def test_big_print(self):
codes = np.array([0, 1, 2, 0, 1, 2] * 100)
dtype = CategoricalDtype(categories=Index(["a", "b", "c"], dtype=object))
factor = Categorical.from_codes(codes, dtype=dtype)
expected = [
"['a', 'b', 'c', 'a', 'b', ..., 'b', 'c', 'a', 'b', 'c']",
"Length: 600",
"Categories (3, object): ['a', 'b', 'c']",
]
expected = "\n".join(expected)
actual = repr(factor)
assert actual == expected
def test_empty_print(self):
factor = Categorical([], Index(["a", "b", "c"], dtype=object))
expected = "[], Categories (3, object): ['a', 'b', 'c']"
actual = repr(factor)
assert actual == expected
assert expected == actual
factor = Categorical([], Index(["a", "b", "c"], dtype=object), ordered=True)
expected = "[], Categories (3, object): ['a' < 'b' < 'c']"
actual = repr(factor)
assert expected == actual
factor = Categorical([], [])
expected = "[], Categories (0, object): []"
assert expected == repr(factor)
def test_print_none_width(self):
# GH10087
a = Series(Categorical([1, 2, 3, 4]))
exp = (
"0 1\n1 2\n2 3\n3 4\n"
"dtype: category\nCategories (4, int64): [1, 2, 3, 4]"
)
with option_context("display.width", None):
assert exp == repr(a)
@pytest.mark.skipif(
using_pyarrow_string_dtype(),
reason="Change once infer_string is set to True by default",
)
def test_unicode_print(self):
c = Categorical(["aaaaa", "bb", "cccc"] * 20)
expected = """\
['aaaaa', 'bb', 'cccc', 'aaaaa', 'bb', ..., 'bb', 'cccc', 'aaaaa', 'bb', 'cccc']
Length: 60
Categories (3, object): ['aaaaa', 'bb', 'cccc']"""
assert repr(c) == expected
c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20)
expected = """\
['ああああ', 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', ..., 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', 'ううううううう']
Length: 60
Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa: E501
assert repr(c) == expected
# unicode option should not affect to Categorical, as it doesn't care
# the repr width
with option_context("display.unicode.east_asian_width", True):
c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20)
expected = """['ああああ', 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', ..., 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', 'ううううううう']
Length: 60
Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa: E501
assert repr(c) == expected
def test_categorical_repr(self):
c = Categorical([1, 2, 3])
exp = """[1, 2, 3]
Categories (3, int64): [1, 2, 3]"""
assert repr(c) == exp
c = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
exp = """[1, 2, 3, 1, 2, 3]
Categories (3, int64): [1, 2, 3]"""
assert repr(c) == exp
c = Categorical([1, 2, 3, 4, 5] * 10)
exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5]
Length: 50
Categories (5, int64): [1, 2, 3, 4, 5]"""
assert repr(c) == exp
c = Categorical(np.arange(20, dtype=np.int64))
exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19]
Length: 20
Categories (20, int64): [0, 1, 2, 3, ..., 16, 17, 18, 19]"""
assert repr(c) == exp
def test_categorical_repr_ordered(self):
c = Categorical([1, 2, 3], ordered=True)
exp = """[1, 2, 3]
Categories (3, int64): [1 < 2 < 3]"""
assert repr(c) == exp
c = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True)
exp = """[1, 2, 3, 1, 2, 3]
Categories (3, int64): [1 < 2 < 3]"""
assert repr(c) == exp
c = Categorical([1, 2, 3, 4, 5] * 10, ordered=True)
exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5]
Length: 50
Categories (5, int64): [1 < 2 < 3 < 4 < 5]"""
assert repr(c) == exp
c = Categorical(np.arange(20, dtype=np.int64), ordered=True)
exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19]
Length: 20
Categories (20, int64): [0 < 1 < 2 < 3 ... 16 < 17 < 18 < 19]"""
assert repr(c) == exp
def test_categorical_repr_datetime(self):
idx = date_range("2011-01-01 09:00", freq="h", periods=5)
c = Categorical(idx)
exp = (
"[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, "
"2011-01-01 12:00:00, 2011-01-01 13:00:00]\n"
"Categories (5, datetime64[ns]): [2011-01-01 09:00:00, "
"2011-01-01 10:00:00, 2011-01-01 11:00:00,\n"
" 2011-01-01 12:00:00, "
"2011-01-01 13:00:00]"
""
)
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx)
exp = (
"[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, "
"2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, "
"2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, "
"2011-01-01 13:00:00]\n"
"Categories (5, datetime64[ns]): [2011-01-01 09:00:00, "
"2011-01-01 10:00:00, 2011-01-01 11:00:00,\n"
" 2011-01-01 12:00:00, "
"2011-01-01 13:00:00]"
)
assert repr(c) == exp
idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern")
c = Categorical(idx)
exp = (
"[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, "
"2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, "
"2011-01-01 13:00:00-05:00]\n"
"Categories (5, datetime64[ns, US/Eastern]): "
"[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,\n"
" "
"2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n"
" "
"2011-01-01 13:00:00-05:00]"
)
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx)
exp = (
"[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, "
"2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, "
"2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, "
"2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, "
"2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]\n"
"Categories (5, datetime64[ns, US/Eastern]): "
"[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,\n"
" "
"2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,\n"
" "
"2011-01-01 13:00:00-05:00]"
)
assert repr(c) == exp
def test_categorical_repr_datetime_ordered(self):
idx = date_range("2011-01-01 09:00", freq="h", periods=5)
c = Categorical(idx, ordered=True)
exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00]
Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa: E501
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx, ordered=True)
exp = """[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00, 2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00]
Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa: E501
assert repr(c) == exp
idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern")
c = Categorical(idx, ordered=True)
exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]
Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
2011-01-01 13:00:00-05:00]""" # noqa: E501
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx, ordered=True)
exp = """[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00, 2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]
Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
2011-01-01 13:00:00-05:00]""" # noqa: E501
assert repr(c) == exp
def test_categorical_repr_int_with_nan(self):
c = Categorical([1, 2, np.nan])
c_exp = """[1, 2, NaN]\nCategories (2, int64): [1, 2]"""
assert repr(c) == c_exp
s = Series([1, 2, np.nan], dtype="object").astype("category")
s_exp = """0 1\n1 2\n2 NaN
dtype: category
Categories (2, int64): [1, 2]"""
assert repr(s) == s_exp
def test_categorical_repr_period(self):
idx = period_range("2011-01-01 09:00", freq="h", periods=5)
c = Categorical(idx)
exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
Categories (5, period[h]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00,
2011-01-01 13:00]""" # noqa: E501
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx)
exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
Categories (5, period[h]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00,
2011-01-01 13:00]""" # noqa: E501
assert repr(c) == exp
idx = period_range("2011-01", freq="M", periods=5)
c = Categorical(idx)
exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]"""
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx)
exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" # noqa: E501
assert repr(c) == exp
def test_categorical_repr_period_ordered(self):
idx = period_range("2011-01-01 09:00", freq="h", periods=5)
c = Categorical(idx, ordered=True)
exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
Categories (5, period[h]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
2011-01-01 13:00]""" # noqa: E501
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx, ordered=True)
exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]
Categories (5, period[h]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
2011-01-01 13:00]""" # noqa: E501
assert repr(c) == exp
idx = period_range("2011-01", freq="M", periods=5)
c = Categorical(idx, ordered=True)
exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]"""
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx, ordered=True)
exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05]
Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" # noqa: E501
assert repr(c) == exp
def test_categorical_repr_timedelta(self):
idx = timedelta_range("1 days", periods=5)
c = Categorical(idx)
exp = """[1 days, 2 days, 3 days, 4 days, 5 days]
Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]"""
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx)
exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days]
Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" # noqa: E501
assert repr(c) == exp
idx = timedelta_range("1 hours", periods=20)
c = Categorical(idx)
exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
Length: 20
Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00,
3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00,
18 days 01:00:00, 19 days 01:00:00]""" # noqa: E501
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx)
exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
Length: 40
Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00,
3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00,
18 days 01:00:00, 19 days 01:00:00]""" # noqa: E501
assert repr(c) == exp
def test_categorical_repr_timedelta_ordered(self):
idx = timedelta_range("1 days", periods=5)
c = Categorical(idx, ordered=True)
exp = """[1 days, 2 days, 3 days, 4 days, 5 days]
Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]"""
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx, ordered=True)
exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days]
Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" # noqa: E501
assert repr(c) == exp
idx = timedelta_range("1 hours", periods=20)
c = Categorical(idx, ordered=True)
exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
Length: 20
Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 <
3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 <
18 days 01:00:00 < 19 days 01:00:00]""" # noqa: E501
assert repr(c) == exp
c = Categorical(idx.append(idx), categories=idx, ordered=True)
exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]
Length: 40
Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 <
3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 <
18 days 01:00:00 < 19 days 01:00:00]""" # noqa: E501
assert repr(c) == exp
def test_categorical_index_repr(self):
idx = CategoricalIndex(Categorical([1, 2, 3]))
exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=False, dtype='category')""" # noqa: E501
assert repr(idx) == exp
i = CategoricalIndex(Categorical(np.arange(10, dtype=np.int64)))
exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, ..., 6, 7, 8, 9], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
def test_categorical_index_repr_ordered(self):
i = CategoricalIndex(Categorical([1, 2, 3], ordered=True))
exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=True, dtype='category')""" # noqa: E501
assert repr(i) == exp
i = CategoricalIndex(Categorical(np.arange(10, dtype=np.int64), ordered=True))
exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, ..., 6, 7, 8, 9], ordered=True, dtype='category')""" # noqa: E501
assert repr(i) == exp
def test_categorical_index_repr_datetime(self):
idx = date_range("2011-01-01 09:00", freq="h", periods=5)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00',
'2011-01-01 11:00:00', '2011-01-01 12:00:00',
'2011-01-01 13:00:00'],
categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern")
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
'2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
'2011-01-01 13:00:00-05:00'],
categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
def test_categorical_index_repr_datetime_ordered(self):
idx = date_range("2011-01-01 09:00", freq="h", periods=5)
i = CategoricalIndex(Categorical(idx, ordered=True))
exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00',
'2011-01-01 11:00:00', '2011-01-01 12:00:00',
'2011-01-01 13:00:00'],
categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=True, dtype='category')""" # noqa: E501
assert repr(i) == exp
idx = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern")
i = CategoricalIndex(Categorical(idx, ordered=True))
exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
'2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
'2011-01-01 13:00:00-05:00'],
categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" # noqa: E501
assert repr(i) == exp
i = CategoricalIndex(Categorical(idx.append(idx), ordered=True))
exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00',
'2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00',
'2011-01-01 13:00:00-05:00', '2011-01-01 09:00:00-05:00',
'2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00',
'2011-01-01 12:00:00-05:00', '2011-01-01 13:00:00-05:00'],
categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" # noqa: E501
assert repr(i) == exp
def test_categorical_index_repr_period(self):
# test all length
idx = period_range("2011-01-01 09:00", freq="h", periods=1)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01-01 09:00'], categories=[2011-01-01 09:00], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
idx = period_range("2011-01-01 09:00", freq="h", periods=2)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
idx = period_range("2011-01-01 09:00", freq="h", periods=3)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
idx = period_range("2011-01-01 09:00", freq="h", periods=5)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
'2011-01-01 12:00', '2011-01-01 13:00'],
categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
i = CategoricalIndex(Categorical(idx.append(idx)))
exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
'2011-01-01 12:00', '2011-01-01 13:00', '2011-01-01 09:00',
'2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00',
'2011-01-01 13:00'],
categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
idx = period_range("2011-01", freq="M", periods=5)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
def test_categorical_index_repr_period_ordered(self):
idx = period_range("2011-01-01 09:00", freq="h", periods=5)
i = CategoricalIndex(Categorical(idx, ordered=True))
exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00',
'2011-01-01 12:00', '2011-01-01 13:00'],
categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=True, dtype='category')""" # noqa: E501
assert repr(i) == exp
idx = period_range("2011-01", freq="M", periods=5)
i = CategoricalIndex(Categorical(idx, ordered=True))
exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=True, dtype='category')""" # noqa: E501
assert repr(i) == exp
def test_categorical_index_repr_timedelta(self):
idx = timedelta_range("1 days", periods=5)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days, 2 days, 3 days, 4 days, 5 days], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
idx = timedelta_range("1 hours", periods=10)
i = CategoricalIndex(Categorical(idx))
exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00',
'3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00',
'6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00',
'9 days 01:00:00'],
categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00, 8 days 01:00:00, 9 days 01:00:00], ordered=False, dtype='category')""" # noqa: E501
assert repr(i) == exp
def test_categorical_index_repr_timedelta_ordered(self):
idx = timedelta_range("1 days", periods=5)
i = CategoricalIndex(Categorical(idx, ordered=True))
exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days, 2 days, 3 days, 4 days, 5 days], ordered=True, dtype='category')""" # noqa: E501
assert repr(i) == exp
idx = timedelta_range("1 hours", periods=10)
i = CategoricalIndex(Categorical(idx, ordered=True))
exp = """CategoricalIndex(['0 days 01:00:00', '1 days 01:00:00', '2 days 01:00:00',
'3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00',
'6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00',
'9 days 01:00:00'],
categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00, 8 days 01:00:00, 9 days 01:00:00], ordered=True, dtype='category')""" # noqa: E501
assert repr(i) == exp
def test_categorical_str_repr(self):
# GH 33676
result = repr(Categorical([1, "2", 3, 4]))
expected = "[1, '2', 3, 4]\nCategories (4, object): [1, 3, 4, '2']"
assert result == expected

View File

@ -0,0 +1,128 @@
import numpy as np
import pytest
from pandas import (
Categorical,
Index,
)
import pandas._testing as tm
class TestCategoricalSort:
def test_argsort(self):
c = Categorical([5, 3, 1, 4, 2], ordered=True)
expected = np.array([2, 4, 1, 3, 0])
tm.assert_numpy_array_equal(
c.argsort(ascending=True), expected, check_dtype=False
)
expected = expected[::-1]
tm.assert_numpy_array_equal(
c.argsort(ascending=False), expected, check_dtype=False
)
def test_numpy_argsort(self):
c = Categorical([5, 3, 1, 4, 2], ordered=True)
expected = np.array([2, 4, 1, 3, 0])
tm.assert_numpy_array_equal(np.argsort(c), expected, check_dtype=False)
tm.assert_numpy_array_equal(
np.argsort(c, kind="mergesort"), expected, check_dtype=False
)
msg = "the 'axis' parameter is not supported"
with pytest.raises(ValueError, match=msg):
np.argsort(c, axis=0)
msg = "the 'order' parameter is not supported"
with pytest.raises(ValueError, match=msg):
np.argsort(c, order="C")
def test_sort_values(self):
# unordered cats are sortable
cat = Categorical(["a", "b", "b", "a"], ordered=False)
cat.sort_values()
cat = Categorical(["a", "c", "b", "d"], ordered=True)
# sort_values
res = cat.sort_values()
exp = np.array(["a", "b", "c", "d"], dtype=object)
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, cat.categories)
cat = Categorical(
["a", "c", "b", "d"], categories=["a", "b", "c", "d"], ordered=True
)
res = cat.sort_values()
exp = np.array(["a", "b", "c", "d"], dtype=object)
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, cat.categories)
res = cat.sort_values(ascending=False)
exp = np.array(["d", "c", "b", "a"], dtype=object)
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, cat.categories)
# sort (inplace order)
cat1 = cat.copy()
orig_codes = cat1._codes
cat1.sort_values(inplace=True)
assert cat1._codes is orig_codes
exp = np.array(["a", "b", "c", "d"], dtype=object)
tm.assert_numpy_array_equal(cat1.__array__(), exp)
tm.assert_index_equal(res.categories, cat.categories)
# reverse
cat = Categorical(["a", "c", "c", "b", "d"], ordered=True)
res = cat.sort_values(ascending=False)
exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object)
exp_categories = Index(["a", "b", "c", "d"])
tm.assert_numpy_array_equal(res.__array__(), exp_val)
tm.assert_index_equal(res.categories, exp_categories)
def test_sort_values_na_position(self):
# see gh-12882
cat = Categorical([5, 2, np.nan, 2, np.nan], ordered=True)
exp_categories = Index([2, 5])
exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
res = cat.sort_values() # default arguments
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, exp_categories)
exp = np.array([np.nan, np.nan, 2.0, 2.0, 5.0])
res = cat.sort_values(ascending=True, na_position="first")
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, exp_categories)
exp = np.array([np.nan, np.nan, 5.0, 2.0, 2.0])
res = cat.sort_values(ascending=False, na_position="first")
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, exp_categories)
exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
res = cat.sort_values(ascending=True, na_position="last")
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, exp_categories)
exp = np.array([5.0, 2.0, 2.0, np.nan, np.nan])
res = cat.sort_values(ascending=False, na_position="last")
tm.assert_numpy_array_equal(res.__array__(), exp)
tm.assert_index_equal(res.categories, exp_categories)
cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
res = cat.sort_values(ascending=False, na_position="last")
exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object)
exp_categories = Index(["a", "b", "c", "d"])
tm.assert_numpy_array_equal(res.__array__(), exp_val)
tm.assert_index_equal(res.categories, exp_categories)
cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
res = cat.sort_values(ascending=False, na_position="first")
exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object)
exp_categories = Index(["a", "b", "c", "d"])
tm.assert_numpy_array_equal(res.__array__(), exp_val)
tm.assert_index_equal(res.categories, exp_categories)

View File

@ -0,0 +1,26 @@
from pandas import Categorical
import pandas._testing as tm
class SubclassedCategorical(Categorical):
pass
class TestCategoricalSubclassing:
def test_constructor(self):
sc = SubclassedCategorical(["a", "b", "c"])
assert isinstance(sc, SubclassedCategorical)
tm.assert_categorical_equal(sc, Categorical(["a", "b", "c"]))
def test_from_codes(self):
sc = SubclassedCategorical.from_codes([1, 0, 2], ["a", "b", "c"])
assert isinstance(sc, SubclassedCategorical)
exp = Categorical.from_codes([1, 0, 2], ["a", "b", "c"])
tm.assert_categorical_equal(sc, exp)
def test_map(self):
sc = SubclassedCategorical(["a", "b", "c"])
res = sc.map(lambda x: x.upper(), na_action=None)
assert isinstance(res, SubclassedCategorical)
exp = Categorical(["A", "B", "C"])
tm.assert_categorical_equal(res, exp)

View File

@ -0,0 +1,89 @@
import numpy as np
import pytest
from pandas import Categorical
import pandas._testing as tm
@pytest.fixture(params=[True, False])
def allow_fill(request):
"""Boolean 'allow_fill' parameter for Categorical.take"""
return request.param
class TestTake:
# https://github.com/pandas-dev/pandas/issues/20664
def test_take_default_allow_fill(self):
cat = Categorical(["a", "b"])
with tm.assert_produces_warning(None):
result = cat.take([0, -1])
assert result.equals(cat)
def test_take_positive_no_warning(self):
cat = Categorical(["a", "b"])
with tm.assert_produces_warning(None):
cat.take([0, 0])
def test_take_bounds(self, allow_fill):
# https://github.com/pandas-dev/pandas/issues/20664
cat = Categorical(["a", "b", "a"])
if allow_fill:
msg = "indices are out-of-bounds"
else:
msg = "index 4 is out of bounds for( axis 0 with)? size 3"
with pytest.raises(IndexError, match=msg):
cat.take([4, 5], allow_fill=allow_fill)
def test_take_empty(self, allow_fill):
# https://github.com/pandas-dev/pandas/issues/20664
cat = Categorical([], categories=["a", "b"])
if allow_fill:
msg = "indices are out-of-bounds"
else:
msg = "cannot do a non-empty take from an empty axes"
with pytest.raises(IndexError, match=msg):
cat.take([0], allow_fill=allow_fill)
def test_positional_take(self, ordered):
cat = Categorical(["a", "a", "b", "b"], categories=["b", "a"], ordered=ordered)
result = cat.take([0, 1, 2], allow_fill=False)
expected = Categorical(
["a", "a", "b"], categories=cat.categories, ordered=ordered
)
tm.assert_categorical_equal(result, expected)
def test_positional_take_unobserved(self, ordered):
cat = Categorical(["a", "b"], categories=["a", "b", "c"], ordered=ordered)
result = cat.take([1, 0], allow_fill=False)
expected = Categorical(["b", "a"], categories=cat.categories, ordered=ordered)
tm.assert_categorical_equal(result, expected)
def test_take_allow_fill(self):
# https://github.com/pandas-dev/pandas/issues/23296
cat = Categorical(["a", "a", "b"])
result = cat.take([0, -1, -1], allow_fill=True)
expected = Categorical(["a", np.nan, np.nan], categories=["a", "b"])
tm.assert_categorical_equal(result, expected)
def test_take_fill_with_negative_one(self):
# -1 was a category
cat = Categorical([-1, 0, 1])
result = cat.take([0, -1, 1], allow_fill=True, fill_value=-1)
expected = Categorical([-1, -1, 0], categories=[-1, 0, 1])
tm.assert_categorical_equal(result, expected)
def test_take_fill_value(self):
# https://github.com/pandas-dev/pandas/issues/23296
cat = Categorical(["a", "b", "c"])
result = cat.take([0, 1, -1], fill_value="a", allow_fill=True)
expected = Categorical(["a", "b", "a"], categories=["a", "b", "c"])
tm.assert_categorical_equal(result, expected)
def test_take_fill_value_new_raises(self):
# https://github.com/pandas-dev/pandas/issues/23296
cat = Categorical(["a", "b", "c"])
xpr = r"Cannot setitem on a Categorical with a new category \(d\)"
with pytest.raises(TypeError, match=xpr):
cat.take([0, 1, -1], fill_value="d", allow_fill=True)

View File

@ -0,0 +1,19 @@
import pytest
import pandas._testing as tm
class TestCategoricalWarnings:
def test_tab_complete_warning(self, ip):
# https://github.com/pandas-dev/pandas/issues/16409
pytest.importorskip("IPython", minversion="6.0.0")
from IPython.core.completer import provisionalcompleter
code = "import pandas as pd; c = pd.Categorical([])"
ip.run_cell(code)
# GH 31324 newer jedi version raises Deprecation warning;
# appears resolved 2021-02-02
with tm.assert_produces_warning(None, raise_on_extra_warnings=False):
with provisionalcompleter("ignore"):
list(ip.Completer.completions("c.", 1))

View File

@ -0,0 +1,284 @@
import numpy as np
import pytest
from pandas._libs import iNaT
from pandas.core.dtypes.dtypes import DatetimeTZDtype
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import DatetimeArray
class TestDatetimeArrayConstructor:
def test_from_sequence_invalid_type(self):
mi = pd.MultiIndex.from_product([np.arange(5), np.arange(5)])
with pytest.raises(TypeError, match="Cannot create a DatetimeArray"):
DatetimeArray._from_sequence(mi, dtype="M8[ns]")
def test_only_1dim_accepted(self):
arr = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]")
depr_msg = "DatetimeArray.__init__ is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(ValueError, match="Only 1-dimensional"):
# 3-dim, we allow 2D to sneak in for ops purposes GH#29853
DatetimeArray(arr.reshape(2, 2, 1))
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(ValueError, match="Only 1-dimensional"):
# 0-dim
DatetimeArray(arr[[0]].squeeze())
def test_freq_validation(self):
# GH#24623 check that invalid instances cannot be created with the
# public constructor
arr = np.arange(5, dtype=np.int64) * 3600 * 10**9
msg = (
"Inferred frequency h from passed values does not "
"conform to passed frequency W-SUN"
)
depr_msg = "DatetimeArray.__init__ is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(ValueError, match=msg):
DatetimeArray(arr, freq="W")
@pytest.mark.parametrize(
"meth",
[
DatetimeArray._from_sequence,
pd.to_datetime,
pd.DatetimeIndex,
],
)
def test_mixing_naive_tzaware_raises(self, meth):
# GH#24569
arr = np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")])
msg = (
"Cannot mix tz-aware with tz-naive values|"
"Tz-aware datetime.datetime cannot be converted "
"to datetime64 unless utc=True"
)
for obj in [arr, arr[::-1]]:
# check that we raise regardless of whether naive is found
# before aware or vice-versa
with pytest.raises(ValueError, match=msg):
meth(obj)
def test_from_pandas_array(self):
arr = pd.array(np.arange(5, dtype=np.int64)) * 3600 * 10**9
result = DatetimeArray._from_sequence(arr, dtype="M8[ns]")._with_freq("infer")
expected = pd.date_range("1970-01-01", periods=5, freq="h")._data
tm.assert_datetime_array_equal(result, expected)
def test_mismatched_timezone_raises(self):
depr_msg = "DatetimeArray.__init__ is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
arr = DatetimeArray(
np.array(["2000-01-01T06:00:00"], dtype="M8[ns]"),
dtype=DatetimeTZDtype(tz="US/Central"),
)
dtype = DatetimeTZDtype(tz="US/Eastern")
msg = r"dtype=datetime64\[ns.*\] does not match data dtype datetime64\[ns.*\]"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(TypeError, match=msg):
DatetimeArray(arr, dtype=dtype)
# also with mismatched tzawareness
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(TypeError, match=msg):
DatetimeArray(arr, dtype=np.dtype("M8[ns]"))
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(TypeError, match=msg):
DatetimeArray(arr.tz_localize(None), dtype=arr.dtype)
def test_non_array_raises(self):
depr_msg = "DatetimeArray.__init__ is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(ValueError, match="list"):
DatetimeArray([1, 2, 3])
def test_bool_dtype_raises(self):
arr = np.array([1, 2, 3], dtype="bool")
depr_msg = "DatetimeArray.__init__ is deprecated"
msg = "Unexpected value for 'dtype': 'bool'. Must be"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(ValueError, match=msg):
DatetimeArray(arr)
msg = r"dtype bool cannot be converted to datetime64\[ns\]"
with pytest.raises(TypeError, match=msg):
DatetimeArray._from_sequence(arr, dtype="M8[ns]")
with pytest.raises(TypeError, match=msg):
pd.DatetimeIndex(arr)
with pytest.raises(TypeError, match=msg):
pd.to_datetime(arr)
def test_incorrect_dtype_raises(self):
depr_msg = "DatetimeArray.__init__ is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(ValueError, match="Unexpected value for 'dtype'."):
DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="category")
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(ValueError, match="Unexpected value for 'dtype'."):
DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="m8[s]")
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(ValueError, match="Unexpected value for 'dtype'."):
DatetimeArray(np.array([1, 2, 3], dtype="i8"), dtype="M8[D]")
def test_mismatched_values_dtype_units(self):
arr = np.array([1, 2, 3], dtype="M8[s]")
dtype = np.dtype("M8[ns]")
msg = "Values resolution does not match dtype."
depr_msg = "DatetimeArray.__init__ is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(ValueError, match=msg):
DatetimeArray(arr, dtype=dtype)
dtype2 = DatetimeTZDtype(tz="UTC", unit="ns")
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(ValueError, match=msg):
DatetimeArray(arr, dtype=dtype2)
def test_freq_infer_raises(self):
depr_msg = "DatetimeArray.__init__ is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(ValueError, match="Frequency inference"):
DatetimeArray(np.array([1, 2, 3], dtype="i8"), freq="infer")
def test_copy(self):
data = np.array([1, 2, 3], dtype="M8[ns]")
arr = DatetimeArray._from_sequence(data, copy=False)
assert arr._ndarray is data
arr = DatetimeArray._from_sequence(data, copy=True)
assert arr._ndarray is not data
@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"])
def test_numpy_datetime_unit(self, unit):
data = np.array([1, 2, 3], dtype=f"M8[{unit}]")
arr = DatetimeArray._from_sequence(data)
assert arr.unit == unit
assert arr[0].unit == unit
class TestSequenceToDT64NS:
def test_tz_dtype_mismatch_raises(self):
arr = DatetimeArray._from_sequence(
["2000"], dtype=DatetimeTZDtype(tz="US/Central")
)
with pytest.raises(TypeError, match="data is already tz-aware"):
DatetimeArray._from_sequence(arr, dtype=DatetimeTZDtype(tz="UTC"))
def test_tz_dtype_matches(self):
dtype = DatetimeTZDtype(tz="US/Central")
arr = DatetimeArray._from_sequence(["2000"], dtype=dtype)
result = DatetimeArray._from_sequence(arr, dtype=dtype)
tm.assert_equal(arr, result)
@pytest.mark.parametrize("order", ["F", "C"])
def test_2d(self, order):
dti = pd.date_range("2016-01-01", periods=6, tz="US/Pacific")
arr = np.array(dti, dtype=object).reshape(3, 2)
if order == "F":
arr = arr.T
res = DatetimeArray._from_sequence(arr, dtype=dti.dtype)
expected = DatetimeArray._from_sequence(arr.ravel(), dtype=dti.dtype).reshape(
arr.shape
)
tm.assert_datetime_array_equal(res, expected)
# ----------------------------------------------------------------------------
# Arrow interaction
EXTREME_VALUES = [0, 123456789, None, iNaT, 2**63 - 1, -(2**63) + 1]
FINE_TO_COARSE_SAFE = [123_000_000_000, None, -123_000_000_000]
COARSE_TO_FINE_SAFE = [123, None, -123]
@pytest.mark.parametrize(
("pa_unit", "pd_unit", "pa_tz", "pd_tz", "data"),
[
("s", "s", "UTC", "UTC", EXTREME_VALUES),
("ms", "ms", "UTC", "Europe/Berlin", EXTREME_VALUES),
("us", "us", "US/Eastern", "UTC", EXTREME_VALUES),
("ns", "ns", "US/Central", "Asia/Kolkata", EXTREME_VALUES),
("ns", "s", "UTC", "UTC", FINE_TO_COARSE_SAFE),
("us", "ms", "UTC", "Europe/Berlin", FINE_TO_COARSE_SAFE),
("ms", "us", "US/Eastern", "UTC", COARSE_TO_FINE_SAFE),
("s", "ns", "US/Central", "Asia/Kolkata", COARSE_TO_FINE_SAFE),
],
)
def test_from_arrow_with_different_units_and_timezones_with(
pa_unit, pd_unit, pa_tz, pd_tz, data
):
pa = pytest.importorskip("pyarrow")
pa_type = pa.timestamp(pa_unit, tz=pa_tz)
arr = pa.array(data, type=pa_type)
dtype = DatetimeTZDtype(unit=pd_unit, tz=pd_tz)
result = dtype.__from_arrow__(arr)
expected = DatetimeArray._from_sequence(data, dtype=f"M8[{pa_unit}, UTC]").astype(
dtype, copy=False
)
tm.assert_extension_array_equal(result, expected)
result = dtype.__from_arrow__(pa.chunked_array([arr]))
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize(
("unit", "tz"),
[
("s", "UTC"),
("ms", "Europe/Berlin"),
("us", "US/Eastern"),
("ns", "Asia/Kolkata"),
("ns", "UTC"),
],
)
def test_from_arrow_from_empty(unit, tz):
pa = pytest.importorskip("pyarrow")
data = []
arr = pa.array(data)
dtype = DatetimeTZDtype(unit=unit, tz=tz)
result = dtype.__from_arrow__(arr)
expected = DatetimeArray._from_sequence(np.array(data, dtype=f"datetime64[{unit}]"))
expected = expected.tz_localize(tz=tz)
tm.assert_extension_array_equal(result, expected)
result = dtype.__from_arrow__(pa.chunked_array([arr]))
tm.assert_extension_array_equal(result, expected)
def test_from_arrow_from_integers():
pa = pytest.importorskip("pyarrow")
data = [0, 123456789, None, 2**63 - 1, iNaT, -123456789]
arr = pa.array(data)
dtype = DatetimeTZDtype(unit="ns", tz="UTC")
result = dtype.__from_arrow__(arr)
expected = DatetimeArray._from_sequence(np.array(data, dtype="datetime64[ns]"))
expected = expected.tz_localize("UTC")
tm.assert_extension_array_equal(result, expected)
result = dtype.__from_arrow__(pa.chunked_array([arr]))
tm.assert_extension_array_equal(result, expected)

View File

@ -0,0 +1,44 @@
import pytest
import pandas._testing as tm
from pandas.core.arrays import DatetimeArray
class TestAccumulator:
def test_accumulators_freq(self):
# GH#50297
arr = DatetimeArray._from_sequence(
[
"2000-01-01",
"2000-01-02",
"2000-01-03",
],
dtype="M8[ns]",
)._with_freq("infer")
result = arr._accumulate("cummin")
expected = DatetimeArray._from_sequence(["2000-01-01"] * 3, dtype="M8[ns]")
tm.assert_datetime_array_equal(result, expected)
result = arr._accumulate("cummax")
expected = DatetimeArray._from_sequence(
[
"2000-01-01",
"2000-01-02",
"2000-01-03",
],
dtype="M8[ns]",
)
tm.assert_datetime_array_equal(result, expected)
@pytest.mark.parametrize("func", ["cumsum", "cumprod"])
def test_accumulators_disallowed(self, func):
# GH#50297
arr = DatetimeArray._from_sequence(
[
"2000-01-01",
"2000-01-02",
],
dtype="M8[ns]",
)._with_freq("infer")
with pytest.raises(TypeError, match=f"Accumulation {func}"):
arr._accumulate(func)

View File

@ -0,0 +1,183 @@
import numpy as np
import pytest
from pandas.core.dtypes.dtypes import DatetimeTZDtype
import pandas as pd
from pandas import NaT
import pandas._testing as tm
from pandas.core.arrays import DatetimeArray
class TestReductions:
@pytest.fixture(params=["s", "ms", "us", "ns"])
def unit(self, request):
return request.param
@pytest.fixture
def arr1d(self, tz_naive_fixture):
"""Fixture returning DatetimeArray with parametrized timezones"""
tz = tz_naive_fixture
dtype = DatetimeTZDtype(tz=tz) if tz is not None else np.dtype("M8[ns]")
arr = DatetimeArray._from_sequence(
[
"2000-01-03",
"2000-01-03",
"NaT",
"2000-01-02",
"2000-01-05",
"2000-01-04",
],
dtype=dtype,
)
return arr
def test_min_max(self, arr1d, unit):
arr = arr1d
arr = arr.as_unit(unit)
tz = arr.tz
result = arr.min()
expected = pd.Timestamp("2000-01-02", tz=tz).as_unit(unit)
assert result == expected
assert result.unit == expected.unit
result = arr.max()
expected = pd.Timestamp("2000-01-05", tz=tz).as_unit(unit)
assert result == expected
assert result.unit == expected.unit
result = arr.min(skipna=False)
assert result is NaT
result = arr.max(skipna=False)
assert result is NaT
@pytest.mark.parametrize("tz", [None, "US/Central"])
@pytest.mark.parametrize("skipna", [True, False])
def test_min_max_empty(self, skipna, tz):
dtype = DatetimeTZDtype(tz=tz) if tz is not None else np.dtype("M8[ns]")
arr = DatetimeArray._from_sequence([], dtype=dtype)
result = arr.min(skipna=skipna)
assert result is NaT
result = arr.max(skipna=skipna)
assert result is NaT
@pytest.mark.parametrize("tz", [None, "US/Central"])
@pytest.mark.parametrize("skipna", [True, False])
def test_median_empty(self, skipna, tz):
dtype = DatetimeTZDtype(tz=tz) if tz is not None else np.dtype("M8[ns]")
arr = DatetimeArray._from_sequence([], dtype=dtype)
result = arr.median(skipna=skipna)
assert result is NaT
arr = arr.reshape(0, 3)
result = arr.median(axis=0, skipna=skipna)
expected = type(arr)._from_sequence([NaT, NaT, NaT], dtype=arr.dtype)
tm.assert_equal(result, expected)
result = arr.median(axis=1, skipna=skipna)
expected = type(arr)._from_sequence([], dtype=arr.dtype)
tm.assert_equal(result, expected)
def test_median(self, arr1d):
arr = arr1d
result = arr.median()
assert result == arr[0]
result = arr.median(skipna=False)
assert result is NaT
result = arr.dropna().median(skipna=False)
assert result == arr[0]
result = arr.median(axis=0)
assert result == arr[0]
def test_median_axis(self, arr1d):
arr = arr1d
assert arr.median(axis=0) == arr.median()
assert arr.median(axis=0, skipna=False) is NaT
msg = r"abs\(axis\) must be less than ndim"
with pytest.raises(ValueError, match=msg):
arr.median(axis=1)
@pytest.mark.filterwarnings("ignore:All-NaN slice encountered:RuntimeWarning")
def test_median_2d(self, arr1d):
arr = arr1d.reshape(1, -1)
# axis = None
assert arr.median() == arr1d.median()
assert arr.median(skipna=False) is NaT
# axis = 0
result = arr.median(axis=0)
expected = arr1d
tm.assert_equal(result, expected)
# Since column 3 is all-NaT, we get NaT there with or without skipna
result = arr.median(axis=0, skipna=False)
expected = arr1d
tm.assert_equal(result, expected)
# axis = 1
result = arr.median(axis=1)
expected = type(arr)._from_sequence([arr1d.median()], dtype=arr.dtype)
tm.assert_equal(result, expected)
result = arr.median(axis=1, skipna=False)
expected = type(arr)._from_sequence([NaT], dtype=arr.dtype)
tm.assert_equal(result, expected)
def test_mean(self, arr1d):
arr = arr1d
# manually verified result
expected = arr[0] + 0.4 * pd.Timedelta(days=1)
result = arr.mean()
assert result == expected
result = arr.mean(skipna=False)
assert result is NaT
result = arr.dropna().mean(skipna=False)
assert result == expected
result = arr.mean(axis=0)
assert result == expected
def test_mean_2d(self):
dti = pd.date_range("2016-01-01", periods=6, tz="US/Pacific")
dta = dti._data.reshape(3, 2)
result = dta.mean(axis=0)
expected = dta[1]
tm.assert_datetime_array_equal(result, expected)
result = dta.mean(axis=1)
expected = dta[:, 0] + pd.Timedelta(hours=12)
tm.assert_datetime_array_equal(result, expected)
result = dta.mean(axis=None)
expected = dti.mean()
assert result == expected
@pytest.mark.parametrize("skipna", [True, False])
def test_mean_empty(self, arr1d, skipna):
arr = arr1d[:0]
assert arr.mean(skipna=skipna) is NaT
arr2d = arr.reshape(0, 3)
result = arr2d.mean(axis=0, skipna=skipna)
expected = DatetimeArray._from_sequence([NaT, NaT, NaT], dtype=arr.dtype)
tm.assert_datetime_array_equal(result, expected)
result = arr2d.mean(axis=1, skipna=skipna)
expected = arr # i.e. 1D, empty
tm.assert_datetime_array_equal(result, expected)
result = arr2d.mean(axis=None, skipna=skipna)
assert result is NaT

View File

@ -0,0 +1,48 @@
import numpy as np
import pytest
import pandas as pd
from pandas.core.arrays.floating import (
Float32Dtype,
Float64Dtype,
)
@pytest.fixture(params=[Float32Dtype, Float64Dtype])
def dtype(request):
"""Parametrized fixture returning a float 'dtype'"""
return request.param()
@pytest.fixture
def data(dtype):
"""Fixture returning 'data' array according to parametrized float 'dtype'"""
return pd.array(
list(np.arange(0.1, 0.9, 0.1))
+ [pd.NA]
+ list(np.arange(1, 9.8, 0.1))
+ [pd.NA]
+ [9.9, 10.0],
dtype=dtype,
)
@pytest.fixture
def data_missing(dtype):
"""
Fixture returning array with missing data according to parametrized float
'dtype'.
"""
return pd.array([np.nan, 0.1], dtype=dtype)
@pytest.fixture(params=["data", "data_missing"])
def all_data(request, data, data_missing):
"""Parametrized fixture returning 'data' or 'data_missing' float arrays.
Used to test dtype conversion with and without missing values.
"""
if request.param == "data":
return data
elif request.param == "data_missing":
return data_missing

View File

@ -0,0 +1,244 @@
import operator
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
# Basic test for the arithmetic array ops
# -----------------------------------------------------------------------------
@pytest.mark.parametrize(
"opname, exp",
[
("add", [1.1, 2.2, None, None, 5.5]),
("mul", [0.1, 0.4, None, None, 2.5]),
("sub", [0.9, 1.8, None, None, 4.5]),
("truediv", [10.0, 10.0, None, None, 10.0]),
("floordiv", [9.0, 9.0, None, None, 10.0]),
("mod", [0.1, 0.2, None, None, 0.0]),
],
ids=["add", "mul", "sub", "div", "floordiv", "mod"],
)
def test_array_op(dtype, opname, exp):
a = pd.array([1.0, 2.0, None, 4.0, 5.0], dtype=dtype)
b = pd.array([0.1, 0.2, 0.3, None, 0.5], dtype=dtype)
op = getattr(operator, opname)
result = op(a, b)
expected = pd.array(exp, dtype=dtype)
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)])
def test_divide_by_zero(dtype, zero, negative):
# TODO pending NA/NaN discussion
# https://github.com/pandas-dev/pandas/issues/32265/
a = pd.array([0, 1, -1, None], dtype=dtype)
result = a / zero
expected = FloatingArray(
np.array([np.nan, np.inf, -np.inf, np.nan], dtype=dtype.numpy_dtype),
np.array([False, False, False, True]),
)
if negative:
expected *= -1
tm.assert_extension_array_equal(result, expected)
def test_pow_scalar(dtype):
a = pd.array([-1, 0, 1, None, 2], dtype=dtype)
result = a**0
expected = pd.array([1, 1, 1, 1, 1], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
result = a**1
expected = pd.array([-1, 0, 1, None, 2], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
result = a**pd.NA
expected = pd.array([None, None, 1, None, None], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
result = a**np.nan
# TODO np.nan should be converted to pd.NA / missing before operation?
expected = FloatingArray(
np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype=dtype.numpy_dtype),
mask=a._mask,
)
tm.assert_extension_array_equal(result, expected)
# reversed
a = a[1:] # Can't raise integers to negative powers.
result = 0**a
expected = pd.array([1, 0, None, 0], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
result = 1**a
expected = pd.array([1, 1, 1, 1], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
result = pd.NA**a
expected = pd.array([1, None, None, None], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
result = np.nan**a
expected = FloatingArray(
np.array([1, np.nan, np.nan, np.nan], dtype=dtype.numpy_dtype), mask=a._mask
)
tm.assert_extension_array_equal(result, expected)
def test_pow_array(dtype):
a = pd.array([0, 0, 0, 1, 1, 1, None, None, None], dtype=dtype)
b = pd.array([0, 1, None, 0, 1, None, 0, 1, None], dtype=dtype)
result = a**b
expected = pd.array([1, 0, None, 1, 1, 1, 1, None, None], dtype=dtype)
tm.assert_extension_array_equal(result, expected)
def test_rpow_one_to_na():
# https://github.com/pandas-dev/pandas/issues/22022
# https://github.com/pandas-dev/pandas/issues/29997
arr = pd.array([np.nan, np.nan], dtype="Float64")
result = np.array([1.0, 2.0]) ** arr
expected = pd.array([1.0, np.nan], dtype="Float64")
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("other", [0, 0.5])
def test_arith_zero_dim_ndarray(other):
arr = pd.array([1, None, 2], dtype="Float64")
result = arr + np.array(other)
expected = arr + other
tm.assert_equal(result, expected)
# Test generic characteristics / errors
# -----------------------------------------------------------------------------
def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
op = all_arithmetic_operators
s = pd.Series(data)
ops = getattr(s, op)
if using_infer_string:
import pyarrow as pa
errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
else:
errs = TypeError
# invalid scalars
msg = "|".join(
[
r"can only perform ops with numeric values",
r"FloatingArray cannot perform the operation mod",
"unsupported operand type",
"not all arguments converted during string formatting",
"can't multiply sequence by non-int of type 'float'",
"ufunc 'subtract' cannot use operands with types dtype",
r"can only concatenate str \(not \"float\"\) to str",
"ufunc '.*' not supported for the input types, and the inputs could not",
"ufunc '.*' did not contain a loop with signature matching types",
"Concatenation operation is not implemented for NumPy arrays",
"has no kernel",
"not implemented",
]
)
with pytest.raises(errs, match=msg):
ops("foo")
with pytest.raises(errs, match=msg):
ops(pd.Timestamp("20180101"))
# invalid array-likes
with pytest.raises(errs, match=msg):
ops(pd.Series("foo", index=s.index))
msg = "|".join(
[
"can only perform ops with numeric values",
"cannot perform .* with this index type: DatetimeArray",
"Addition/subtraction of integers and integer-arrays "
"with DatetimeArray is no longer supported. *",
"unsupported operand type",
"not all arguments converted during string formatting",
"can't multiply sequence by non-int of type 'float'",
"ufunc 'subtract' cannot use operands with types dtype",
(
"ufunc 'add' cannot use operands with types "
rf"dtype\('{tm.ENDIAN}M8\[ns\]'\)"
),
r"ufunc 'add' cannot use operands with types dtype\('float\d{2}'\)",
"cannot subtract DatetimeArray from ndarray",
"has no kernel",
"not implemented",
]
)
with pytest.raises(errs, match=msg):
ops(pd.Series(pd.date_range("20180101", periods=len(s))))
# Various
# -----------------------------------------------------------------------------
def test_cross_type_arithmetic():
df = pd.DataFrame(
{
"A": pd.array([1, 2, np.nan], dtype="Float64"),
"B": pd.array([1, np.nan, 3], dtype="Float32"),
"C": np.array([1, 2, 3], dtype="float64"),
}
)
result = df.A + df.C
expected = pd.Series([2, 4, np.nan], dtype="Float64")
tm.assert_series_equal(result, expected)
result = (df.A + df.C) * 3 == 12
expected = pd.Series([False, True, None], dtype="boolean")
tm.assert_series_equal(result, expected)
result = df.A + df.B
expected = pd.Series([2, np.nan, np.nan], dtype="Float64")
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"source, neg_target, abs_target",
[
([1.1, 2.2, 3.3], [-1.1, -2.2, -3.3], [1.1, 2.2, 3.3]),
([1.1, 2.2, None], [-1.1, -2.2, None], [1.1, 2.2, None]),
([-1.1, 0.0, 1.1], [1.1, 0.0, -1.1], [1.1, 0.0, 1.1]),
],
)
def test_unary_float_operators(float_ea_dtype, source, neg_target, abs_target):
# GH38794
dtype = float_ea_dtype
arr = pd.array(source, dtype=dtype)
neg_result, pos_result, abs_result = -arr, +arr, abs(arr)
neg_target = pd.array(neg_target, dtype=dtype)
abs_target = pd.array(abs_target, dtype=dtype)
tm.assert_extension_array_equal(neg_result, neg_target)
tm.assert_extension_array_equal(pos_result, arr)
assert not tm.shares_memory(pos_result, arr)
tm.assert_extension_array_equal(abs_result, abs_target)
def test_bitwise(dtype):
left = pd.array([1, None, 3, 4], dtype=dtype)
right = pd.array([None, 3, 5, 4], dtype=dtype)
with pytest.raises(TypeError, match="unsupported operand type"):
left | right
with pytest.raises(TypeError, match="unsupported operand type"):
left & right
with pytest.raises(TypeError, match="unsupported operand type"):
left ^ right

View File

@ -0,0 +1,128 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
def test_astype():
# with missing values
arr = pd.array([0.1, 0.2, None], dtype="Float64")
with pytest.raises(ValueError, match="cannot convert NA to integer"):
arr.astype("int64")
with pytest.raises(ValueError, match="cannot convert float NaN to bool"):
arr.astype("bool")
result = arr.astype("float64")
expected = np.array([0.1, 0.2, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
# no missing values
arr = pd.array([0.0, 1.0, 0.5], dtype="Float64")
result = arr.astype("int64")
expected = np.array([0, 1, 0], dtype="int64")
tm.assert_numpy_array_equal(result, expected)
result = arr.astype("bool")
expected = np.array([False, True, True], dtype="bool")
tm.assert_numpy_array_equal(result, expected)
def test_astype_to_floating_array():
# astype to FloatingArray
arr = pd.array([0.0, 1.0, None], dtype="Float64")
result = arr.astype("Float64")
tm.assert_extension_array_equal(result, arr)
result = arr.astype(pd.Float64Dtype())
tm.assert_extension_array_equal(result, arr)
result = arr.astype("Float32")
expected = pd.array([0.0, 1.0, None], dtype="Float32")
tm.assert_extension_array_equal(result, expected)
def test_astype_to_boolean_array():
# astype to BooleanArray
arr = pd.array([0.0, 1.0, None], dtype="Float64")
result = arr.astype("boolean")
expected = pd.array([False, True, None], dtype="boolean")
tm.assert_extension_array_equal(result, expected)
result = arr.astype(pd.BooleanDtype())
tm.assert_extension_array_equal(result, expected)
def test_astype_to_integer_array():
# astype to IntegerArray
arr = pd.array([0.0, 1.5, None], dtype="Float64")
result = arr.astype("Int64")
expected = pd.array([0, 1, None], dtype="Int64")
tm.assert_extension_array_equal(result, expected)
def test_astype_str():
a = pd.array([0.1, 0.2, None], dtype="Float64")
expected = np.array(["0.1", "0.2", "<NA>"], dtype="U32")
tm.assert_numpy_array_equal(a.astype(str), expected)
tm.assert_numpy_array_equal(a.astype("str"), expected)
def test_astype_copy():
arr = pd.array([0.1, 0.2, None], dtype="Float64")
orig = pd.array([0.1, 0.2, None], dtype="Float64")
# copy=True -> ensure both data and mask are actual copies
result = arr.astype("Float64", copy=True)
assert result is not arr
assert not tm.shares_memory(result, arr)
result[0] = 10
tm.assert_extension_array_equal(arr, orig)
result[0] = pd.NA
tm.assert_extension_array_equal(arr, orig)
# copy=False
result = arr.astype("Float64", copy=False)
assert result is arr
assert np.shares_memory(result._data, arr._data)
assert np.shares_memory(result._mask, arr._mask)
result[0] = 10
assert arr[0] == 10
result[0] = pd.NA
assert arr[0] is pd.NA
# astype to different dtype -> always needs a copy -> even with copy=False
# we need to ensure that also the mask is actually copied
arr = pd.array([0.1, 0.2, None], dtype="Float64")
orig = pd.array([0.1, 0.2, None], dtype="Float64")
result = arr.astype("Float32", copy=False)
assert not tm.shares_memory(result, arr)
result[0] = 10
tm.assert_extension_array_equal(arr, orig)
result[0] = pd.NA
tm.assert_extension_array_equal(arr, orig)
def test_astype_object(dtype):
arr = pd.array([1.0, pd.NA], dtype=dtype)
result = arr.astype(object)
expected = np.array([1.0, pd.NA], dtype=object)
tm.assert_numpy_array_equal(result, expected)
# check exact element types
assert isinstance(result[0], float)
assert result[1] is pd.NA
def test_Float64_conversion():
# GH#40729
testseries = pd.Series(["1", "2", "3", "4"], dtype="object")
result = testseries.astype(pd.Float64Dtype())
expected = pd.Series([1.0, 2.0, 3.0, 4.0], dtype=pd.Float64Dtype())
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,65 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
from pandas.tests.arrays.masked_shared import (
ComparisonOps,
NumericOps,
)
class TestComparisonOps(NumericOps, ComparisonOps):
@pytest.mark.parametrize("other", [True, False, pd.NA, -1.0, 0.0, 1])
def test_scalar(self, other, comparison_op, dtype):
ComparisonOps.test_scalar(self, other, comparison_op, dtype)
def test_compare_with_integerarray(self, comparison_op):
op = comparison_op
a = pd.array([0, 1, None] * 3, dtype="Int64")
b = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype="Float64")
other = b.astype("Int64")
expected = op(a, other)
result = op(a, b)
tm.assert_extension_array_equal(result, expected)
expected = op(other, a)
result = op(b, a)
tm.assert_extension_array_equal(result, expected)
def test_equals():
# GH-30652
# equals is generally tested in /tests/extension/base/methods, but this
# specifically tests that two arrays of the same class but different dtype
# do not evaluate equal
a1 = pd.array([1, 2, None], dtype="Float64")
a2 = pd.array([1, 2, None], dtype="Float32")
assert a1.equals(a2) is False
def test_equals_nan_vs_na():
# GH#44382
mask = np.zeros(3, dtype=bool)
data = np.array([1.0, np.nan, 3.0], dtype=np.float64)
left = FloatingArray(data, mask)
assert left.equals(left)
tm.assert_extension_array_equal(left, left)
assert left.equals(left.copy())
assert left.equals(FloatingArray(data.copy(), mask.copy()))
mask2 = np.array([False, True, False], dtype=bool)
data2 = np.array([1.0, 2.0, 3.0], dtype=np.float64)
right = FloatingArray(data2, mask2)
assert right.equals(right)
tm.assert_extension_array_equal(right, right)
assert not left.equals(right)
# with mask[1] = True, the only difference is data[1], which should
# not matter for equals
mask[1] = True
assert left.equals(right)

View File

@ -0,0 +1,20 @@
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize(
"to_concat_dtypes, result_dtype",
[
(["Float64", "Float64"], "Float64"),
(["Float32", "Float64"], "Float64"),
(["Float32", "Float32"], "Float32"),
],
)
def test_concat_series(to_concat_dtypes, result_dtype):
result = pd.concat([pd.Series([1, 2, pd.NA], dtype=t) for t in to_concat_dtypes])
expected = pd.concat([pd.Series([1, 2, pd.NA], dtype=object)] * 2).astype(
result_dtype
)
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,204 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
from pandas.core.arrays.floating import (
Float32Dtype,
Float64Dtype,
)
def test_uses_pandas_na():
a = pd.array([1, None], dtype=Float64Dtype())
assert a[1] is pd.NA
def test_floating_array_constructor():
values = np.array([1, 2, 3, 4], dtype="float64")
mask = np.array([False, False, False, True], dtype="bool")
result = FloatingArray(values, mask)
expected = pd.array([1, 2, 3, np.nan], dtype="Float64")
tm.assert_extension_array_equal(result, expected)
tm.assert_numpy_array_equal(result._data, values)
tm.assert_numpy_array_equal(result._mask, mask)
msg = r".* should be .* numpy array. Use the 'pd.array' function instead"
with pytest.raises(TypeError, match=msg):
FloatingArray(values.tolist(), mask)
with pytest.raises(TypeError, match=msg):
FloatingArray(values, mask.tolist())
with pytest.raises(TypeError, match=msg):
FloatingArray(values.astype(int), mask)
msg = r"__init__\(\) missing 1 required positional argument: 'mask'"
with pytest.raises(TypeError, match=msg):
FloatingArray(values)
def test_floating_array_disallows_float16():
# GH#44715
arr = np.array([1, 2], dtype=np.float16)
mask = np.array([False, False])
msg = "FloatingArray does not support np.float16 dtype"
with pytest.raises(TypeError, match=msg):
FloatingArray(arr, mask)
def test_floating_array_disallows_Float16_dtype(request):
# GH#44715
with pytest.raises(TypeError, match="data type 'Float16' not understood"):
pd.array([1.0, 2.0], dtype="Float16")
def test_floating_array_constructor_copy():
values = np.array([1, 2, 3, 4], dtype="float64")
mask = np.array([False, False, False, True], dtype="bool")
result = FloatingArray(values, mask)
assert result._data is values
assert result._mask is mask
result = FloatingArray(values, mask, copy=True)
assert result._data is not values
assert result._mask is not mask
def test_to_array():
result = pd.array([0.1, 0.2, 0.3, 0.4])
expected = pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64")
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize(
"a, b",
[
([1, None], [1, pd.NA]),
([None], [pd.NA]),
([None, np.nan], [pd.NA, pd.NA]),
([1, np.nan], [1, pd.NA]),
([np.nan], [pd.NA]),
],
)
def test_to_array_none_is_nan(a, b):
result = pd.array(a, dtype="Float64")
expected = pd.array(b, dtype="Float64")
tm.assert_extension_array_equal(result, expected)
def test_to_array_mixed_integer_float():
result = pd.array([1, 2.0])
expected = pd.array([1.0, 2.0], dtype="Float64")
tm.assert_extension_array_equal(result, expected)
result = pd.array([1, None, 2.0])
expected = pd.array([1.0, None, 2.0], dtype="Float64")
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize(
"values",
[
["foo", "bar"],
"foo",
1,
1.0,
pd.date_range("20130101", periods=2),
np.array(["foo"]),
[[1, 2], [3, 4]],
[np.nan, {"a": 1}],
# GH#44514 all-NA case used to get quietly swapped out before checking ndim
np.array([pd.NA] * 6, dtype=object).reshape(3, 2),
],
)
def test_to_array_error(values):
# error in converting existing arrays to FloatingArray
msg = "|".join(
[
"cannot be converted to FloatingDtype",
"values must be a 1D list-like",
"Cannot pass scalar",
r"float\(\) argument must be a string or a (real )?number, not 'dict'",
"could not convert string to float: 'foo'",
r"could not convert string to float: np\.str_\('foo'\)",
]
)
with pytest.raises((TypeError, ValueError), match=msg):
pd.array(values, dtype="Float64")
@pytest.mark.parametrize("values", [["1", "2", None], ["1.5", "2", None]])
def test_construct_from_float_strings(values):
# see also test_to_integer_array_str
expected = pd.array([float(values[0]), 2, None], dtype="Float64")
res = pd.array(values, dtype="Float64")
tm.assert_extension_array_equal(res, expected)
res = FloatingArray._from_sequence(values)
tm.assert_extension_array_equal(res, expected)
def test_to_array_inferred_dtype():
# if values has dtype -> respect it
result = pd.array(np.array([1, 2], dtype="float32"))
assert result.dtype == Float32Dtype()
# if values have no dtype -> always float64
result = pd.array([1.0, 2.0])
assert result.dtype == Float64Dtype()
def test_to_array_dtype_keyword():
result = pd.array([1, 2], dtype="Float32")
assert result.dtype == Float32Dtype()
# if values has dtype -> override it
result = pd.array(np.array([1, 2], dtype="float32"), dtype="Float64")
assert result.dtype == Float64Dtype()
def test_to_array_integer():
result = pd.array([1, 2], dtype="Float64")
expected = pd.array([1.0, 2.0], dtype="Float64")
tm.assert_extension_array_equal(result, expected)
# for integer dtypes, the itemsize is not preserved
# TODO can we specify "floating" in general?
result = pd.array(np.array([1, 2], dtype="int32"), dtype="Float64")
assert result.dtype == Float64Dtype()
@pytest.mark.parametrize(
"bool_values, values, target_dtype, expected_dtype",
[
([False, True], [0, 1], Float64Dtype(), Float64Dtype()),
([False, True], [0, 1], "Float64", Float64Dtype()),
([False, True, np.nan], [0, 1, np.nan], Float64Dtype(), Float64Dtype()),
],
)
def test_to_array_bool(bool_values, values, target_dtype, expected_dtype):
result = pd.array(bool_values, dtype=target_dtype)
assert result.dtype == expected_dtype
expected = pd.array(values, dtype=target_dtype)
tm.assert_extension_array_equal(result, expected)
def test_series_from_float(data):
# construct from our dtype & string dtype
dtype = data.dtype
# from float
expected = pd.Series(data)
result = pd.Series(data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype))
tm.assert_series_equal(result, expected)
# from list
expected = pd.Series(data)
result = pd.Series(np.array(data).tolist(), dtype=str(dtype))
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,12 @@
import numpy as np
import pandas as pd
def test_contains_nan():
# GH#52840
arr = pd.array(range(5)) / 0
assert np.isnan(arr._data[0])
assert not arr.isna()[0]
assert np.nan in arr

View File

@ -0,0 +1,194 @@
import numpy as np
import pytest
from pandas.compat import IS64
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize("ufunc", [np.abs, np.sign])
# np.sign emits a warning with nans, <https://github.com/numpy/numpy/issues/15127>
@pytest.mark.filterwarnings("ignore:invalid value encountered in sign:RuntimeWarning")
def test_ufuncs_single(ufunc):
a = pd.array([1, 2, -3, np.nan], dtype="Float64")
result = ufunc(a)
expected = pd.array(ufunc(a.astype(float)), dtype="Float64")
tm.assert_extension_array_equal(result, expected)
s = pd.Series(a)
result = ufunc(s)
expected = pd.Series(expected)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("ufunc", [np.log, np.exp, np.sin, np.cos, np.sqrt])
def test_ufuncs_single_float(ufunc):
a = pd.array([1.0, 0.2, 3.0, np.nan], dtype="Float64")
with np.errstate(invalid="ignore"):
result = ufunc(a)
expected = pd.array(ufunc(a.astype(float)), dtype="Float64")
tm.assert_extension_array_equal(result, expected)
s = pd.Series(a)
with np.errstate(invalid="ignore"):
result = ufunc(s)
expected = pd.Series(ufunc(s.astype(float)), dtype="Float64")
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("ufunc", [np.add, np.subtract])
def test_ufuncs_binary_float(ufunc):
# two FloatingArrays
a = pd.array([1, 0.2, -3, np.nan], dtype="Float64")
result = ufunc(a, a)
expected = pd.array(ufunc(a.astype(float), a.astype(float)), dtype="Float64")
tm.assert_extension_array_equal(result, expected)
# FloatingArray with numpy array
arr = np.array([1, 2, 3, 4])
result = ufunc(a, arr)
expected = pd.array(ufunc(a.astype(float), arr), dtype="Float64")
tm.assert_extension_array_equal(result, expected)
result = ufunc(arr, a)
expected = pd.array(ufunc(arr, a.astype(float)), dtype="Float64")
tm.assert_extension_array_equal(result, expected)
# FloatingArray with scalar
result = ufunc(a, 1)
expected = pd.array(ufunc(a.astype(float), 1), dtype="Float64")
tm.assert_extension_array_equal(result, expected)
result = ufunc(1, a)
expected = pd.array(ufunc(1, a.astype(float)), dtype="Float64")
tm.assert_extension_array_equal(result, expected)
@pytest.mark.parametrize("values", [[0, 1], [0, None]])
def test_ufunc_reduce_raises(values):
arr = pd.array(values, dtype="Float64")
res = np.add.reduce(arr)
expected = arr.sum(skipna=False)
tm.assert_almost_equal(res, expected)
@pytest.mark.skipif(not IS64, reason="GH 36579: fail on 32-bit system")
@pytest.mark.parametrize(
"pandasmethname, kwargs",
[
("var", {"ddof": 0}),
("var", {"ddof": 1}),
("std", {"ddof": 0}),
("std", {"ddof": 1}),
("kurtosis", {}),
("skew", {}),
("sem", {}),
],
)
def test_stat_method(pandasmethname, kwargs):
s = pd.Series(data=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, np.nan, np.nan], dtype="Float64")
pandasmeth = getattr(s, pandasmethname)
result = pandasmeth(**kwargs)
s2 = pd.Series(data=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6], dtype="float64")
pandasmeth = getattr(s2, pandasmethname)
expected = pandasmeth(**kwargs)
assert expected == result
def test_value_counts_na():
arr = pd.array([0.1, 0.2, 0.1, pd.NA], dtype="Float64")
result = arr.value_counts(dropna=False)
idx = pd.Index([0.1, 0.2, pd.NA], dtype=arr.dtype)
assert idx.dtype == arr.dtype
expected = pd.Series([2, 1, 1], index=idx, dtype="Int64", name="count")
tm.assert_series_equal(result, expected)
result = arr.value_counts(dropna=True)
expected = pd.Series([2, 1], index=idx[:-1], dtype="Int64", name="count")
tm.assert_series_equal(result, expected)
def test_value_counts_empty():
ser = pd.Series([], dtype="Float64")
result = ser.value_counts()
idx = pd.Index([], dtype="Float64")
assert idx.dtype == "Float64"
expected = pd.Series([], index=idx, dtype="Int64", name="count")
tm.assert_series_equal(result, expected)
def test_value_counts_with_normalize():
ser = pd.Series([0.1, 0.2, 0.1, pd.NA], dtype="Float64")
result = ser.value_counts(normalize=True)
expected = pd.Series([2, 1], index=ser[:2], dtype="Float64", name="proportion") / 3
assert expected.index.dtype == ser.dtype
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("min_count", [0, 4])
def test_floating_array_sum(skipna, min_count, dtype):
arr = pd.array([1, 2, 3, None], dtype=dtype)
result = arr.sum(skipna=skipna, min_count=min_count)
if skipna and min_count == 0:
assert result == 6.0
else:
assert result is pd.NA
@pytest.mark.parametrize(
"values, expected", [([1, 2, 3], 6.0), ([1, 2, 3, None], 6.0), ([None], 0.0)]
)
def test_floating_array_numpy_sum(values, expected):
arr = pd.array(values, dtype="Float64")
result = np.sum(arr)
assert result == expected
@pytest.mark.parametrize("op", ["sum", "min", "max", "prod"])
def test_preserve_dtypes(op):
df = pd.DataFrame(
{
"A": ["a", "b", "b"],
"B": [1, None, 3],
"C": pd.array([0.1, None, 3.0], dtype="Float64"),
}
)
# op
result = getattr(df.C, op)()
assert isinstance(result, np.float64)
# groupby
result = getattr(df.groupby("A"), op)()
expected = pd.DataFrame(
{"B": np.array([1.0, 3.0]), "C": pd.array([0.1, 3], dtype="Float64")},
index=pd.Index(["a", "b"], name="A"),
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("method", ["min", "max"])
def test_floating_array_min_max(skipna, method, dtype):
arr = pd.array([0.0, 1.0, None], dtype=dtype)
func = getattr(arr, method)
result = func(skipna=skipna)
if skipna:
assert result == (0 if method == "min" else 1)
else:
assert result is pd.NA
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("min_count", [0, 9])
def test_floating_array_prod(skipna, min_count, dtype):
arr = pd.array([1.0, 2.0, None], dtype=dtype)
result = arr.prod(skipna=skipna, min_count=min_count)
if skipna and min_count == 0:
assert result == 2
else:
assert result is pd.NA

View File

@ -0,0 +1,47 @@
import numpy as np
import pytest
import pandas as pd
from pandas.core.arrays.floating import (
Float32Dtype,
Float64Dtype,
)
def test_dtypes(dtype):
# smoke tests on auto dtype construction
np.dtype(dtype.type).kind == "f"
assert dtype.name is not None
@pytest.mark.parametrize(
"dtype, expected",
[(Float32Dtype(), "Float32Dtype()"), (Float64Dtype(), "Float64Dtype()")],
)
def test_repr_dtype(dtype, expected):
assert repr(dtype) == expected
def test_repr_array():
result = repr(pd.array([1.0, None, 3.0]))
expected = "<FloatingArray>\n[1.0, <NA>, 3.0]\nLength: 3, dtype: Float64"
assert result == expected
def test_repr_array_long():
data = pd.array([1.0, 2.0, None] * 1000)
expected = """<FloatingArray>
[ 1.0, 2.0, <NA>, 1.0, 2.0, <NA>, 1.0, 2.0, <NA>, 1.0,
...
<NA>, 1.0, 2.0, <NA>, 1.0, 2.0, <NA>, 1.0, 2.0, <NA>]
Length: 3000, dtype: Float64"""
result = repr(data)
assert result == expected
def test_frame_repr(data_missing):
df = pd.DataFrame({"A": data_missing})
result = repr(df)
expected = " A\n0 <NA>\n1 0.1"
assert result == expected

View File

@ -0,0 +1,132 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import FloatingArray
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy(box):
con = pd.Series if box else pd.array
# default (with or without missing values) -> object dtype
arr = con([0.1, 0.2, 0.3], dtype="Float64")
result = arr.to_numpy()
expected = np.array([0.1, 0.2, 0.3], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
arr = con([0.1, 0.2, None], dtype="Float64")
result = arr.to_numpy()
expected = np.array([0.1, 0.2, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_float(box):
con = pd.Series if box else pd.array
# no missing values -> can convert to float, otherwise raises
arr = con([0.1, 0.2, 0.3], dtype="Float64")
result = arr.to_numpy(dtype="float64")
expected = np.array([0.1, 0.2, 0.3], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
arr = con([0.1, 0.2, None], dtype="Float64")
result = arr.to_numpy(dtype="float64")
expected = np.array([0.1, 0.2, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
result = arr.to_numpy(dtype="float64", na_value=np.nan)
expected = np.array([0.1, 0.2, np.nan], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_int(box):
con = pd.Series if box else pd.array
# no missing values -> can convert to int, otherwise raises
arr = con([1.0, 2.0, 3.0], dtype="Float64")
result = arr.to_numpy(dtype="int64")
expected = np.array([1, 2, 3], dtype="int64")
tm.assert_numpy_array_equal(result, expected)
arr = con([1.0, 2.0, None], dtype="Float64")
with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"):
result = arr.to_numpy(dtype="int64")
# automatic casting (floors the values)
arr = con([0.1, 0.9, 1.1], dtype="Float64")
result = arr.to_numpy(dtype="int64")
expected = np.array([0, 0, 1], dtype="int64")
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_na_value(box):
con = pd.Series if box else pd.array
arr = con([0.0, 1.0, None], dtype="Float64")
result = arr.to_numpy(dtype=object, na_value=None)
expected = np.array([0.0, 1.0, None], dtype="object")
tm.assert_numpy_array_equal(result, expected)
result = arr.to_numpy(dtype=bool, na_value=False)
expected = np.array([False, True, False], dtype="bool")
tm.assert_numpy_array_equal(result, expected)
result = arr.to_numpy(dtype="int64", na_value=-99)
expected = np.array([0, 1, -99], dtype="int64")
tm.assert_numpy_array_equal(result, expected)
def test_to_numpy_na_value_with_nan():
# array with both NaN and NA -> only fill NA with `na_value`
arr = FloatingArray(np.array([0.0, np.nan, 0.0]), np.array([False, False, True]))
result = arr.to_numpy(dtype="float64", na_value=-1)
expected = np.array([0.0, np.nan, -1.0], dtype="float64")
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("dtype", ["float64", "float32", "int32", "int64", "bool"])
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_dtype(box, dtype):
con = pd.Series if box else pd.array
arr = con([0.0, 1.0], dtype="Float64")
result = arr.to_numpy(dtype=dtype)
expected = np.array([0, 1], dtype=dtype)
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize("dtype", ["int32", "int64", "bool"])
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_na_raises(box, dtype):
con = pd.Series if box else pd.array
arr = con([0.0, 1.0, None], dtype="Float64")
with pytest.raises(ValueError, match=dtype):
arr.to_numpy(dtype=dtype)
@pytest.mark.parametrize("box", [True, False], ids=["series", "array"])
def test_to_numpy_string(box, dtype):
con = pd.Series if box else pd.array
arr = con([0.0, 1.0, None], dtype="Float64")
result = arr.to_numpy(dtype="str")
expected = np.array([0.0, 1.0, pd.NA], dtype=f"{tm.ENDIAN}U32")
tm.assert_numpy_array_equal(result, expected)
def test_to_numpy_copy():
# to_numpy can be zero-copy if no missing values
arr = pd.array([0.1, 0.2, 0.3], dtype="Float64")
result = arr.to_numpy(dtype="float64")
result[0] = 10
tm.assert_extension_array_equal(arr, pd.array([10, 0.2, 0.3], dtype="Float64"))
arr = pd.array([0.1, 0.2, 0.3], dtype="Float64")
result = arr.to_numpy(dtype="float64", copy=True)
result[0] = 10
tm.assert_extension_array_equal(arr, pd.array([0.1, 0.2, 0.3], dtype="Float64"))

Some files were not shown because too many files have changed in this diff Show More