venv
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,9 @@
|
||||
from typing import Any
|
||||
|
||||
from pandas import Index
|
||||
|
||||
|
||||
def allow_na_ops(obj: Any) -> bool:
    """
    Return whether test cases that include NaN can be run against ``obj``.

    The result is False when ``obj`` is an ``Index`` whose ``inferred_type``
    is "boolean", or when ``obj._can_hold_na`` is falsy; otherwise True.
    """
    is_bool_index = isinstance(obj, Index) and obj.inferred_type == "boolean"
    return not is_bool_index and obj._can_hold_na
|
@ -0,0 +1,179 @@
|
||||
from datetime import datetime
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import PYPY
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.accessor import PandasDelegate
|
||||
from pandas.core.base import (
|
||||
NoNewAttributesMixin,
|
||||
PandasObject,
|
||||
)
|
||||
|
||||
|
||||
def series_via_frame_from_dict(x, **kwargs):
    """Build a Series by storing ``x`` as column "a" of a DataFrame and selecting it.

    Extra keyword arguments are forwarded to the ``DataFrame`` constructor.
    """
    return DataFrame({"a": x}, **kwargs)["a"]
|
||||
|
||||
|
||||
def series_via_frame_from_scalar(x, **kwargs):
    """Build a Series by passing ``x`` straight to ``DataFrame`` and selecting column 0.

    Extra keyword arguments are forwarded to the ``DataFrame`` constructor.
    """
    return DataFrame(x, **kwargs)[0]
|
||||
|
||||
|
||||
@pytest.fixture(
    params=[
        Series,
        series_via_frame_from_dict,
        series_via_frame_from_scalar,
        Index,
    ],
    ids=["Series", "DataFrame-dict", "DataFrame-array", "Index"],
)
def constructor(request):
    """Parametrized fixture returning one constructor callable per test run.

    The callables are ``Series``, the two DataFrame-backed helpers and ``Index``.
    """
    return request.param
|
||||
|
||||
|
||||
class TestPandasDelegate:
    """Tests for ``PandasDelegate`` accessors that are registered but not overridden."""

    class Delegator:
        # Names handed to ``_add_delegate_accessors`` in the tests below.
        _properties = ["prop"]
        _methods = ["test_method"]

        def _set_prop(self, value):
            self.prop = value

        def _get_prop(self):
            return self.prop

        prop = property(_get_prop, _set_prop, doc="foo property")

        def test_method(self, *args, **kwargs):
            """a test method"""

    class Delegate(PandasDelegate, PandasObject):
        def __init__(self, obj) -> None:
            self.obj = obj

    def test_invalid_delegation(self):
        """Accessing or setting a delegated property raises TypeError by default."""
        # these show that in order for the delegation to work
        # the _delegate_* methods need to be overridden to not raise
        # a TypeError

        self.Delegate._add_delegate_accessors(
            delegate=self.Delegator,
            accessors=self.Delegator._properties,
            typ="property",
        )
        self.Delegate._add_delegate_accessors(
            delegate=self.Delegator, accessors=self.Delegator._methods, typ="method"
        )

        delegate = self.Delegate(self.Delegator())

        msg = "You cannot access the property prop"
        with pytest.raises(TypeError, match=msg):
            delegate.prop

        msg = "The property prop cannot be set"
        with pytest.raises(TypeError, match=msg):
            delegate.prop = 5

        # Reading must still raise after the failed assignment above.
        msg = "You cannot access the property prop"
        with pytest.raises(TypeError, match=msg):
            delegate.prop

    @pytest.mark.skipif(PYPY, reason="not relevant for PyPy")
    def test_memory_usage(self):
        """``sys.getsizeof`` works on a delegate that lacks ``memory_usage``."""
        # Delegate does not implement memory_usage.
        # Check that we fall back to in-built `__sizeof__`
        # GH 12924
        delegate = self.Delegate(self.Delegator())
        sys.getsizeof(delegate)
|
||||
|
||||
|
||||
class TestNoNewAttributesMixin:
    """Tests for the attribute-freezing behaviour of ``NoNewAttributesMixin``."""

    def test_mixin(self):
        """Attributes can be added before ``_freeze()`` and not afterwards."""

        class T(NoNewAttributesMixin):
            pass

        t = T()
        assert not hasattr(t, "__frozen")

        t.a = "test"
        assert t.a == "test"

        t._freeze()
        assert "__frozen" in dir(t)
        # getattr with a string is used on purpose: writing ``t.__frozen`` inside
        # this class body would be name-mangled by Python.
        assert getattr(t, "__frozen")
        msg = "You cannot add any new attribute"
        with pytest.raises(AttributeError, match=msg):
            t.b = "test"

        assert not hasattr(t, "b")
|
||||
|
||||
|
||||
class TestConstruction:
    """Constructor dtype-inference behaviour shared by Series, Index and DataFrame."""

    # test certain constructor behaviours on dtype inference across Series,
    # Index and DataFrame

    @pytest.mark.parametrize(
        "a",
        [
            np.array(["2263-01-01"], dtype="datetime64[D]"),
            np.array([datetime(2263, 1, 1)], dtype=object),
            np.array([np.datetime64("2263-01-01", "D")], dtype=object),
            np.array(["2263-01-01"], dtype=object),
        ],
        ids=[
            "datetime64[D]",
            "object-datetime.datetime",
            "object-numpy-scalar",
            "object-string",
        ],
    )
    def test_constructor_datetime_outofbound(
        self, a, constructor, request, using_infer_string
    ):
        """Out-of-ns-bounds dates infer a wider dtype but fail with explicit ns dtype."""
        # GH-26853 (+ bug GH-26206 out of bound non-ns unit)

        # No dtype specified (dtype inference)
        # datetime64[non-ns] input is expected to come back as "M8[s]"; the
        # object-dtype inputs keep object (or string) dtype and their original data
        result = constructor(a)
        if a.dtype.kind == "M":
            # Can't fit in nanosecond bounds -> get the nearest supported unit
            assert result.dtype == "M8[s]"
        else:
            if using_infer_string and "object-string" in request.node.callspec.id:
                assert result.dtype == "string"
            else:
                assert result.dtype == "object"
            tm.assert_numpy_array_equal(result.to_numpy(), a)

        # Explicit dtype specified
        # Forced conversion fails for all -> all cases raise error
        msg = "Out of bounds|Out of bounds .* present at position 0"
        with pytest.raises(pd.errors.OutOfBoundsDatetime, match=msg):
            constructor(a, dtype="datetime64[ns]")

    def test_constructor_datetime_nonns(self, constructor):
        """A datetime64[us] ndarray keeps its unit, including when read-only."""
        arr = np.array(["2020-01-01T00:00:00.000000"], dtype="datetime64[us]")
        dta = pd.core.arrays.DatetimeArray._simple_new(arr, dtype=arr.dtype)
        expected = constructor(dta)
        assert expected.dtype == arr.dtype

        result = constructor(arr)
        tm.assert_equal(result, expected)

        # https://github.com/pandas-dev/pandas/issues/34843
        arr.flags.writeable = False
        result = constructor(arr)
        tm.assert_equal(result, expected)
|
@ -0,0 +1,562 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes.dtypes import DatetimeTZDtype
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
CategoricalIndex,
|
||||
Series,
|
||||
Timedelta,
|
||||
Timestamp,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.arrays import (
|
||||
DatetimeArray,
|
||||
IntervalArray,
|
||||
NumpyExtensionArray,
|
||||
PeriodArray,
|
||||
SparseArray,
|
||||
TimedeltaArray,
|
||||
)
|
||||
from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
|
||||
|
||||
|
||||
class TestToIterable:
    """Check that iterating an Index/Series yields Python or pandas scalar types."""

    # test that we convert an iterable to python types

    # (dtype passed to the constructor, scalar type expected from iteration)
    dtypes = [
        ("int8", int),
        ("int16", int),
        ("int32", int),
        ("int64", int),
        ("uint8", int),
        ("uint16", int),
        ("uint32", int),
        ("uint64", int),
        ("float16", float),
        ("float32", float),
        ("float64", float),
        ("datetime64[ns]", Timestamp),
        ("datetime64[ns, US/Eastern]", Timestamp),
        ("timedelta64[ns]", Timedelta),
    ]

    # The four ways of turning an object into a list that several tests below
    # exercise; declared once so the callables and their ids cannot drift apart.
    _parametrize_list_method = pytest.mark.parametrize(
        "method",
        [
            lambda x: x.tolist(),
            lambda x: x.to_list(),
            lambda x: list(x),
            lambda x: list(x.__iter__()),
        ],
        ids=["tolist", "to_list", "list", "iter"],
    )

    @pytest.mark.parametrize("dtype, rdtype", dtypes)
    @_parametrize_list_method
    def test_iterable(self, index_or_series, method, dtype, rdtype):
        """The first listed element is an instance of ``rdtype``."""
        # gh-10904
        # gh-13258
        # coerce iteration to underlying python / pandas types
        typ = index_or_series
        if dtype == "float16" and issubclass(typ, pd.Index):
            with pytest.raises(NotImplementedError, match="float16 indexes are not "):
                typ([1], dtype=dtype)
            return
        s = typ([1], dtype=dtype)
        result = method(s)[0]
        assert isinstance(result, rdtype)

    @pytest.mark.parametrize(
        "dtype, rdtype, obj",
        [
            ("object", object, "a"),
            ("object", int, 1),
            ("category", object, "a"),
            ("category", int, 1),
        ],
    )
    @_parametrize_list_method
    def test_iterable_object_and_category(
        self, index_or_series, method, dtype, rdtype, obj
    ):
        """Object and category containers hand back the stored Python object type."""
        # gh-10904
        # gh-13258
        # coerce iteration to underlying python / pandas types
        typ = index_or_series
        s = typ([obj], dtype=dtype)
        result = method(s)[0]
        assert isinstance(result, rdtype)

    @pytest.mark.parametrize("dtype, rdtype", dtypes)
    def test_iterable_items(self, dtype, rdtype):
        """``Series.items`` yields values that are instances of ``rdtype``."""
        # gh-13258
        # test if items yields the correct boxed scalars
        # this only applies to series
        s = Series([1], dtype=dtype)
        _, result = next(iter(s.items()))
        assert isinstance(result, rdtype)

    @pytest.mark.parametrize(
        "dtype, rdtype", dtypes + [("object", int), ("category", int)]
    )
    def test_iterable_map(self, index_or_series, dtype, rdtype):
        """``map(type)`` reports one of the expected scalar types."""
        # gh-13236
        # coerce iteration to underlying python / pandas types
        typ = index_or_series
        if dtype == "float16" and issubclass(typ, pd.Index):
            with pytest.raises(NotImplementedError, match="float16 indexes are not "):
                typ([1], dtype=dtype)
            return
        s = typ([1], dtype=dtype)
        result = s.map(type)[0]
        if not isinstance(rdtype, tuple):
            rdtype = (rdtype,)
        assert result in rdtype

    @_parametrize_list_method
    def test_categorial_datetimelike(self, method):
        """Listing a datetime ``CategoricalIndex`` gives ``Timestamp`` elements."""
        i = CategoricalIndex([Timestamp("1999-12-31"), Timestamp("2000-12-31")])

        result = method(i)[0]
        assert isinstance(result, Timestamp)

    def test_iter_box_dt64(self, unit):
        """Iterating a tz-naive datetime Series yields Timestamps with the Series unit."""
        vals = [Timestamp("2011-01-01"), Timestamp("2011-01-02")]
        ser = Series(vals).dt.as_unit(unit)
        assert ser.dtype == f"datetime64[{unit}]"
        for res, exp in zip(ser, vals):
            assert isinstance(res, Timestamp)
            assert res.tz is None
            assert res == exp
            assert res.unit == unit

    def test_iter_box_dt64tz(self, unit):
        """Iterating a tz-aware datetime Series keeps both the tz and the unit."""
        vals = [
            Timestamp("2011-01-01", tz="US/Eastern"),
            Timestamp("2011-01-02", tz="US/Eastern"),
        ]
        ser = Series(vals).dt.as_unit(unit)

        assert ser.dtype == f"datetime64[{unit}, US/Eastern]"
        for res, exp in zip(ser, vals):
            assert isinstance(res, Timestamp)
            assert res.tz == exp.tz
            assert res == exp
            assert res.unit == unit

    def test_iter_box_timedelta64(self, unit):
        """Iterating a timedelta Series yields Timedeltas with the Series unit."""
        # timedelta
        vals = [Timedelta("1 days"), Timedelta("2 days")]
        ser = Series(vals).dt.as_unit(unit)
        assert ser.dtype == f"timedelta64[{unit}]"
        for res, exp in zip(ser, vals):
            assert isinstance(res, Timedelta)
            assert res == exp
            assert res.unit == unit

    def test_iter_box_period(self):
        """Iterating a Period Series yields ``Period`` objects equal to the inputs."""
        # period
        vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")]
        s = Series(vals)
        assert s.dtype == "Period[M]"
        for res, exp in zip(s, vals):
            assert isinstance(res, pd.Period)
            assert res.freq == "ME"
            assert res == exp
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "arr, expected_type, dtype",
    [
        (np.array([0, 1], dtype=np.int64), np.ndarray, "int64"),
        (np.array(["a", "b"]), np.ndarray, "object"),
        (pd.Categorical(["a", "b"]), pd.Categorical, "category"),
        (
            pd.DatetimeIndex(["2017", "2018"], tz="US/Central"),
            DatetimeArray,
            "datetime64[ns, US/Central]",
        ),
        (
            pd.PeriodIndex([2018, 2019], freq="Y"),
            PeriodArray,
            pd.core.dtypes.dtypes.PeriodDtype("Y-DEC"),
        ),
        (pd.IntervalIndex.from_breaks([0, 1, 2]), IntervalArray, "interval"),
        (
            pd.DatetimeIndex(["2017", "2018"]),
            DatetimeArray,
            "datetime64[ns]",
        ),
        (
            pd.TimedeltaIndex([10**10]),
            TimedeltaArray,
            "m8[ns]",
        ),
    ],
)
def test_values_consistent(arr, expected_type, dtype, using_infer_string):
    """``Series._values`` and ``Index._values`` have the same type and contents."""
    if using_infer_string and dtype == "object":
        # With string inference enabled the object-dtype case is expected to be
        # backed by the Arrow string array type instead of an ndarray.
        expected_type = ArrowStringArrayNumpySemantics
    l_values = Series(arr)._values
    r_values = pd.Index(arr)._values
    assert type(l_values) is expected_type
    assert type(l_values) is type(r_values)

    tm.assert_equal(l_values, r_values)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("arr", [np.array([1, 2, 3])])
def test_numpy_array(arr):
    """``Series.array`` on ndarray-backed data equals ``NumpyExtensionArray(arr)``."""
    ser = Series(arr)
    result = ser.array
    expected = NumpyExtensionArray(arr)
    tm.assert_extension_array_equal(result, expected)
|
||||
|
||||
|
||||
def test_numpy_array_all_dtypes(any_numpy_dtype):
    """``Series.array`` has the array type that matches the NumPy dtype kind."""
    ser = Series(dtype=any_numpy_dtype)
    result = ser.array
    # Resolve the dtype kind once instead of once per branch.
    kind = np.dtype(any_numpy_dtype).kind
    if kind == "M":
        assert isinstance(result, DatetimeArray)
    elif kind == "m":
        assert isinstance(result, TimedeltaArray)
    else:
        assert isinstance(result, NumpyExtensionArray)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "arr, attr",
    [
        (pd.Categorical(["a", "b"]), "_codes"),
        (PeriodArray._from_sequence(["2000", "2001"], dtype="period[D]"), "_ndarray"),
        (pd.array([0, np.nan], dtype="Int64"), "_data"),
        (IntervalArray.from_breaks([0, 1]), "_left"),
        (SparseArray([0, 1]), "_sparse_values"),
        (
            DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[ns]")),
            "_ndarray",
        ),
        # tz-aware Datetime
        (
            DatetimeArray._from_sequence(
                np.array(
                    ["2000-01-01T12:00:00", "2000-01-02T12:00:00"], dtype="M8[ns]"
                ),
                dtype=DatetimeTZDtype(tz="US/Central"),
            ),
            "_ndarray",
        ),
    ],
)
def test_array(arr, attr, index_or_series, request):
    """``.array`` of a ``copy=False`` container shares the input's backing attribute."""
    box = index_or_series

    result = box(arr, copy=False).array

    if attr:
        # Compare the named backing attribute of both arrays by identity.
        arr = getattr(arr, attr)
        result = getattr(result, attr)

    assert result is arr
|
||||
|
||||
|
||||
def test_array_multiindex_raises():
    """Accessing ``MultiIndex.array`` raises ``ValueError``."""
    idx = pd.MultiIndex.from_product([["A"], ["a", "b"]])
    msg = "MultiIndex has no single backing array"
    with pytest.raises(ValueError, match=msg):
        idx.array
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "arr, expected",
    [
        (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)),
        (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object)),
        (
            pd.core.arrays.period_array(["2000", "2001"], freq="D"),
            np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]),
        ),
        (pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan])),
        (
            IntervalArray.from_breaks([0, 1, 2]),
            np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object),
        ),
        (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)),
        # tz-naive datetime
        (
            DatetimeArray._from_sequence(np.array(["2000", "2001"], dtype="M8[ns]")),
            np.array(["2000", "2001"], dtype="M8[ns]"),
        ),
        # tz-aware stays tz-aware
        (
            DatetimeArray._from_sequence(
                np.array(["2000-01-01T06:00:00", "2000-01-02T06:00:00"], dtype="M8[ns]")
            )
            .tz_localize("UTC")
            .tz_convert("US/Central"),
            np.array(
                [
                    Timestamp("2000-01-01", tz="US/Central"),
                    Timestamp("2000-01-02", tz="US/Central"),
                ]
            ),
        ),
        # Timedelta
        (
            TimedeltaArray._from_sequence(
                np.array([0, 3600000000000], dtype="i8").view("m8[ns]")
            ),
            np.array([0, 3600000000000], dtype="m8[ns]"),
        ),
        # GH#26406 tz is preserved in Categorical[dt64tz]
        (
            pd.Categorical(date_range("2016-01-01", periods=2, tz="US/Pacific")),
            np.array(
                [
                    Timestamp("2016-01-01", tz="US/Pacific"),
                    Timestamp("2016-01-02", tz="US/Pacific"),
                ]
            ),
        ),
    ],
)
def test_to_numpy(arr, expected, index_or_series_or_array, request):
    """``to_numpy()`` and ``np.asarray`` both produce the expected ndarray."""
    box = index_or_series_or_array

    # Wrapping the array must not emit any warning.
    with tm.assert_produces_warning(None):
        thing = box(arr)

    result = thing.to_numpy()
    tm.assert_numpy_array_equal(result, expected)

    result = np.asarray(thing)
    tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("as_series", [True, False])
@pytest.mark.parametrize(
    "arr", [np.array([1, 2, 3], dtype="int64"), np.array(["a", "b", "c"], dtype=object)]
)
def test_to_numpy_copy(arr, as_series, using_infer_string):
    """``to_numpy`` shares memory with the input unless a copy is requested."""
    obj = pd.Index(arr, copy=False)
    if as_series:
        obj = Series(obj.values, copy=False)

    # Object-dtype input under string inference is the one case where the
    # result is expected NOT to share memory even without an explicit copy.
    expect_shared = not (using_infer_string and arr.dtype == object)

    # no copy by default
    result = obj.to_numpy()
    assert np.shares_memory(arr, result) is expect_shared

    result = obj.to_numpy(copy=False)
    assert np.shares_memory(arr, result) is expect_shared

    # copy=True
    result = obj.to_numpy(copy=True)
    assert np.shares_memory(arr, result) is False
|
||||
|
||||
|
||||
@pytest.mark.parametrize("as_series", [True, False])
def test_to_numpy_dtype(as_series, unit):
    """tz-aware data converts to object Timestamps by default and to UTC for M8[ns]."""
    # NOTE(review): the ``unit`` fixture is requested but not used in the body.
    tz = "US/Eastern"
    obj = pd.DatetimeIndex(["2000", "2001"], tz=tz)
    if as_series:
        obj = Series(obj)

    # preserve tz by default
    result = obj.to_numpy()
    expected = np.array(
        [Timestamp("2000", tz=tz), Timestamp("2001", tz=tz)], dtype=object
    )
    tm.assert_numpy_array_equal(result, expected)

    result = obj.to_numpy(dtype="object")
    tm.assert_numpy_array_equal(result, expected)

    result = obj.to_numpy(dtype="M8[ns]")
    expected = np.array(["2000-01-01T05", "2001-01-01T05"], dtype="M8[ns]")
    tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values, dtype, na_value, expected",
    [
        ([1, 2, None], "float64", 0, [1.0, 2.0, 0.0]),
        (
            [Timestamp("2000"), Timestamp("2000"), pd.NaT],
            None,
            Timestamp("2000"),
            [np.datetime64("2000-01-01T00:00:00.000000000")] * 3,
        ),
    ],
)
def test_to_numpy_na_value_numpy_dtype(
    index_or_series, values, dtype, na_value, expected
):
    """``to_numpy(na_value=...)`` substitutes missing entries for NumPy-backed data."""
    obj = index_or_series(values)
    result = obj.to_numpy(dtype=dtype, na_value=na_value)
    expected = np.array(expected)
    tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data, multiindex, dtype, na_value, expected",
    [
        (
            [1, 2, None, 4],
            [(0, "a"), (0, "b"), (1, "b"), (1, "c")],
            float,
            None,
            [1.0, 2.0, np.nan, 4.0],
        ),
        (
            [1, 2, None, 4],
            [(0, "a"), (0, "b"), (1, "b"), (1, "c")],
            float,
            np.nan,
            [1.0, 2.0, np.nan, 4.0],
        ),
        (
            [1.0, 2.0, np.nan, 4.0],
            [("a", 0), ("a", 1), ("a", 2), ("b", 0)],
            int,
            0,
            [1, 2, 0, 4],
        ),
        (
            [Timestamp("2000"), Timestamp("2000"), pd.NaT],
            [(0, Timestamp("2021")), (0, Timestamp("2022")), (1, Timestamp("2000"))],
            None,
            Timestamp("2000"),
            [np.datetime64("2000-01-01T00:00:00.000000000")] * 3,
        ),
    ],
)
def test_to_numpy_multiindex_series_na_value(
    data, multiindex, dtype, na_value, expected
):
    """``Series.to_numpy(na_value=...)`` works when the Series has a MultiIndex."""
    index = pd.MultiIndex.from_tuples(multiindex)
    series = Series(data, index=index)
    result = series.to_numpy(dtype=dtype, na_value=na_value)
    expected = np.array(expected)
    tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
def test_to_numpy_kwargs_raises():
    """An unknown keyword passed to ``Series.to_numpy`` raises ``TypeError``."""
    msg = r"to_numpy\(\) got an unexpected keyword argument 'foo'"

    # numpy
    s = Series([1, 2, 3])
    with pytest.raises(TypeError, match=msg):
        s.to_numpy(foo=True)

    # extension
    s = Series([1, 2, 3], dtype="Int64")
    with pytest.raises(TypeError, match=msg):
        s.to_numpy(foo=True)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data",
    [
        {"a": [1, 2, 3], "b": [1, 2, None]},
        {"a": np.array([1, 2, 3]), "b": np.array([1, 2, np.nan])},
        {"a": pd.array([1, 2, 3]), "b": pd.array([1, 2, None])},
    ],
)
@pytest.mark.parametrize("dtype, na_value", [(float, np.nan), (object, None)])
def test_to_numpy_dataframe_na_value(data, dtype, na_value):
    """``DataFrame.to_numpy`` places ``na_value`` where the frame has a missing entry."""
    # https://github.com/pandas-dev/pandas/issues/33820
    df = pd.DataFrame(data)
    result = df.to_numpy(dtype=dtype, na_value=na_value)
    expected = np.array([[1, 1], [2, 2], [3, na_value]], dtype=dtype)
    tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data, expected",
    [
        (
            {"a": pd.array([1, 2, None])},
            np.array([[1.0], [2.0], [np.nan]], dtype=float),
        ),
        (
            {"a": [1, 2, 3], "b": [1, 2, 3]},
            np.array([[1, 1], [2, 2], [3, 3]], dtype=float),
        ),
    ],
)
def test_to_numpy_dataframe_single_block(data, expected):
    """``DataFrame.to_numpy(dtype=float, na_value=np.nan)`` gives the float array."""
    # https://github.com/pandas-dev/pandas/issues/33820
    df = pd.DataFrame(data)
    result = df.to_numpy(dtype=float, na_value=np.nan)
    tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
def test_to_numpy_dataframe_single_block_no_mutate():
    """Calling ``to_numpy(na_value=...)`` leaves the source DataFrame unchanged."""
    # https://github.com/pandas-dev/pandas/issues/33820
    result = pd.DataFrame(np.array([1.0, 2.0, np.nan]))
    expected = pd.DataFrame(np.array([1.0, 2.0, np.nan]))
    # The return value is discarded on purpose: only the effect on ``result``
    # itself is checked.
    result.to_numpy(na_value=0.0)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestAsArray:
    """``np.asarray`` behaviour for datetime Series, with and without a timezone."""

    @pytest.mark.parametrize("tz", [None, "US/Central"])
    def test_asarray_object_dt64(self, tz):
        """``dtype=object`` yields Timestamps carrying ``tz`` without any warning."""
        ser = Series(date_range("2000", periods=2, tz=tz))

        with tm.assert_produces_warning(None):
            # Future behavior (for tzaware case) with no warning
            result = np.asarray(ser, dtype=object)

        expected = np.array(
            [Timestamp("2000-01-01", tz=tz), Timestamp("2000-01-02", tz=tz)]
        )
        tm.assert_numpy_array_equal(result, expected)

    def test_asarray_tz_naive(self):
        """A tz-naive Series converts to an ``M8[ns]`` array."""
        # This shouldn't produce a warning.
        ser = Series(date_range("2000", periods=2))
        expected = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]")
        result = np.asarray(ser)

        tm.assert_numpy_array_equal(result, expected)

    def test_asarray_tz_aware(self):
        """A tz-aware Series converts to UTC values when a datetime64 dtype is given."""
        tz = "US/Central"
        ser = Series(date_range("2000", periods=2, tz=tz))
        expected = np.array(["2000-01-01T06", "2000-01-02T06"], dtype="M8[ns]")
        result = np.asarray(ser, dtype="datetime64[ns]")

        tm.assert_numpy_array_equal(result, expected)

        # Old behavior with no warning
        result = np.asarray(ser, dtype="M8[ns]")

        tm.assert_numpy_array_equal(result, expected)
|
@ -0,0 +1,60 @@
|
||||
"""
|
||||
Though Index.fillna and Series.fillna have separate implementations,
|
||||
test here to confirm that they behave the same way
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import MultiIndex
|
||||
import pandas._testing as tm
|
||||
from pandas.tests.base.common import allow_na_ops
|
||||
|
||||
|
||||
def test_fillna(index_or_series_obj):
    """``fillna`` returns an equal but distinct object; MultiIndex raises instead."""
    # GH 11343
    obj = index_or_series_obj

    if isinstance(obj, MultiIndex):
        msg = "isna is not defined for MultiIndex"
        with pytest.raises(NotImplementedError, match=msg):
            obj.fillna(0)
        return

    # values will not be changed
    fill_value = obj.values[0] if len(obj) > 0 else 0
    result = obj.fillna(fill_value)

    tm.assert_equal(obj, result)

    # check shallow_copied
    assert obj is not result
|
||||
|
||||
|
||||
@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_fillna_null(null_obj, index_or_series_obj):
    """``fillna`` replaces nulls written into the first two positions."""
    # GH 11343
    obj = index_or_series_obj
    klass = type(obj)

    if not allow_na_ops(obj):
        pytest.skip(f"{klass} doesn't allow for NA operations")
    elif len(obj) < 1:
        pytest.skip("Test doesn't make sense on empty data")
    elif isinstance(obj, MultiIndex):
        pytest.skip(f"MultiIndex can't hold '{null_obj}'")

    values = obj._values
    fill_value = values[0]
    # Copy before nulling so ``expected`` starts from the original values.
    expected = values.copy()
    values[0:2] = null_obj
    expected[0:2] = fill_value

    expected = klass(expected)
    obj = klass(values)

    result = obj.fillna(fill_value)
    tm.assert_equal(result, expected)

    # check shallow_copied
    assert obj is not result
|
191
venv/lib/python3.12/site-packages/pandas/tests/base/test_misc.py
Normal file
191
venv/lib/python3.12/site-packages/pandas/tests/base/test_misc.py
Normal file
@ -0,0 +1,191 @@
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._config import using_pyarrow_string_dtype
|
||||
|
||||
from pandas.compat import PYPY
|
||||
|
||||
from pandas.core.dtypes.common import (
|
||||
is_dtype_equal,
|
||||
is_object_dtype,
|
||||
)
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Index,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_isnull_notnull_docstrings():
    """The ``isnull`` / ``notnull`` docstrings begin by naming the method they alias."""
    # GH#41855 make sure it's clear these are aliases
    doc = pd.DataFrame.notnull.__doc__
    assert doc.startswith("\nDataFrame.notnull is an alias for DataFrame.notna.\n")
    doc = pd.DataFrame.isnull.__doc__
    assert doc.startswith("\nDataFrame.isnull is an alias for DataFrame.isna.\n")

    doc = Series.notnull.__doc__
    assert doc.startswith("\nSeries.notnull is an alias for Series.notna.\n")
    doc = Series.isnull.__doc__
    assert doc.startswith("\nSeries.isnull is an alias for Series.isna.\n")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "op_name, op",
    [
        ("add", "+"),
        ("sub", "-"),
        ("mul", "*"),
        ("mod", "%"),
        ("pow", "**"),
        ("truediv", "/"),
        ("floordiv", "//"),
    ],
)
def test_binary_ops_docstring(frame_or_series, op_name, op):
    """Forward and reflected binary-op docstrings spell out the operation."""
    # not using the all_arithmetic_functions fixture with _get_opstr
    # as _get_opstr is used internally in the dynamic implementation of the docstring
    klass = frame_or_series

    operand1 = klass.__name__.lower()
    operand2 = "other"
    expected_str = " ".join([operand1, op, operand2])
    assert expected_str in getattr(klass, op_name).__doc__

    # reverse version of the binary ops
    expected_str = " ".join([operand2, op, operand1])
    assert expected_str in getattr(klass, "r" + op_name).__doc__
|
||||
|
||||
|
||||
def test_ndarray_compat_properties(index_or_series_obj):
    """ndarray-like properties exist, removed ones do not, ``item()`` needs size 1."""
    obj = index_or_series_obj

    # Check that we work.
    for p in ["shape", "dtype", "T", "nbytes"]:
        assert getattr(obj, p, None) is not None

    # deprecated properties
    for p in ["strides", "itemsize", "base", "data"]:
        assert not hasattr(obj, p)

    msg = "can only convert an array of size 1 to a Python scalar"
    with pytest.raises(ValueError, match=msg):
        obj.item()  # len > 1

    assert obj.ndim == 1
    assert obj.size == len(obj)

    assert Index([1]).item() == 1
    assert Series([1]).item() == 1
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    PYPY or using_pyarrow_string_dtype(),
    reason="not relevant for PyPy doesn't work properly for arrow strings",
)
def test_memory_usage(index_or_series_memory_obj):
    """Shallow and deep ``memory_usage`` are consistent with ``sys.getsizeof``."""
    obj = index_or_series_memory_obj
    is_ser = isinstance(obj, Series)

    # Clear index caches so that len(obj) == 0 reports 0 memory usage
    index = obj.index if is_ser else obj
    index._engine.clear_mapping()

    res = obj.memory_usage()
    res_deep = obj.memory_usage(deep=True)

    # For a Series, the dtype of its index is taken into account as well.
    is_object = is_object_dtype(obj) or (is_ser and is_object_dtype(obj.index))
    is_categorical = isinstance(obj.dtype, pd.CategoricalDtype) or (
        is_ser and isinstance(obj.index.dtype, pd.CategoricalDtype)
    )
    is_object_string = is_dtype_equal(obj, "string[python]") or (
        is_ser and is_dtype_equal(obj.index.dtype, "string[python]")
    )

    if len(obj) == 0:
        expected = 0
        assert res_deep == res == expected
    elif is_object or is_categorical or is_object_string:
        # only deep will pick them up
        assert res_deep > res
    else:
        assert res == res_deep

    # sys.getsizeof will call the .memory_usage with
    # deep=True, and add on some GC overhead
    diff = res_deep - sys.getsizeof(obj)
    assert abs(diff) < 100
|
||||
|
||||
|
||||
def test_memory_usage_components_series(series_with_simple_index):
    """Total Series memory equals the values' usage plus the index's usage."""
    series = series_with_simple_index
    total_usage = series.memory_usage(index=True)
    non_index_usage = series.memory_usage(index=False)
    index_usage = series.index.memory_usage()
    assert total_usage == non_index_usage + index_usage
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", tm.NARROW_NP_DTYPES)
def test_memory_usage_components_narrow_series(dtype):
    """The values-plus-index memory identity also holds for narrow dtypes."""
    series = Series(range(5), dtype=dtype, index=[f"i-{i}" for i in range(5)], name="a")
    total_usage = series.memory_usage(index=True)
    non_index_usage = series.memory_usage(index=False)
    index_usage = series.index.memory_usage()
    assert total_usage == non_index_usage + index_usage
|
||||
|
||||
|
||||
def test_searchsorted(request, index_or_series_obj):
    """``np.searchsorted`` returns an in-range position, with and without a sorter."""
    # numpy.searchsorted calls obj.searchsorted under the hood.
    # See gh-12238
    obj = index_or_series_obj

    if isinstance(obj, pd.MultiIndex):
        # See gh-14833
        request.applymarker(
            pytest.mark.xfail(
                reason="np.searchsorted doesn't work on pd.MultiIndex: GH 14833"
            )
        )
    elif obj.dtype.kind == "c" and isinstance(obj, Index):
        # TODO: Should Series cases also raise? Looks like they use numpy
        #  comparison semantics https://github.com/numpy/numpy/issues/15981
        mark = pytest.mark.xfail(reason="complex objects are not comparable")
        request.applymarker(mark)

    # ``default=0`` keeps ``max`` from raising on an empty object.
    max_obj = max(obj, default=0)
    index = np.searchsorted(obj, max_obj)
    assert 0 <= index <= len(obj)

    index = np.searchsorted(obj, max_obj, sorter=range(len(obj)))
    assert 0 <= index <= len(obj)
|
||||
|
||||
|
||||
def test_access_by_position(index_flat):
    # Positional access on an Index must agree with ``.iloc`` on a
    # Series built from it, and both must raise IndexError one past
    # the end.
    index = index_flat

    if len(index) == 0:
        pytest.skip("Test doesn't make sense on empty data")

    series = Series(index)
    for position in (0, 5, -1):
        assert index[position] == series.iloc[position]

    size = len(index)
    assert index[-1] == index[size - 1]

    # The pyarrow-backed string dtypes are matched against a shorter
    # out-of-bounds message than the default one.
    pyarrow_string_aliases = ("string[pyarrow]", "string[pyarrow_numpy]")
    if any(is_dtype_equal(index.dtype, alias) for alias in pyarrow_string_aliases):
        index_msg = "index out of bounds"
    else:
        index_msg = f"index {size} is out of bounds for axis 0 with size {size}"
    with pytest.raises(IndexError, match=index_msg):
        index[size]

    iloc_msg = "single positional indexer is out-of-bounds"
    with pytest.raises(IndexError, match=iloc_msg):
        series.iloc[size]
|
@ -0,0 +1,56 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
CategoricalDtype,
|
||||
DataFrame,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_transpose(index_or_series_obj):
    # Transposing the fixture object must compare equal to the object itself.
    original = index_or_series_obj
    transposed = original.transpose()
    tm.assert_equal(transposed, original)
|
||||
|
||||
|
||||
def test_transpose_non_default_axes(index_or_series_obj):
    # Supplying ``axes`` must raise ValueError whether it is passed
    # positionally or by keyword.
    obj = index_or_series_obj
    unsupported = "the 'axes' parameter is not supported"

    # positional form
    with pytest.raises(ValueError, match=unsupported):
        obj.transpose(1)

    # keyword form
    with pytest.raises(ValueError, match=unsupported):
        obj.transpose(axes=1)
|
||||
|
||||
|
||||
def test_numpy_transpose(index_or_series_obj):
    # ``np.transpose`` on the object must compare equal to the object,
    # and passing ``axes`` through numpy must raise ValueError.
    obj = index_or_series_obj
    via_numpy = np.transpose(obj)
    tm.assert_equal(via_numpy, obj)

    unsupported = "the 'axes' parameter is not supported"
    with pytest.raises(ValueError, match=unsupported):
        np.transpose(obj, axes=1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data, transposed_data, index, columns, dtype",
    [
        ([[1], [2]], [[1, 2]], ["a", "a"], ["b"], int),
        ([[1], [2]], [[1, 2]], ["a", "a"], ["b"], CategoricalDtype([1, 2])),
        ([[1, 2]], [[1], [2]], ["b"], ["a", "a"], int),
        ([[1, 2]], [[1], [2]], ["b"], ["a", "a"], CategoricalDtype([1, 2])),
        ([[1, 2], [3, 4]], [[1, 3], [2, 4]], ["a", "a"], ["b", "b"], int),
        (
            [[1, 2], [3, 4]],
            [[1, 3], [2, 4]],
            ["a", "a"],
            ["b", "b"],
            CategoricalDtype([1, 2, 3, 4]),
        ),
    ],
)
def test_duplicate_labels(data, transposed_data, index, columns, dtype):
    # GH 42380
    # ``.T`` on a frame with repeated row and/or column labels must
    # swap the two axes' labels and keep the requested dtype.
    frame = DataFrame(data, index=index, columns=columns, dtype=dtype)
    flipped = DataFrame(transposed_data, index=columns, columns=index, dtype=dtype)
    tm.assert_frame_equal(frame.T, flipped)
|
@ -0,0 +1,124 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._config import using_pyarrow_string_dtype
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.tests.base.common import allow_na_ops
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_unique(index_or_series_obj):
    # Repeat the i-th element (i + 1) times, then check that ``unique``
    # returns the distinct values in first-seen order.
    repeated = np.repeat(index_or_series_obj, range(1, len(index_or_series_obj) + 1))
    result = repeated.unique()

    # dict.fromkeys preserves the order
    first_seen = list(dict.fromkeys(repeated.values))

    if isinstance(repeated, pd.MultiIndex):
        expected = pd.MultiIndex.from_tuples(first_seen)
        expected.names = repeated.names
        tm.assert_index_equal(result, expected, exact=True)
        return

    if isinstance(repeated, pd.Index):
        expected = pd.Index(first_seen, dtype=repeated.dtype)
        if isinstance(repeated.dtype, pd.DatetimeTZDtype):
            expected = expected.normalize()
        tm.assert_index_equal(result, expected, exact=True)
        return

    # Anything else is compared as a plain numpy array.
    expected = np.array(first_seen)
    tm.assert_numpy_array_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_unique_null(null_obj, index_or_series_obj):
    # ``unique`` on data whose first two entries were overwritten with
    # a null must report that null once, ahead of the remaining
    # distinct values.
    obj = index_or_series_obj

    # pytest.skip raises, so each guard below is only reached when the
    # previous ones did not trigger.
    if not allow_na_ops(obj):
        pytest.skip("type doesn't allow for NA operations")
    if len(obj) < 1:
        pytest.skip("Test doesn't make sense on empty data")
    if isinstance(obj, pd.MultiIndex):
        pytest.skip(f"MultiIndex can't hold '{null_obj}'")

    raw = obj._values
    raw[0:2] = null_obj

    # Rebuild the same container type with element i repeated (i + 1) times.
    container = type(obj)
    obj = container(np.repeat(raw, range(1, len(raw) + 1)), dtype=obj.dtype)
    result = obj.unique()

    # because np.nan == np.nan is False, but None == None is True
    # np.nan would be duplicated, whereas None wouldn't
    # -> drop every null from the first-seen values and put a single
    #    ``null_obj`` in front instead.
    non_null = [val for val in dict.fromkeys(obj.values) if not pd.isnull(val)]
    unique_values = [null_obj, *non_null]

    if not isinstance(obj, pd.Index):
        expected = np.array(unique_values, dtype=obj.dtype)
        tm.assert_numpy_array_equal(result, expected)
        return

    expected = pd.Index(unique_values, dtype=obj.dtype)
    if isinstance(obj.dtype, pd.DatetimeTZDtype):
        result = result.normalize()
        expected = expected.normalize()
    tm.assert_index_equal(result, expected, exact=True)
|
||||
|
||||
|
||||
def test_nunique(index_or_series_obj):
    # With ``dropna=False``, ``nunique`` must equal the length of ``unique()``.
    repeated = np.repeat(index_or_series_obj, range(1, len(index_or_series_obj) + 1))
    n_distinct = len(repeated.unique())
    assert repeated.nunique(dropna=False) == n_distinct
|
||||
|
||||
|
||||
@pytest.mark.parametrize("null_obj", [np.nan, None])
def test_nunique_null(null_obj, index_or_series_obj):
    # ``nunique`` must count the injected null only when ``dropna=False``.
    obj = index_or_series_obj

    # pytest.skip raises, so the second guard is only reached when the
    # first did not trigger.
    if not allow_na_ops(obj):
        pytest.skip("type doesn't allow for NA operations")
    if isinstance(obj, pd.MultiIndex):
        pytest.skip(f"MultiIndex can't hold '{null_obj}'")

    raw = obj._values
    raw[0:2] = null_obj

    # Rebuild the same container type with element i repeated (i + 1) times.
    container = type(obj)
    obj = container(np.repeat(raw, range(1, len(raw) + 1)), dtype=obj.dtype)

    if isinstance(obj, pd.CategoricalIndex):
        # Expectation here is derived from the category list.
        n_categories = len(obj.categories)
        expected_dropna = n_categories
        expected_keepna = n_categories + 1
    else:
        n_unique = len(obj.unique())
        expected_dropna = max(0, n_unique - 1)
        expected_keepna = max(0, n_unique)

    assert obj.nunique() == expected_dropna
    assert obj.nunique(dropna=False) == expected_keepna
|
||||
|
||||
|
||||
@pytest.mark.single_cpu
@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="decoding fails")
def test_unique_bad_unicode(index_or_series):
    # regression test for #34550
    # U+D83D is a lone surrogate code point, not a complete character.
    uval = "\ud83d"

    obj = index_or_series([uval, uval])
    result = obj.unique()

    if isinstance(obj, pd.Index):
        tm.assert_index_equal(result, pd.Index([uval], dtype=object), exact=True)
    else:
        tm.assert_numpy_array_equal(result, np.array([uval], dtype=object))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dropna", [True, False])
def test_nunique_dropna(dropna):
    # GH37566
    # "yes" appears twice; the other four entries are four different
    # missing-value sentinels (pd.NA, np.nan, None, pd.NaT).
    ser = pd.Series(["yes", "yes", pd.NA, np.nan, None, pd.NaT])
    res = ser.nunique(dropna)

    # The expected count is computed first and compared explicitly.
    # A conditional expression binds looser than ``==``, so the old
    # ``assert res == 1 if dropna else 5`` was parsed as
    # ``assert ((res == 1) if dropna else 5)``: for dropna=False it
    # asserted the constant 5 and never checked ``res``.
    expected = 1 if dropna else 5
    assert res == expected
|
@ -0,0 +1,356 @@
|
||||
import collections
|
||||
from datetime import timedelta
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DatetimeIndex,
|
||||
Index,
|
||||
Interval,
|
||||
IntervalIndex,
|
||||
MultiIndex,
|
||||
Series,
|
||||
Timedelta,
|
||||
TimedeltaIndex,
|
||||
array,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.tests.base.common import allow_na_ops
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_value_counts(index_or_series_obj):
    # Compare ``value_counts`` with a tally built by collections.Counter
    # over data where element i is repeated (i + 1) times.
    obj = np.repeat(index_or_series_obj, range(1, len(index_or_series_obj) + 1))
    result = obj.value_counts()

    tally = collections.Counter(obj)
    expected = Series(dict(tally.most_common()), dtype=np.int64, name="count")

    if obj.dtype != np.float16:
        expected.index = expected.index.astype(obj.dtype)
    else:
        # Casting the expected index to float16 must raise; there is
        # nothing further to compare in that case.
        with pytest.raises(NotImplementedError, match="float16 indexes are not "):
            expected.index.astype(obj.dtype)
        return

    if isinstance(expected.index, MultiIndex):
        expected.index.names = obj.names
    else:
        expected.index.name = obj.name

    if not isinstance(result.dtype, np.dtype):
        # Result counts came back with a non-numpy dtype; cast the
        # expectation to match ("Int64" i.e IntegerDtype).
        uses_pyarrow = getattr(obj.dtype, "storage", "") == "pyarrow"
        expected = expected.astype("int64[pyarrow]" if uses_pyarrow else "Int64")

    # TODO(GH#32514): Order of entries with the same count is inconsistent
    #  on CI (gh-32449)
    if obj.duplicated().any():
        result = result.sort_index()
        expected = expected.sort_index()
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("null_obj", [np.nan, None])
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_value_counts_null(null_obj, index_or_series_obj):
    # ``value_counts`` on data whose first two entries were overwritten
    # with a null: nulls are excluded by default and counted under
    # ``dropna=False``.
    orig = index_or_series_obj
    obj = orig.copy()

    # pytest.skip raises, so each guard below is only reached when the
    # previous ones did not trigger.
    if not allow_na_ops(obj):
        pytest.skip("type doesn't allow for NA operations")
    if len(obj) < 1:
        pytest.skip("Test doesn't make sense on empty data")
    if isinstance(orig, MultiIndex):
        pytest.skip(f"MultiIndex can't hold '{null_obj}'")

    raw = obj._values
    raw[0:2] = null_obj

    # Rebuild the same container type with element i repeated (i + 1) times.
    container = type(obj)
    obj = container(np.repeat(raw, range(1, len(raw) + 1)), dtype=obj.dtype)

    # because np.nan == np.nan is False, but None == None is True
    # np.nan would be duplicated, whereas None wouldn't
    tally = collections.Counter(obj.dropna())
    expected = Series(dict(tally.most_common()), dtype=np.int64, name="count")

    if obj.dtype != np.float16:
        expected.index = expected.index.astype(obj.dtype)
    else:
        # Casting the expected index to float16 must raise; there is
        # nothing further to compare in that case.
        with pytest.raises(NotImplementedError, match="float16 indexes are not "):
            expected.index.astype(obj.dtype)
        return
    expected.index.name = obj.name

    # ``obj`` is not modified below, so the duplicate check is done once.
    has_duplicates = obj.duplicated().any()

    result = obj.value_counts()
    if has_duplicates:
        # TODO(GH#32514):
        # Order of entries with the same count is inconsistent on CI (gh-32449)
        expected = expected.sort_index()
        result = result.sort_index()

    if not isinstance(result.dtype, np.dtype):
        # Result counts came back with a non-numpy dtype; cast the
        # expectation to match ("Int64" i.e IntegerDtype).
        uses_pyarrow = getattr(obj.dtype, "storage", "") == "pyarrow"
        expected = expected.astype("int64[pyarrow]" if uses_pyarrow else "Int64")
    tm.assert_series_equal(result, expected)

    # Positions 0 and 1 were nulled and are repeated 1 and 2 times,
    # hence three null entries in total.
    expected[null_obj] = 3

    result = obj.value_counts(dropna=False)
    if has_duplicates:
        # TODO(GH#32514):
        # Order of entries with the same count is inconsistent on CI (gh-32449)
        expected = expected.sort_index()
        result = result.sort_index()
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_value_counts_inferred(index_or_series, using_infer_string):
    # value_counts / unique / nunique on string data: "b" x4, "a" x3,
    # "d" x2, "c" x1.
    klass = index_or_series
    letters = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
    s = klass(letters)

    by_count = Series([4, 3, 2, 1], index=["b", "a", "d", "c"], name="count")
    tm.assert_series_equal(s.value_counts(), by_count)

    sorted_uniques = np.unique(np.array(letters, dtype=np.object_))
    if isinstance(s, Index):
        tm.assert_index_equal(s.unique(), Index(sorted_uniques))
    else:
        exp = array(sorted_uniques) if using_infer_string else sorted_uniques
        tm.assert_equal(s.unique(), exp)

    assert s.nunique() == 4

    # don't sort, have to sort after the fact as not sorting is
    # platform-dep
    unsorted_then_sorted = s.value_counts(sort=False).sort_values()
    expected = Series(
        [3, 1, 4, 2], index=["a", "c", "b", "d"], name="count"
    ).sort_values()
    tm.assert_series_equal(unsorted_then_sorted, expected)

    # sort ascending
    ascending = s.value_counts(ascending=True)
    expected = Series([1, 2, 3, 4], index=["c", "d", "a", "b"], name="count")
    tm.assert_series_equal(ascending, expected)

    # relative histogram.
    proportions = s.value_counts(normalize=True)
    expected = Series(
        [0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"], name="proportion"
    )
    tm.assert_series_equal(proportions, expected)
|
||||
|
||||
|
||||
def test_value_counts_bins(index_or_series, using_infer_string):
    # ``bins`` handling in value_counts, followed by NA handling and
    # the empty-object case.
    klass = index_or_series
    letters = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
    s = klass(letters)

    # bins
    with pytest.raises(TypeError, match="bins argument only works with numeric data"):
        s.value_counts(bins=1)

    s1 = Series([1, 1, 2, 3])

    # A single bin holds all four observations.
    whole_range = Interval(0.997, 3.0)
    tm.assert_series_equal(
        s1.value_counts(bins=1), Series({whole_range: 4}, name="count")
    )
    tm.assert_series_equal(
        s1.value_counts(bins=1, normalize=True),
        Series({whole_range: 1.0}, name="proportion"),
    )

    if isinstance(s1, Index):
        tm.assert_index_equal(s1.unique(), Index([1, 2, 3]))
    else:
        tm.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3], dtype=np.int64))

    assert s1.nunique() == 3

    # these return the same
    for dropna in (True, False):
        res4 = s1.value_counts(bins=4, dropna=dropna)
        intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
        exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]), name="count")
        tm.assert_series_equal(res4, exp4)

    res4n = s1.value_counts(bins=4, normalize=True)
    exp4n = Series(
        [0.5, 0.25, 0.25, 0], index=intervals.take([0, 1, 3, 2]), name="proportion"
    )
    tm.assert_series_equal(res4n, exp4n)

    # handle NA's properly
    letters = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"]
    s = klass(letters)
    by_count = Series([4, 3, 2], index=["b", "a", "d"], name="count")
    tm.assert_series_equal(s.value_counts(), by_count)

    if isinstance(s, Index):
        tm.assert_index_equal(s.unique(), Index(["a", "b", np.nan, "d"]))
    else:
        exp = np.array(["a", "b", np.nan, "d"], dtype=object)
        if using_infer_string:
            exp = array(exp)
        tm.assert_equal(s.unique(), exp)
    assert s.nunique() == 3

    # Empty input.
    if klass is dict:
        s = klass({})
    else:
        s = klass({}, dtype=object)
    no_counts = Series([], dtype=np.int64, name="count")
    tm.assert_series_equal(s.value_counts(), no_counts, check_index_type=False)

    # returned dtype differs depending on original
    if isinstance(s, Index):
        tm.assert_index_equal(s.unique(), Index([]), exact=False)
    else:
        tm.assert_numpy_array_equal(s.unique(), np.array([]), check_dtype=False)

    assert s.nunique() == 0
|
||||
|
||||
|
||||
def test_value_counts_datetime64(index_or_series, unit):
    # value_counts / unique / nunique on datetime64 data in the given
    # resolution ``unit``, first without and then with NaT entries.
    klass = index_or_series
    dt64_dtype = f"datetime64[{unit}]"

    # GH 3002, datetime64[ns]
    # don't test names though
    stamps = pd.to_datetime(
        [
            "2010-01-01",
            "2010-01-01",
            "2010-01-01",
            "2009-01-01",
            "2008-09-09",
            "2008-09-09",
        ]
    ).as_unit(unit)
    df = pd.DataFrame(
        {
            "person_id": ["xxyyzz", "xxyyzz", "xxyyzz", "xxyyww", "foofoo", "foofoo"],
            "dt": stamps,
            "food": ["PIE", "GUM", "EGG", "EGG", "PIE", "GUM"],
        }
    )

    s = klass(df["dt"].copy())
    s.name = None

    counted_index = pd.to_datetime(
        ["2010-01-01 00:00:00", "2008-09-09 00:00:00", "2009-01-01 00:00:00"]
    ).as_unit(unit)
    expected_s = Series([3, 2, 1], index=counted_index, name="count")
    tm.assert_series_equal(s.value_counts(), expected_s)

    # Distinct values in order of first appearance.
    expected = array(
        np.array(
            ["2010-01-01 00:00:00", "2009-01-01 00:00:00", "2008-09-09 00:00:00"],
            dtype=dt64_dtype,
        )
    )
    result = s.unique()
    if isinstance(s, Index):
        tm.assert_index_equal(result, DatetimeIndex(expected))
    else:
        tm.assert_extension_array_equal(result, expected)

    assert s.nunique() == 3

    # with NaT
    base = df["dt"].copy()
    s = klass(list(base.values) + [pd.NaT] * 4)
    # Series goes through the ``.dt`` accessor; the other class calls
    # ``as_unit`` directly.
    s = s.dt.as_unit(unit) if klass is Series else s.as_unit(unit)

    result = s.value_counts()
    assert result.index.dtype == dt64_dtype
    tm.assert_series_equal(result, expected_s)

    # With ``dropna=False`` the four NaT entries form the first row.
    result = s.value_counts(dropna=False)
    nat_row = Series([4], index=DatetimeIndex([pd.NaT]).as_unit(unit), name="count")
    expected_s = pd.concat([nat_row, expected_s])
    tm.assert_series_equal(result, expected_s)

    assert s.dtype == dt64_dtype
    unique = s.unique()
    assert unique.dtype == dt64_dtype

    # numpy_array_equal cannot compare pd.NaT
    if isinstance(s, Index):
        exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT]).as_unit(unit)
        tm.assert_index_equal(unique, exp_idx)
    else:
        tm.assert_extension_array_equal(unique[:3], expected)
        assert pd.isna(unique[3])

    assert s.nunique() == 3
    assert s.nunique(dropna=False) == 4
|
||||
|
||||
|
||||
def test_value_counts_timedelta64(index_or_series, unit):
    # timedelta64[ns]
    # Six identical one-day timedeltas must count as a single value,
    # whichever operand order was used to build the array.
    klass = index_or_series
    td64_dtype = f"m8[{unit}]"

    one_day = Timedelta(timedelta(days=1)).as_unit(unit)
    single_day_index = TimedeltaIndex([one_day], name="dt").as_unit(unit)
    expected_counts = Series([6], index=single_day_index, name="count")

    # zeros + day
    td = klass(np.zeros(6, dtype=td64_dtype) + one_day, name="dt")
    tm.assert_series_equal(td.value_counts(), expected_counts)

    uniques = td.unique()
    if isinstance(td, Index):
        tm.assert_index_equal(uniques, single_day_index)
    else:
        tm.assert_extension_array_equal(uniques, single_day_index._values)

    # day + zeros
    td2 = klass(one_day + np.zeros(6, dtype=td64_dtype), name="dt")
    tm.assert_series_equal(td2.value_counts(), expected_counts)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts_with_nan(dropna, index_or_series):
    # GH31944
    # One real value plus two different missing-value markers.
    klass = index_or_series
    obj = klass([True, pd.NA, np.nan])
    res = obj.value_counts(dropna=dropna)

    if dropna is True:
        # Only ``True`` is counted.
        kept = Index([True], dtype=obj.dtype)
        expected = Series([1], index=kept, name="count")
    else:
        # Each of the three entries gets its own row.
        expected = Series([1, 1, 1], index=[True, pd.NA, np.nan], name="count")
    tm.assert_series_equal(res, expected)
|
||||
|
||||
|
||||
def test_value_counts_object_inference_deprecated():
    # GH#56161
    # value_counts on the object-dtype copy must emit a FutureWarning
    # and still match the result from the tz-aware original.
    tz_aware = pd.date_range("2016-01-01", periods=3, tz="UTC")
    as_object = tz_aware.astype(object)
    reference = tz_aware.value_counts()

    deprecation = "The behavior of value_counts with object-dtype is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=deprecation):
        from_object = as_object.value_counts()

    tm.assert_series_equal(from_object, reference)
|
Reference in New Issue
Block a user