2024-12-04 13:35:57 +05:00
parent d346bf4b2a
commit 73ce681a55
7059 changed files with 1196501 additions and 0 deletions


@@ -0,0 +1,25 @@
def get_groupby_method_args(name, obj):
"""
Get required arguments for a groupby method.
When parametrizing a test over groupby methods (e.g. "sum", "mean", "fillna"),
it is often the case that arguments are required for certain methods.
Parameters
----------
name: str
Name of the method.
obj: Series or DataFrame
pandas object that is being grouped.
Returns
-------
A tuple of required arguments for the method.
"""
if name in ("nth", "fillna", "take"):
return (0,)
if name == "quantile":
return (0.5,)
if name == "corrwith":
return (obj,)
return ()
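# Usage sketch (assumed, not part of the original commit): how this helper is
# typically consumed when a test is parametrized over groupby method names.
def _example_usage():
    import pandas as pd

    df = pd.DataFrame({"A": [1, 1, 2], "B": [3.0, 4.0, 5.0]})
    gb = df.groupby("A")["B"]
    # fetch the required positional args first, then dispatch by name
    args = get_groupby_method_args("quantile", df)  # -> (0.5,)
    return getattr(gb, "quantile")(*args)  # 0.5 quantile of "B" per "A" group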


@@ -0,0 +1,435 @@
"""
test cython .agg behavior
"""
import numpy as np
import pytest
from pandas.core.dtypes.common import (
is_float_dtype,
is_integer_dtype,
)
import pandas as pd
from pandas import (
DataFrame,
Index,
NaT,
Series,
Timedelta,
Timestamp,
bdate_range,
)
import pandas._testing as tm
import pandas.core.common as com
@pytest.mark.parametrize(
"op_name",
[
"count",
"sum",
"std",
"var",
"sem",
"mean",
pytest.param(
"median",
# ignore mean of empty slice
# and all-NaN
marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")],
),
"prod",
"min",
"max",
],
)
def test_cythonized_aggers(op_name):
data = {
"A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan],
"B": ["A", "B"] * 6,
"C": np.random.default_rng(2).standard_normal(12),
}
df = DataFrame(data)
df.loc[2:10:2, "C"] = np.nan
op = lambda x: getattr(x, op_name)()
# single column
grouped = df.drop(["B"], axis=1).groupby("A")
exp = {cat: op(group["C"]) for cat, group in grouped}
exp = DataFrame({"C": exp})
exp.index.name = "A"
result = op(grouped)
tm.assert_frame_equal(result, exp)
# multiple columns
grouped = df.groupby(["A", "B"])
expd = {}
for (cat1, cat2), group in grouped:
expd.setdefault(cat1, {})[cat2] = op(group["C"])
exp = DataFrame(expd).T.stack(future_stack=True)
exp.index.names = ["A", "B"]
exp.name = "C"
result = op(grouped)["C"]
if op_name in ["sum", "prod"]:
tm.assert_series_equal(result, exp)
def test_cython_agg_boolean():
frame = DataFrame(
{
"a": np.random.default_rng(2).integers(0, 5, 50),
"b": np.random.default_rng(2).integers(0, 2, 50).astype("bool"),
}
)
result = frame.groupby("a")["b"].mean()
msg = "using SeriesGroupBy.mean"
with tm.assert_produces_warning(FutureWarning, match=msg):
# GH#53425
expected = frame.groupby("a")["b"].agg(np.mean)
tm.assert_series_equal(result, expected)
def test_cython_agg_nothing_to_agg():
frame = DataFrame(
{"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25}
)
msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes"
with pytest.raises(TypeError, match=msg):
frame.groupby("a")["b"].mean(numeric_only=True)
frame = DataFrame(
{"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25}
)
result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)
expected = DataFrame(
[], index=frame["a"].sort_values().drop_duplicates(), columns=[]
)
tm.assert_frame_equal(result, expected)
def test_cython_agg_nothing_to_agg_with_dates():
frame = DataFrame(
{
"a": np.random.default_rng(2).integers(0, 5, 50),
"b": ["foo", "bar"] * 25,
"dates": pd.date_range("now", periods=50, freq="min"),
}
)
msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes"
with pytest.raises(TypeError, match=msg):
frame.groupby("b").dates.mean(numeric_only=True)
def test_cython_agg_frame_columns():
# #2113
df = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]})
msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
df.groupby(level=0, axis="columns").mean()
with tm.assert_produces_warning(FutureWarning, match=msg):
df.groupby(level=0, axis="columns").mean()
with tm.assert_produces_warning(FutureWarning, match=msg):
df.groupby(level=0, axis="columns").mean()
with tm.assert_produces_warning(FutureWarning, match=msg):
df.groupby(level=0, axis="columns").mean()
def test_cython_agg_return_dict():
# GH 16741
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "three", "two", "two", "one", "three"],
"C": np.random.default_rng(2).standard_normal(8),
"D": np.random.default_rng(2).standard_normal(8),
}
)
ts = df.groupby("A")["B"].agg(lambda x: x.value_counts().to_dict())
expected = Series(
[{"two": 1, "one": 1, "three": 1}, {"two": 2, "one": 2, "three": 1}],
index=Index(["bar", "foo"], name="A"),
name="B",
)
tm.assert_series_equal(ts, expected)
def test_cython_fail_agg():
dr = bdate_range("1/1/2000", periods=50)
ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr)
grouped = ts.groupby(lambda x: x.month)
summed = grouped.sum()
msg = "using SeriesGroupBy.sum"
with tm.assert_produces_warning(FutureWarning, match=msg):
# GH#53425
expected = grouped.agg(np.sum)
tm.assert_series_equal(summed, expected)
@pytest.mark.parametrize(
"op, targop",
[
("mean", np.mean),
("median", np.median),
("var", np.var),
("sum", np.sum),
("prod", np.prod),
("min", np.min),
("max", np.max),
("first", lambda x: x.iloc[0]),
("last", lambda x: x.iloc[-1]),
],
)
def test__cython_agg_general(op, targop):
df = DataFrame(np.random.default_rng(2).standard_normal(1000))
labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float)
result = df.groupby(labels)._cython_agg_general(op, alt=None, numeric_only=True)
warn = FutureWarning if targop in com._cython_table else None
msg = f"using DataFrameGroupBy.{op}"
with tm.assert_produces_warning(warn, match=msg):
# GH#53425
expected = df.groupby(labels).agg(targop)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"op, targop",
[
("mean", np.mean),
("median", lambda x: np.median(x) if len(x) > 0 else np.nan),
("var", lambda x: np.var(x, ddof=1)),
("min", np.min),
("max", np.max),
],
)
def test_cython_agg_empty_buckets(op, targop, observed):
df = DataFrame([11, 12, 13])
grps = range(0, 55, 5)
# Call _cython_agg_general directly instead of via the user API,
# which sets different values for min_count.
g = df.groupby(pd.cut(df[0], grps), observed=observed)
result = g._cython_agg_general(op, alt=None, numeric_only=True)
g = df.groupby(pd.cut(df[0], grps), observed=observed)
expected = g.agg(lambda x: targop(x))
tm.assert_frame_equal(result, expected)
def test_cython_agg_empty_buckets_nanops(observed):
# GH-18869 can't call nanops on empty groups, so hardcode expected
# for these
df = DataFrame([11, 12, 13], columns=["a"])
grps = np.arange(0, 25, 5, dtype=int)
# add / sum
result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
"sum", alt=None, numeric_only=True
)
intervals = pd.interval_range(0, 20, freq=5)
expected = DataFrame(
{"a": [0, 0, 36, 0]},
index=pd.CategoricalIndex(intervals, name="a", ordered=True),
)
if observed:
expected = expected[expected.a != 0]
tm.assert_frame_equal(result, expected)
# prod
result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
"prod", alt=None, numeric_only=True
)
expected = DataFrame(
{"a": [1, 1, 1716, 1]},
index=pd.CategoricalIndex(intervals, name="a", ordered=True),
)
if observed:
expected = expected[expected.a != 1]
tm.assert_frame_equal(result, expected)
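# Sketch (assumed, not part of the original commit): empty groups fall back to
# the reduction's identity element -- 0 for sum, 1 for prod -- which is what
# the hardcoded expectations above encode.
def _empty_group_identity_sketch():
    df = DataFrame({"a": [11]})
    g = df.groupby(pd.cut(df["a"], [0, 5, 10, 15]), observed=False)
    return g.sum(), g.prod()  # the two empty bins yield 0 and 1 respectively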
@pytest.mark.parametrize("op", ["first", "last", "max", "min"])
@pytest.mark.parametrize(
"data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")]
)
def test_cython_with_timestamp_and_nat(op, data):
# https://github.com/pandas-dev/pandas/issues/19526
df = DataFrame({"a": [0, 1], "b": [data, NaT]})
index = Index([0, 1], name="a")
# We will group by a and test the cython aggregations
expected = DataFrame({"b": [data, NaT]}, index=index)
result = df.groupby("a").aggregate(op)
tm.assert_frame_equal(expected, result)
@pytest.mark.parametrize(
"agg",
[
"min",
"max",
"count",
"sum",
"prod",
"var",
"mean",
"median",
"ohlc",
"cumprod",
"cumsum",
"shift",
"any",
"all",
"quantile",
"first",
"last",
"rank",
"cummin",
"cummax",
],
)
def test_read_only_buffer_source_agg(agg):
# https://github.com/pandas-dev/pandas/issues/36014
df = DataFrame(
{
"sepal_length": [5.1, 4.9, 4.7, 4.6, 5.0],
"species": ["setosa", "setosa", "setosa", "setosa", "setosa"],
}
)
df._mgr.arrays[0].flags.writeable = False
result = df.groupby(["species"]).agg({"sepal_length": agg})
expected = df.copy().groupby(["species"]).agg({"sepal_length": agg})
tm.assert_equal(result, expected)
@pytest.mark.parametrize(
"op_name",
[
"count",
"sum",
"std",
"var",
"sem",
"mean",
"median",
"prod",
"min",
"max",
],
)
def test_cython_agg_nullable_int(op_name):
# ensure that the cython-based aggregations don't fail for nullable dtype
# (eg https://github.com/pandas-dev/pandas/issues/37415)
df = DataFrame(
{
"A": ["A", "B"] * 5,
"B": pd.array([1, 2, 3, 4, 5, 6, 7, 8, 9, pd.NA], dtype="Int64"),
}
)
result = getattr(df.groupby("A")["B"], op_name)()
df2 = df.assign(B=df["B"].astype("float64"))
expected = getattr(df2.groupby("A")["B"], op_name)()
if op_name in ("mean", "median"):
convert_integer = False
else:
convert_integer = True
expected = expected.convert_dtypes(convert_integer=convert_integer)
tm.assert_series_equal(result, expected)
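# Sketch (assumed, not part of the original commit) of the rule the expectation
# above encodes: on nullable dtypes, mean/median come back as Float64 while
# reductions such as sum keep Int64.
def _nullable_dtype_rule_sketch():
    ser = Series([1, 2, pd.NA], dtype="Int64")
    gb = ser.groupby([0, 0, 1])
    assert str(gb.sum().dtype) == "Int64"
    assert str(gb.mean().dtype) == "Float64"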
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
def test_count_masked_returns_masked_dtype(dtype):
df = DataFrame(
{
"A": [1, 1],
"B": pd.array([1, pd.NA], dtype=dtype),
"C": pd.array([1, 1], dtype=dtype),
}
)
result = df.groupby("A").count()
expected = DataFrame(
[[1, 2]], index=Index([1], name="A"), columns=["B", "C"], dtype="Int64"
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("with_na", [True, False])
@pytest.mark.parametrize(
"op_name, action",
[
# ("count", "always_int"),
("sum", "large_int"),
# ("std", "always_float"),
("var", "always_float"),
# ("sem", "always_float"),
("mean", "always_float"),
("median", "always_float"),
("prod", "large_int"),
("min", "preserve"),
("max", "preserve"),
("first", "preserve"),
("last", "preserve"),
],
)
@pytest.mark.parametrize(
"data",
[
pd.array([1, 2, 3, 4], dtype="Int64"),
pd.array([1, 2, 3, 4], dtype="Int8"),
pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float32"),
pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64"),
pd.array([True, True, False, False], dtype="boolean"),
],
)
def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na):
if with_na:
data[3] = pd.NA
df = DataFrame({"key": ["a", "a", "b", "b"], "col": data})
grouped = df.groupby("key")
if action == "always_int":
# always Int64
expected_dtype = pd.Int64Dtype()
elif action == "large_int":
# for any int/bool use Int64, for float preserve dtype
if is_float_dtype(data.dtype):
expected_dtype = data.dtype
elif is_integer_dtype(data.dtype):
# match the numpy dtype we'd get with the non-nullable analogue
expected_dtype = data.dtype
else:
expected_dtype = pd.Int64Dtype()
elif action == "always_float":
# for any int/bool use Float64, for float preserve dtype
if is_float_dtype(data.dtype):
expected_dtype = data.dtype
else:
expected_dtype = pd.Float64Dtype()
elif action == "preserve":
expected_dtype = data.dtype
result = getattr(grouped, op_name)()
assert result["col"].dtype == expected_dtype
result = grouped.aggregate(op_name)
assert result["col"].dtype == expected_dtype
result = getattr(grouped["col"], op_name)()
assert result.dtype == expected_dtype
result = grouped["col"].aggregate(op_name)
assert result.dtype == expected_dtype


@@ -0,0 +1,392 @@
import numpy as np
import pytest
from pandas.errors import NumbaUtilError
from pandas import (
DataFrame,
Index,
NamedAgg,
Series,
option_context,
)
import pandas._testing as tm
pytestmark = pytest.mark.single_cpu
def test_correct_function_signature():
pytest.importorskip("numba")
def incorrect_function(x):
return sum(x) * 2.7
data = DataFrame(
{"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
columns=["key", "data"],
)
with pytest.raises(NumbaUtilError, match="The first 2"):
data.groupby("key").agg(incorrect_function, engine="numba")
with pytest.raises(NumbaUtilError, match="The first 2"):
data.groupby("key")["data"].agg(incorrect_function, engine="numba")
def test_check_nopython_kwargs():
pytest.importorskip("numba")
def incorrect_function(values, index):
return sum(values) * 2.7
data = DataFrame(
{"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
columns=["key", "data"],
)
with pytest.raises(NumbaUtilError, match="numba does not support"):
data.groupby("key").agg(incorrect_function, engine="numba", a=1)
with pytest.raises(NumbaUtilError, match="numba does not support"):
data.groupby("key")["data"].agg(incorrect_function, engine="numba", a=1)
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
@pytest.mark.parametrize("as_index", [True, False])
def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index):
pytest.importorskip("numba")
def func_numba(values, index):
return np.mean(values) * 2.7
if jit:
# Test accepted jitted functions
import numba
func_numba = numba.jit(func_numba)
data = DataFrame(
{0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
)
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
grouped = data.groupby(0, as_index=as_index)
if pandas_obj == "Series":
grouped = grouped[1]
result = grouped.agg(func_numba, engine="numba", engine_kwargs=engine_kwargs)
expected = grouped.agg(lambda x: np.mean(x) * 2.7, engine="cython")
tm.assert_equal(result, expected)
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
def test_cache(jit, pandas_obj, nogil, parallel, nopython):
# Test that the functions are cached correctly if we switch functions
pytest.importorskip("numba")
def func_1(values, index):
return np.mean(values) - 3.4
def func_2(values, index):
return np.mean(values) * 2.7
if jit:
import numba
func_1 = numba.jit(func_1)
func_2 = numba.jit(func_2)
data = DataFrame(
{0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
)
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
grouped = data.groupby(0)
if pandas_obj == "Series":
grouped = grouped[1]
result = grouped.agg(func_1, engine="numba", engine_kwargs=engine_kwargs)
expected = grouped.agg(lambda x: np.mean(x) - 3.4, engine="cython")
tm.assert_equal(result, expected)
# Add func_2 to the cache
result = grouped.agg(func_2, engine="numba", engine_kwargs=engine_kwargs)
expected = grouped.agg(lambda x: np.mean(x) * 2.7, engine="cython")
tm.assert_equal(result, expected)
# Retest func_1 which should use the cache
result = grouped.agg(func_1, engine="numba", engine_kwargs=engine_kwargs)
expected = grouped.agg(lambda x: np.mean(x) - 3.4, engine="cython")
tm.assert_equal(result, expected)
def test_use_global_config():
pytest.importorskip("numba")
def func_1(values, index):
return np.mean(values) - 3.4
data = DataFrame(
{0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
)
grouped = data.groupby(0)
expected = grouped.agg(func_1, engine="numba")
with option_context("compute.use_numba", True):
result = grouped.agg(func_1, engine=None)
tm.assert_frame_equal(expected, result)
@pytest.mark.parametrize(
"agg_kwargs",
[
{"func": ["min", "max"]},
{"func": "min"},
{"func": {1: ["min", "max"], 2: "sum"}},
{"bmin": NamedAgg(column=1, aggfunc="min")},
],
)
def test_multifunc_numba_vs_cython_frame(agg_kwargs):
pytest.importorskip("numba")
data = DataFrame(
{
0: ["a", "a", "b", "b", "a"],
1: [1.0, 2.0, 3.0, 4.0, 5.0],
2: [1, 2, 3, 4, 5],
},
columns=[0, 1, 2],
)
grouped = data.groupby(0)
result = grouped.agg(**agg_kwargs, engine="numba")
expected = grouped.agg(**agg_kwargs, engine="cython")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"agg_kwargs,expected_func",
[
({"func": lambda values, index: values.sum()}, "sum"),
# FIXME
pytest.param(
{
"func": [
lambda values, index: values.sum(),
lambda values, index: values.min(),
]
},
["sum", "min"],
marks=pytest.mark.xfail(
reason="This doesn't work yet! Fails in nopython pipeline!"
),
),
],
)
def test_multifunc_numba_udf_frame(agg_kwargs, expected_func):
pytest.importorskip("numba")
data = DataFrame(
{
0: ["a", "a", "b", "b", "a"],
1: [1.0, 2.0, 3.0, 4.0, 5.0],
2: [1, 2, 3, 4, 5],
},
columns=[0, 1, 2],
)
grouped = data.groupby(0)
result = grouped.agg(**agg_kwargs, engine="numba")
expected = grouped.agg(expected_func, engine="cython")
# check_dtype can be removed if GH 44952 is addressed
# Currently, UDFs still always return float64 while reductions can preserve dtype
tm.assert_frame_equal(result, expected, check_dtype=False)
@pytest.mark.parametrize(
"agg_kwargs",
[{"func": ["min", "max"]}, {"func": "min"}, {"min_val": "min", "max_val": "max"}],
)
def test_multifunc_numba_vs_cython_series(agg_kwargs):
pytest.importorskip("numba")
labels = ["a", "a", "b", "b", "a"]
data = Series([1.0, 2.0, 3.0, 4.0, 5.0])
grouped = data.groupby(labels)
agg_kwargs["engine"] = "numba"
result = grouped.agg(**agg_kwargs)
agg_kwargs["engine"] = "cython"
expected = grouped.agg(**agg_kwargs)
if isinstance(expected, DataFrame):
tm.assert_frame_equal(result, expected)
else:
tm.assert_series_equal(result, expected)
@pytest.mark.single_cpu
@pytest.mark.parametrize(
"data,agg_kwargs",
[
(Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": ["min", "max"]}),
(Series([1.0, 2.0, 3.0, 4.0, 5.0]), {"func": "min"}),
(
DataFrame(
{1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
),
{"func": ["min", "max"]},
),
(
DataFrame(
{1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
),
{"func": "min"},
),
(
DataFrame(
{1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
),
{"func": {1: ["min", "max"], 2: "sum"}},
),
(
DataFrame(
{1: [1.0, 2.0, 3.0, 4.0, 5.0], 2: [1, 2, 3, 4, 5]}, columns=[1, 2]
),
{"min_col": NamedAgg(column=1, aggfunc="min")},
),
],
)
def test_multifunc_numba_kwarg_propagation(data, agg_kwargs):
pytest.importorskip("numba")
labels = ["a", "a", "b", "b", "a"]
grouped = data.groupby(labels)
result = grouped.agg(**agg_kwargs, engine="numba", engine_kwargs={"parallel": True})
expected = grouped.agg(**agg_kwargs, engine="numba")
if isinstance(expected, DataFrame):
tm.assert_frame_equal(result, expected)
else:
tm.assert_series_equal(result, expected)
def test_args_not_cached():
# GH 41647
pytest.importorskip("numba")
def sum_last(values, index, n):
return values[-n:].sum()
df = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]})
grouped_x = df.groupby("id")["x"]
result = grouped_x.agg(sum_last, 1, engine="numba")
expected = Series([1.0] * 2, name="x", index=Index([0, 1], name="id"))
tm.assert_series_equal(result, expected)
result = grouped_x.agg(sum_last, 2, engine="numba")
expected = Series([2.0] * 2, name="x", index=Index([0, 1], name="id"))
tm.assert_series_equal(result, expected)
def test_index_data_correctly_passed():
# GH 43133
pytest.importorskip("numba")
def f(values, index):
return np.mean(index)
df = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=[-1, -2, -3])
result = df.groupby("group").aggregate(f, engine="numba")
expected = DataFrame(
[-1.5, -3.0], columns=["v"], index=Index(["A", "B"], name="group")
)
tm.assert_frame_equal(result, expected)
def test_engine_kwargs_not_cached():
# If the user passes a different set of engine_kwargs, don't return the
# same jitted function
pytest.importorskip("numba")
nogil = True
parallel = False
nopython = True
def func_kwargs(values, index):
return nogil + parallel + nopython
engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
df = DataFrame({"value": [0, 0, 0]})
result = df.groupby(level=0).aggregate(
func_kwargs, engine="numba", engine_kwargs=engine_kwargs
)
expected = DataFrame({"value": [2.0, 2.0, 2.0]})
tm.assert_frame_equal(result, expected)
nogil = False
engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
result = df.groupby(level=0).aggregate(
func_kwargs, engine="numba", engine_kwargs=engine_kwargs
)
expected = DataFrame({"value": [1.0, 1.0, 1.0]})
tm.assert_frame_equal(result, expected)
@pytest.mark.filterwarnings("ignore")
def test_multiindex_one_key(nogil, parallel, nopython):
pytest.importorskip("numba")
def numba_func(values, index):
return 1
df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
result = df.groupby("A").agg(
numba_func, engine="numba", engine_kwargs=engine_kwargs
)
expected = DataFrame([1.0], index=Index([1], name="A"), columns=["C"])
tm.assert_frame_equal(result, expected)
def test_multiindex_multi_key_not_supported(nogil, parallel, nopython):
pytest.importorskip("numba")
def numba_func(values, index):
return 1
df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
with pytest.raises(NotImplementedError, match="more than 1 grouping labels"):
df.groupby(["A", "B"]).agg(
numba_func, engine="numba", engine_kwargs=engine_kwargs
)
def test_multilabel_numba_vs_cython(numba_supported_reductions):
pytest.importorskip("numba")
reduction, kwargs = numba_supported_reductions
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "three", "two", "two", "one", "three"],
"C": np.random.default_rng(2).standard_normal(8),
"D": np.random.default_rng(2).standard_normal(8),
}
)
gb = df.groupby(["A", "B"])
res_agg = gb.agg(reduction, engine="numba", **kwargs)
expected_agg = gb.agg(reduction, engine="cython", **kwargs)
tm.assert_frame_equal(res_agg, expected_agg)
# Test that calling the aggregation directly also works
direct_res = getattr(gb, reduction)(engine="numba", **kwargs)
direct_expected = getattr(gb, reduction)(engine="cython", **kwargs)
tm.assert_frame_equal(direct_res, direct_expected)
def test_multilabel_udf_numba_vs_cython():
pytest.importorskip("numba")
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "three", "two", "two", "one", "three"],
"C": np.random.default_rng(2).standard_normal(8),
"D": np.random.default_rng(2).standard_normal(8),
}
)
gb = df.groupby(["A", "B"])
result = gb.agg(lambda values, index: values.min(), engine="numba")
expected = gb.agg(lambda x: x.min(), engine="cython")
tm.assert_frame_equal(result, expected)


@@ -0,0 +1,675 @@
"""
test all other .agg behavior
"""
import datetime as dt
from functools import partial
import numpy as np
import pytest
from pandas.errors import SpecificationError
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
PeriodIndex,
Series,
date_range,
period_range,
)
import pandas._testing as tm
from pandas.io.formats.printing import pprint_thing
def test_agg_partial_failure_raises():
# GH#43741
df = DataFrame(
{
"data1": np.random.default_rng(2).standard_normal(5),
"data2": np.random.default_rng(2).standard_normal(5),
"key1": ["a", "a", "b", "b", "a"],
"key2": ["one", "two", "one", "two", "one"],
}
)
grouped = df.groupby("key1")
def peak_to_peak(arr):
return arr.max() - arr.min()
with pytest.raises(TypeError, match="unsupported operand type"):
grouped.agg([peak_to_peak])
with pytest.raises(TypeError, match="unsupported operand type"):
grouped.agg(peak_to_peak)
def test_agg_datetimes_mixed():
data = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]]
df1 = DataFrame(
{
"key": [x[0] for x in data],
"date": [x[1] for x in data],
"value": [x[2] for x in data],
}
)
data = [
[
row[0],
(dt.datetime.strptime(row[1], "%Y-%m-%d").date() if row[1] else None),
row[2],
]
for row in data
]
df2 = DataFrame(
{
"key": [x[0] for x in data],
"date": [x[1] for x in data],
"value": [x[2] for x in data],
}
)
df1["weights"] = df1["value"] / df1["value"].sum()
gb1 = df1.groupby("date").aggregate("sum")
df2["weights"] = df1["value"] / df1["value"].sum()
gb2 = df2.groupby("date").aggregate("sum")
assert len(gb1) == len(gb2)
def test_agg_period_index():
prng = period_range("2012-1-1", freq="M", periods=3)
df = DataFrame(np.random.default_rng(2).standard_normal((3, 2)), index=prng)
rs = df.groupby(level=0).sum()
assert isinstance(rs.index, PeriodIndex)
# GH 3579
index = period_range(start="1999-01", periods=5, freq="M")
s1 = Series(np.random.default_rng(2).random(len(index)), index=index)
s2 = Series(np.random.default_rng(2).random(len(index)), index=index)
df = DataFrame.from_dict({"s1": s1, "s2": s2})
grouped = df.groupby(df.index.month)
list(grouped)
def test_agg_dict_parameter_cast_result_dtypes():
# GH 12821
df = DataFrame(
{
"class": ["A", "A", "B", "B", "C", "C", "D", "D"],
"time": date_range("1/1/2011", periods=8, freq="h"),
}
)
df.loc[[0, 1, 2, 5], "time"] = None
# test for `first` function
exp = df.loc[[0, 3, 4, 6]].set_index("class")
grouped = df.groupby("class")
tm.assert_frame_equal(grouped.first(), exp)
tm.assert_frame_equal(grouped.agg("first"), exp)
tm.assert_frame_equal(grouped.agg({"time": "first"}), exp)
tm.assert_series_equal(grouped.time.first(), exp["time"])
tm.assert_series_equal(grouped.time.agg("first"), exp["time"])
# test for `last` function
exp = df.loc[[0, 3, 4, 7]].set_index("class")
grouped = df.groupby("class")
tm.assert_frame_equal(grouped.last(), exp)
tm.assert_frame_equal(grouped.agg("last"), exp)
tm.assert_frame_equal(grouped.agg({"time": "last"}), exp)
tm.assert_series_equal(grouped.time.last(), exp["time"])
tm.assert_series_equal(grouped.time.agg("last"), exp["time"])
# count
exp = Series([2, 2, 2, 2], index=Index(list("ABCD"), name="class"), name="time")
tm.assert_series_equal(grouped.time.agg(len), exp)
tm.assert_series_equal(grouped.time.size(), exp)
exp = Series([0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time")
tm.assert_series_equal(grouped.time.count(), exp)
def test_agg_cast_results_dtypes():
# similar to GH12821
# xref #11444
u = [dt.datetime(2015, x + 1, 1) for x in range(12)]
v = list("aaabbbbbbccd")
df = DataFrame({"X": v, "Y": u})
result = df.groupby("X")["Y"].agg(len)
expected = df.groupby("X")["Y"].count()
tm.assert_series_equal(result, expected)
def test_aggregate_float64_no_int64():
# see gh-11199
df = DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]})
expected = DataFrame({"a": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5])
expected.index.name = "b"
result = df.groupby("b")[["a"]].mean()
tm.assert_frame_equal(result, expected)
expected = DataFrame({"a": [1, 2.5, 4, 5], "c": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5])
expected.index.name = "b"
result = df.groupby("b")[["a", "c"]].mean()
tm.assert_frame_equal(result, expected)
def test_aggregate_api_consistency():
# GH 9052
# make sure that the aggregates via dict
# are consistent
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
"C": np.random.default_rng(2).standard_normal(8) + 1.0,
"D": np.arange(8),
}
)
grouped = df.groupby(["A", "B"])
c_mean = grouped["C"].mean()
c_sum = grouped["C"].sum()
d_mean = grouped["D"].mean()
d_sum = grouped["D"].sum()
result = grouped["D"].agg(["sum", "mean"])
expected = pd.concat([d_sum, d_mean], axis=1)
expected.columns = ["sum", "mean"]
tm.assert_frame_equal(result, expected, check_like=True)
result = grouped.agg(["sum", "mean"])
expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]])
tm.assert_frame_equal(result, expected, check_like=True)
result = grouped[["D", "C"]].agg(["sum", "mean"])
expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]])
tm.assert_frame_equal(result, expected, check_like=True)
result = grouped.agg({"C": "mean", "D": "sum"})
expected = pd.concat([d_sum, c_mean], axis=1)
tm.assert_frame_equal(result, expected, check_like=True)
result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]})
expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]])
msg = r"Column\(s\) \['r', 'r2'\] do not exist"
with pytest.raises(KeyError, match=msg):
grouped[["D", "C"]].agg({"r": "sum", "r2": "mean"})
def test_agg_dict_renaming_deprecation():
# 15931
df = DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)})
msg = r"nested renamer is not supported"
with pytest.raises(SpecificationError, match=msg):
df.groupby("A").agg(
{"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}}
)
msg = r"Column\(s\) \['ma'\] do not exist"
with pytest.raises(KeyError, match=msg):
df.groupby("A")[["B", "C"]].agg({"ma": "max"})
msg = r"nested renamer is not supported"
with pytest.raises(SpecificationError, match=msg):
df.groupby("A").B.agg({"foo": "count"})
def test_agg_compat():
# GH 12334
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
"C": np.random.default_rng(2).standard_normal(8) + 1.0,
"D": np.arange(8),
}
)
g = df.groupby(["A", "B"])
msg = r"nested renamer is not supported"
with pytest.raises(SpecificationError, match=msg):
g["D"].agg({"C": ["sum", "std"]})
with pytest.raises(SpecificationError, match=msg):
g["D"].agg({"C": "sum", "D": "std"})
def test_agg_nested_dicts():
# API change for disallowing these types of nested dicts
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "two", "two", "two", "one", "two"],
"C": np.random.default_rng(2).standard_normal(8) + 1.0,
"D": np.arange(8),
}
)
g = df.groupby(["A", "B"])
msg = r"nested renamer is not supported"
with pytest.raises(SpecificationError, match=msg):
g.aggregate({"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}})
with pytest.raises(SpecificationError, match=msg):
g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}})
# same name as the original column
# GH9052
with pytest.raises(SpecificationError, match=msg):
g["D"].agg({"result1": np.sum, "result2": np.mean})
with pytest.raises(SpecificationError, match=msg):
g["D"].agg({"D": np.sum, "result2": np.mean})
def test_agg_item_by_item_raise_typeerror():
df = DataFrame(np.random.default_rng(2).integers(10, size=(20, 10)))
def raiseException(df):
pprint_thing("----------------------------------------")
pprint_thing(df.to_string())
raise TypeError("test")
with pytest.raises(TypeError, match="test"):
df.groupby(0).agg(raiseException)
def test_series_agg_multikey():
ts = Series(
np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
)
grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
result = grouped.agg("sum")
expected = grouped.sum()
tm.assert_series_equal(result, expected)
def test_series_agg_multi_pure_python():
data = DataFrame(
{
"A": [
"foo",
"foo",
"foo",
"foo",
"bar",
"bar",
"bar",
"bar",
"foo",
"foo",
"foo",
],
"B": [
"one",
"one",
"one",
"two",
"one",
"one",
"one",
"two",
"two",
"two",
"one",
],
"C": [
"dull",
"dull",
"shiny",
"dull",
"dull",
"shiny",
"shiny",
"dull",
"shiny",
"shiny",
"shiny",
],
"D": np.random.default_rng(2).standard_normal(11),
"E": np.random.default_rng(2).standard_normal(11),
"F": np.random.default_rng(2).standard_normal(11),
}
)
def bad(x):
assert len(x.values.base) > 0
return "foo"
result = data.groupby(["A", "B"]).agg(bad)
expected = data.groupby(["A", "B"]).agg(lambda x: "foo")
tm.assert_frame_equal(result, expected)
def test_agg_consistency():
# agg with ([]) and () not consistent
# GH 6715
def P1(a):
return np.percentile(a.dropna(), q=1)
df = DataFrame(
{
"col1": [1, 2, 3, 4],
"col2": [10, 25, 26, 31],
"date": [
dt.date(2013, 2, 10),
dt.date(2013, 2, 10),
dt.date(2013, 2, 11),
dt.date(2013, 2, 11),
],
}
)
g = df.groupby("date")
expected = g.agg([P1])
expected.columns = expected.columns.levels[0]
result = g.agg(P1)
tm.assert_frame_equal(result, expected)
def test_agg_callables():
# GH 7929
df = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64)
class fn_class:
def __call__(self, x):
return sum(x)
equiv_callables = [
sum,
np.sum,
lambda x: sum(x),
lambda x: x.sum(),
partial(sum),
fn_class(),
]
expected = df.groupby("foo").agg("sum")
for ecall in equiv_callables:
warn = FutureWarning if ecall is sum or ecall is np.sum else None
msg = "using DataFrameGroupBy.sum"
with tm.assert_produces_warning(warn, match=msg):
result = df.groupby("foo").agg(ecall)
tm.assert_frame_equal(result, expected)
def test_agg_over_numpy_arrays():
# GH 3788
df = DataFrame(
[
[1, np.array([10, 20, 30])],
[1, np.array([40, 50, 60])],
[2, np.array([20, 30, 40])],
],
columns=["category", "arraydata"],
)
gb = df.groupby("category")
expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]]
expected_index = Index([1, 2], name="category")
expected_column = ["arraydata"]
expected = DataFrame(expected_data, index=expected_index, columns=expected_column)
alt = gb.sum(numeric_only=False)
tm.assert_frame_equal(alt, expected)
result = gb.agg("sum", numeric_only=False)
tm.assert_frame_equal(result, expected)
# FIXME: the original version of this test called `gb.agg(sum)`
# and that raises TypeError if `numeric_only=False` is passed
@pytest.mark.parametrize("as_period", [True, False])
def test_agg_tzaware_non_datetime_result(as_period):
# discussed in GH#29589, fixed in GH#29641, operating on tzaware values
# with a function that is not dtype-preserving
dti = date_range("2012-01-01", periods=4, tz="UTC")
if as_period:
dti = dti.tz_localize(None).to_period("D")
df = DataFrame({"a": [0, 0, 1, 1], "b": dti})
gb = df.groupby("a")
# Case that _does_ preserve the dtype
result = gb["b"].agg(lambda x: x.iloc[0])
expected = Series(dti[::2], name="b")
expected.index.name = "a"
tm.assert_series_equal(result, expected)
# Cases that do _not_ preserve the dtype
result = gb["b"].agg(lambda x: x.iloc[0].year)
expected = Series([2012, 2012], name="b")
expected.index.name = "a"
tm.assert_series_equal(result, expected)
result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0])
expected = Series([pd.Timedelta(days=1), pd.Timedelta(days=1)], name="b")
expected.index.name = "a"
if as_period:
expected = Series([pd.offsets.Day(1), pd.offsets.Day(1)], name="b")
expected.index.name = "a"
tm.assert_series_equal(result, expected)
def test_agg_timezone_round_trip():
# GH 15426
ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific")
df = DataFrame({"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in range(10)]})
result1 = df.groupby("a")["b"].agg("min").iloc[0]
result2 = df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0]
result3 = df.groupby("a")["b"].min().iloc[0]
assert result1 == ts
assert result2 == ts
assert result3 == ts
dates = [
pd.Timestamp(f"2016-01-0{i:d} 12:00:00", tz="US/Pacific") for i in range(1, 5)
]
df = DataFrame({"A": ["a", "b"] * 2, "B": dates})
grouped = df.groupby("A")
ts = df["B"].iloc[0]
assert ts == grouped.nth(0)["B"].iloc[0]
assert ts == grouped.head(1)["B"].iloc[0]
assert ts == grouped.first()["B"].iloc[0]
# GH#27110 applying iloc should return a DataFrame
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1]
ts = df["B"].iloc[2]
assert ts == grouped.last()["B"].iloc[0]
# GH#27110 applying iloc should return a DataFrame
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1]
def test_sum_uint64_overflow():
# see gh-14758
# Convert to uint64 and don't overflow
df = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object)
df = df + 9223372036854775807
index = Index(
[9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=np.uint64
)
expected = DataFrame(
{1: [9223372036854775809, 9223372036854775811, 9223372036854775813]},
index=index,
dtype=object,
)
expected.index.name = 0
result = df.groupby(0).sum(numeric_only=False)
tm.assert_frame_equal(result, expected)
# the column is non-numeric, so with numeric_only=True it is dropped
result2 = df.groupby(0).sum(numeric_only=True)
expected2 = expected[[]]
tm.assert_frame_equal(result2, expected2)
@pytest.mark.parametrize(
"structure, expected",
[
(tuple, DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})),
(list, DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})),
(
lambda x: tuple(x),
DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}),
),
(
lambda x: list(x),
DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}),
),
],
)
def test_agg_structs_dataframe(structure, expected):
df = DataFrame(
{"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
)
result = df.groupby(["A", "B"]).aggregate(structure)
expected.index.names = ["A", "B"]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"structure, expected",
[
(tuple, Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
(list, Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
(lambda x: tuple(x), Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
(lambda x: list(x), Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
],
)
def test_agg_structs_series(structure, expected):
# Issue #18079
df = DataFrame(
{"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
)
result = df.groupby("A")["C"].aggregate(structure)
expected.index.name = "A"
tm.assert_series_equal(result, expected)
def test_agg_category_nansum(observed):
categories = ["a", "b", "c"]
df = DataFrame(
{"A": pd.Categorical(["a", "a", "b"], categories=categories), "B": [1, 2, 3]}
)
msg = "using SeriesGroupBy.sum"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.groupby("A", observed=observed).B.agg(np.nansum)
expected = Series(
[3, 3, 0],
index=pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A"),
name="B",
)
if observed:
expected = expected[expected != 0]
tm.assert_series_equal(result, expected)
def test_agg_list_like_func():
# GH 18473
df = DataFrame({"A": [str(x) for x in range(3)], "B": [str(x) for x in range(3)]})
grouped = df.groupby("A", as_index=False, sort=False)
result = grouped.agg({"B": lambda x: list(x)})
expected = DataFrame(
{"A": [str(x) for x in range(3)], "B": [[str(x)] for x in range(3)]}
)
tm.assert_frame_equal(result, expected)
def test_agg_lambda_with_timezone():
# GH 23683
df = DataFrame(
{
"tag": [1, 1],
"date": [
pd.Timestamp("2018-01-01", tz="UTC"),
pd.Timestamp("2018-01-02", tz="UTC"),
],
}
)
result = df.groupby("tag").agg({"date": lambda e: e.head(1)})
expected = DataFrame(
[pd.Timestamp("2018-01-01", tz="UTC")],
index=Index([1], name="tag"),
columns=["date"],
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"err_cls",
[
NotImplementedError,
RuntimeError,
KeyError,
IndexError,
OSError,
ValueError,
ArithmeticError,
AttributeError,
],
)
def test_groupby_agg_err_catching(err_cls):
# make sure we suppress anything other than TypeError or AssertionError
# in _python_agg_general
# Use a non-standard EA to make sure we don't go down ndarray paths
from pandas.tests.extension.decimal.array import (
DecimalArray,
make_data,
to_decimal,
)
data = make_data()[:5]
df = DataFrame(
{"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)}
)
expected = Series(to_decimal([data[0], data[3]]))
def weird_func(x):
# weird function that raises something other than TypeError or AssertionError
# in _python_agg_general
if len(x) == 0:
raise err_cls
return x.iloc[0]
result = df["decimals"].groupby(df["id1"]).agg(weird_func)
tm.assert_series_equal(result, expected, check_names=False)


@@ -0,0 +1,208 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
Series,
date_range,
)
from pandas.core.groupby.base import (
reduction_kernels,
transformation_kernels,
)
@pytest.fixture(params=[True, False])
def sort(request):
return request.param
@pytest.fixture(params=[True, False])
def as_index(request):
return request.param
@pytest.fixture(params=[True, False])
def dropna(request):
return request.param
@pytest.fixture(params=[True, False])
def observed(request):
return request.param
@pytest.fixture
def df():
return DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "three", "two", "two", "one", "three"],
"C": np.random.default_rng(2).standard_normal(8),
"D": np.random.default_rng(2).standard_normal(8),
}
)
@pytest.fixture
def ts():
return Series(
np.random.default_rng(2).standard_normal(30),
index=date_range("2000-01-01", periods=30, freq="B"),
)
@pytest.fixture
def tsframe():
return DataFrame(
np.random.default_rng(2).standard_normal((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=date_range("2000-01-01", periods=30, freq="B"),
)
@pytest.fixture
def three_group():
return DataFrame(
{
"A": [
"foo",
"foo",
"foo",
"foo",
"bar",
"bar",
"bar",
"bar",
"foo",
"foo",
"foo",
],
"B": [
"one",
"one",
"one",
"two",
"one",
"one",
"one",
"two",
"two",
"two",
"one",
],
"C": [
"dull",
"dull",
"shiny",
"dull",
"dull",
"shiny",
"shiny",
"dull",
"shiny",
"shiny",
"shiny",
],
"D": np.random.default_rng(2).standard_normal(11),
"E": np.random.default_rng(2).standard_normal(11),
"F": np.random.default_rng(2).standard_normal(11),
}
)
@pytest.fixture()
def slice_test_df():
data = [
[0, "a", "a0_at_0"],
[1, "b", "b0_at_1"],
[2, "a", "a1_at_2"],
[3, "b", "b1_at_3"],
[4, "c", "c0_at_4"],
[5, "a", "a2_at_5"],
[6, "a", "a3_at_6"],
[7, "a", "a4_at_7"],
]
df = DataFrame(data, columns=["Index", "Group", "Value"])
return df.set_index("Index")
@pytest.fixture()
def slice_test_grouped(slice_test_df):
return slice_test_df.groupby("Group", as_index=False)
@pytest.fixture(params=sorted(reduction_kernels))
def reduction_func(request):
"""
yields the string names of all groupby reduction functions, one at a time.
"""
return request.param
@pytest.fixture(params=sorted(transformation_kernels))
def transformation_func(request):
"""yields the string names of all groupby transformation functions."""
return request.param
@pytest.fixture(params=sorted(reduction_kernels) + sorted(transformation_kernels))
def groupby_func(request):
"""yields both aggregation and transformation functions."""
return request.param
@pytest.fixture(params=[True, False])
def parallel(request):
"""parallel keyword argument for numba.jit"""
return request.param
# Can parameterize nogil & nopython over True | False, but limiting per
# https://github.com/pandas-dev/pandas/pull/41971#issuecomment-860607472
@pytest.fixture(params=[False])
def nogil(request):
"""nogil keyword argument for numba.jit"""
return request.param
@pytest.fixture(params=[True])
def nopython(request):
"""nopython keyword argument for numba.jit"""
return request.param
@pytest.fixture(
params=[
("mean", {}),
("var", {"ddof": 1}),
("var", {"ddof": 0}),
("std", {"ddof": 1}),
("std", {"ddof": 0}),
("sum", {}),
("min", {}),
("max", {}),
("sum", {"min_count": 2}),
("min", {"min_count": 2}),
("max", {"min_count": 2}),
],
ids=[
"mean",
"var_1",
"var_0",
"std_1",
"std_0",
"sum",
"min",
"max",
"sum-min_count",
"min-min_count",
"max-min_count",
],
)
def numba_supported_reductions(request):
"""reductions supported with engine='numba'"""
return request.param


@@ -0,0 +1,24 @@
import numpy as np
from pandas import (
DataFrame,
Index,
Series,
)
import pandas._testing as tm
def test_corrwith_with_1_axis():
# GH 47723
df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]})
gb = df.groupby("a")
msg = "DataFrameGroupBy.corrwith with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = gb.corrwith(df, axis=1)
index = Index(
data=[(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)],
name=("a", None),
)
expected = Series([np.nan] * 6, index=index)
tm.assert_series_equal(result, expected)


@@ -0,0 +1,297 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
Series,
Timestamp,
date_range,
)
import pandas._testing as tm
def test_apply_describe_bug(multiindex_dataframe_random_data):
grouped = multiindex_dataframe_random_data.groupby(level="first")
grouped.describe() # it works!
def test_series_describe_multikey():
ts = Series(
np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
)
grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
result = grouped.describe()
tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False)
tm.assert_series_equal(result["std"], grouped.std(), check_names=False)
tm.assert_series_equal(result["min"], grouped.min(), check_names=False)
def test_series_describe_single():
ts = Series(
np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
)
grouped = ts.groupby(lambda x: x.month)
result = grouped.apply(lambda x: x.describe())
expected = grouped.describe().stack(future_stack=True)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]])
def test_series_describe_as_index(as_index, keys):
# GH#49256
df = DataFrame(
{
"key1": ["one", "two", "two", "three", "two"],
"key2": ["one", "two", "two", "three", "two"],
"foo2": [1, 2, 4, 4, 6],
}
)
gb = df.groupby(keys, as_index=as_index)["foo2"]
result = gb.describe()
expected = DataFrame(
{
"key1": ["one", "three", "two"],
"count": [1.0, 1.0, 3.0],
"mean": [1.0, 4.0, 4.0],
"std": [np.nan, np.nan, 2.0],
"min": [1.0, 4.0, 2.0],
"25%": [1.0, 4.0, 3.0],
"50%": [1.0, 4.0, 4.0],
"75%": [1.0, 4.0, 5.0],
"max": [1.0, 4.0, 6.0],
}
)
if len(keys) == 2:
expected.insert(1, "key2", expected["key1"])
if as_index:
expected = expected.set_index(keys)
tm.assert_frame_equal(result, expected)
def test_frame_describe_multikey(tsframe):
grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
result = grouped.describe()
desc_groups = []
for col in tsframe:
group = grouped[col].describe()
# GH 17464 - Remove duplicate MultiIndex levels
group_col = MultiIndex(
levels=[[col], group.columns],
codes=[[0] * len(group.columns), range(len(group.columns))],
)
group = DataFrame(group.values, columns=group_col, index=group.index)
desc_groups.append(group)
expected = pd.concat(desc_groups, axis=1)
tm.assert_frame_equal(result, expected)
msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1)
result = groupedT.describe()
expected = tsframe.describe().T
# reverting the change from https://github.com/pandas-dev/pandas/pull/35441/
expected.index = MultiIndex(
levels=[[0, 1], expected.index],
codes=[[0, 0, 1, 1], range(len(expected.index))],
)
tm.assert_frame_equal(result, expected)
def test_frame_describe_tupleindex():
# GH 14848 - regression from 0.19.0 to 0.19.1
df1 = DataFrame(
{
"x": [1, 2, 3, 4, 5] * 3,
"y": [10, 20, 30, 40, 50] * 3,
"z": [100, 200, 300, 400, 500] * 3,
}
)
df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
df2 = df1.rename(columns={"k": "key"})
msg = "Names should be list-like for a MultiIndex"
with pytest.raises(ValueError, match=msg):
df1.groupby("k").describe()
with pytest.raises(ValueError, match=msg):
df2.groupby("key").describe()
def test_frame_describe_unstacked_format():
# GH 4792
prices = {
Timestamp("2011-01-06 10:59:05", tz=None): 24990,
Timestamp("2011-01-06 12:43:33", tz=None): 25499,
Timestamp("2011-01-06 12:54:09", tz=None): 25499,
}
volumes = {
Timestamp("2011-01-06 10:59:05", tz=None): 1500000000,
Timestamp("2011-01-06 12:43:33", tz=None): 5000000000,
Timestamp("2011-01-06 12:54:09", tz=None): 100000000,
}
df = DataFrame({"PRICE": prices, "VOLUME": volumes})
result = df.groupby("PRICE").VOLUME.describe()
data = [
df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
df[df.PRICE == 25499].VOLUME.describe().values.tolist(),
]
expected = DataFrame(
data,
index=Index([24990, 25499], name="PRICE"),
columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
)
tm.assert_frame_equal(result, expected)
@pytest.mark.filterwarnings(
"ignore:"
"indexing past lexsort depth may impact performance:"
"pandas.errors.PerformanceWarning"
)
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_describe_with_duplicate_output_column_names(as_index, keys):
# GH 35314
df = DataFrame(
{
"a1": [99, 99, 99, 88, 88, 88],
"a2": [99, 99, 99, 88, 88, 88],
"b": [1, 2, 3, 4, 5, 6],
"c": [10, 20, 30, 40, 50, 60],
},
columns=["a1", "a2", "b", "b"],
copy=False,
)
if keys == ["a1"]:
df = df.drop(columns="a2")
expected = (
DataFrame.from_records(
[
("b", "count", 3.0, 3.0),
("b", "mean", 5.0, 2.0),
("b", "std", 1.0, 1.0),
("b", "min", 4.0, 1.0),
("b", "25%", 4.5, 1.5),
("b", "50%", 5.0, 2.0),
("b", "75%", 5.5, 2.5),
("b", "max", 6.0, 3.0),
("b", "count", 3.0, 3.0),
("b", "mean", 5.0, 2.0),
("b", "std", 1.0, 1.0),
("b", "min", 4.0, 1.0),
("b", "25%", 4.5, 1.5),
("b", "50%", 5.0, 2.0),
("b", "75%", 5.5, 2.5),
("b", "max", 6.0, 3.0),
],
)
.set_index([0, 1])
.T
)
expected.columns.names = [None, None]
if len(keys) == 2:
expected.index = MultiIndex(
levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"]
)
else:
expected.index = Index([88, 99], name="a1")
if not as_index:
expected = expected.reset_index()
result = df.groupby(keys, as_index=as_index).describe()
tm.assert_frame_equal(result, expected)
def test_describe_duplicate_columns():
# GH#50806
df = DataFrame([[0, 1, 2, 3]])
df.columns = [0, 1, 2, 0]
gb = df.groupby(df[1])
result = gb.describe(percentiles=[])
columns = ["count", "mean", "std", "min", "50%", "max"]
frames = [
DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns)
for val in (0.0, 2.0, 3.0)
]
expected = pd.concat(frames, axis=1)
expected.columns = MultiIndex(
levels=[[0, 2], columns],
codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))],
)
expected.index.names = [1]
tm.assert_frame_equal(result, expected)
class TestGroupByNonCythonPaths:
# GH#5610 non-cython calls should not include the grouper
# Tests for code not expected to go through cython paths.
@pytest.fixture
def df(self):
df = DataFrame(
[[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]],
columns=["A", "B", "C"],
)
return df
@pytest.fixture
def gb(self, df):
gb = df.groupby("A")
return gb
@pytest.fixture
def gni(self, df):
gni = df.groupby("A", as_index=False)
return gni
def test_describe(self, df, gb, gni):
# describe
expected_index = Index([1, 3], name="A")
expected_col = MultiIndex(
levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]],
codes=[[0] * 8, list(range(8))],
)
expected = DataFrame(
[
[1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
[0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
],
index=expected_index,
columns=expected_col,
)
result = gb.describe()
tm.assert_frame_equal(result, expected)
expected = expected.reset_index()
result = gni.describe()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype", [int, float, object])
@pytest.mark.parametrize(
"kwargs",
[
{"percentiles": [0.10, 0.20, 0.30], "include": "all", "exclude": None},
{"percentiles": [0.10, 0.20, 0.30], "include": None, "exclude": ["int"]},
{"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None},
],
)
def test_groupby_empty_dataset(dtype, kwargs):
# GH#41575
df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype)
df["B"] = df["B"].astype(int)
df["C"] = df["C"].astype(float)
result = df.iloc[:0].groupby("A").describe(**kwargs)
expected = df.groupby("A").describe(**kwargs).reset_index(drop=True).iloc[:0]
tm.assert_frame_equal(result, expected)
result = df.iloc[:0].groupby("A").B.describe(**kwargs)
expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0]
expected.index = Index([])
tm.assert_frame_equal(result, expected)


@@ -0,0 +1,255 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
NaT,
Series,
Timedelta,
Timestamp,
date_range,
)
import pandas._testing as tm
def test_group_shift_with_null_key():
# This test is designed to replicate the segfault in issue #13813.
n_rows = 1200
# Generate a moderately large dataframe with occasional missing
# values in column `B`, and then group by [`A`, `B`]. This should
# force `-1` in the `labels` array of `g._grouper.group_info` exactly
# at those places where the group-by key is partially missing.
df = DataFrame(
[(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)],
dtype=float,
columns=["A", "B", "Z"],
index=None,
)
g = df.groupby(["A", "B"])
expected = DataFrame(
[(i + 12 if i % 3 and i < n_rows - 12 else np.nan) for i in range(n_rows)],
dtype=float,
columns=["Z"],
index=None,
)
result = g.shift(-1)
tm.assert_frame_equal(result, expected)
def test_group_shift_with_fill_value():
# GH #24128
n_rows = 24
df = DataFrame(
[(i % 12, i % 3, i) for i in range(n_rows)],
dtype=float,
columns=["A", "B", "Z"],
index=None,
)
g = df.groupby(["A", "B"])
expected = DataFrame(
[(i + 12 if i < n_rows - 12 else 0) for i in range(n_rows)],
dtype=float,
columns=["Z"],
index=None,
)
result = g.shift(-1, fill_value=0)
tm.assert_frame_equal(result, expected)
def test_group_shift_lose_timezone():
# GH 30134
now_dt = Timestamp.utcnow().as_unit("ns")
df = DataFrame({"a": [1, 1], "date": now_dt})
result = df.groupby("a").shift(0).iloc[0]
expected = Series({"date": now_dt}, name=result.name)
tm.assert_series_equal(result, expected)
def test_group_diff_real_series(any_real_numpy_dtype):
df = DataFrame(
{"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]},
dtype=any_real_numpy_dtype,
)
result = df.groupby("a")["b"].diff()
exp_dtype = "float"
if any_real_numpy_dtype in ["int8", "int16", "float32"]:
exp_dtype = "float32"
expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b")
tm.assert_series_equal(result, expected)
def test_group_diff_real_frame(any_real_numpy_dtype):
df = DataFrame(
{
"a": [1, 2, 3, 3, 2],
"b": [1, 2, 3, 4, 5],
"c": [1, 2, 3, 4, 6],
},
dtype=any_real_numpy_dtype,
)
result = df.groupby("a").diff()
exp_dtype = "float"
if any_real_numpy_dtype in ["int8", "int16", "float32"]:
exp_dtype = "float32"
expected = DataFrame(
{
"b": [np.nan, np.nan, np.nan, 1.0, 3.0],
"c": [np.nan, np.nan, np.nan, 1.0, 4.0],
},
dtype=exp_dtype,
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data",
[
[
Timestamp("2013-01-01"),
Timestamp("2013-01-02"),
Timestamp("2013-01-03"),
],
[Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")],
],
)
def test_group_diff_datetimelike(data, unit):
df = DataFrame({"a": [1, 2, 2], "b": data})
df["b"] = df["b"].dt.as_unit(unit)
result = df.groupby("a")["b"].diff()
expected = Series([NaT, NaT, Timedelta("1 days")], name="b").dt.as_unit(unit)
tm.assert_series_equal(result, expected)
def test_group_diff_bool():
df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})
result = df.groupby("a")["b"].diff()
expected = Series([np.nan, np.nan, np.nan, False, False], name="b")
tm.assert_series_equal(result, expected)
def test_group_diff_object_raises(object_dtype):
df = DataFrame(
{"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype
)
with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"):
df.groupby("a")["b"].diff()
def test_empty_shift_with_fill():
# GH 41264, single-index check
df = DataFrame(columns=["a", "b", "c"])
shifted = df.groupby(["a"]).shift(1)
shifted_with_fill = df.groupby(["a"]).shift(1, fill_value=0)
tm.assert_frame_equal(shifted, shifted_with_fill)
tm.assert_index_equal(shifted.index, shifted_with_fill.index)
def test_multindex_empty_shift_with_fill():
# GH 41264, multi-index check
df = DataFrame(columns=["a", "b", "c"])
shifted = df.groupby(["a", "b"]).shift(1)
shifted_with_fill = df.groupby(["a", "b"]).shift(1, fill_value=0)
tm.assert_frame_equal(shifted, shifted_with_fill)
tm.assert_index_equal(shifted.index, shifted_with_fill.index)
def test_shift_periods_freq():
# GH 54093
data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
df = DataFrame(data, index=date_range(start="20100101", periods=6))
result = df.groupby(df.index).shift(periods=-2, freq="D")
expected = DataFrame(data, index=date_range(start="2009-12-30", periods=6))
tm.assert_frame_equal(result, expected)
def test_shift_deprecate_freq_and_fill_value():
# GH 53832
data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
df = DataFrame(data, index=date_range(start="20100101", periods=6))
msg = (
"Passing a 'freq' together with a 'fill_value' silently ignores the fill_value"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
df.groupby(df.index).shift(periods=-2, freq="D", fill_value="1")
def test_shift_disallow_suffix_if_periods_is_int():
# GH#44424
data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
df = DataFrame(data)
msg = "Cannot specify `suffix` if `periods` is an int."
with pytest.raises(ValueError, match=msg):
df.groupby("b").shift(1, suffix="fails")
def test_group_shift_with_multiple_periods():
# GH#44424
df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})
shifted_df = df.groupby("b")[["a"]].shift([0, 1])
expected_df = DataFrame(
{"a_0": [1, 2, 3, 3, 2], "a_1": [np.nan, 1.0, np.nan, 3.0, 2.0]}
)
tm.assert_frame_equal(shifted_df, expected_df)
    # the SeriesGroupBy path should return the same suffixed DataFrame
shifted_series = df.groupby("b")["a"].shift([0, 1])
tm.assert_frame_equal(shifted_series, expected_df)
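# --- Illustrative sketch; not part of the original test file ---
# Passing list-like `periods` to groupby().shift() (pandas >= 2.1, as
# exercised above) returns one column per period, suffixed "_<period>",
# which is where "a_0"/"a_1" come from. _demo_multi_period_shift is a
# hypothetical helper.
def _demo_multi_period_shift():
    df = DataFrame({"g": [1, 1, 2], "a": [10, 20, 30]})
    out = df.groupby("g")["a"].shift([0, 1])
    # even the SeriesGroupBy path returns a DataFrame, one column per period
    assert list(out.columns) == ["a_0", "a_1"]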
def test_group_shift_with_multiple_periods_and_freq():
# GH#44424
df = DataFrame(
{"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
index=date_range("1/1/2000", periods=5, freq="h"),
)
shifted_df = df.groupby("b")[["a"]].shift(
[0, 1],
freq="h",
)
expected_df = DataFrame(
{
"a_0": [1.0, 2.0, 3.0, 4.0, 5.0, np.nan],
"a_1": [
np.nan,
1.0,
2.0,
3.0,
4.0,
5.0,
],
},
index=date_range("1/1/2000", periods=6, freq="h"),
)
tm.assert_frame_equal(shifted_df, expected_df)
def test_group_shift_with_multiple_periods_and_fill_value():
# GH#44424
df = DataFrame(
{"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
)
shifted_df = df.groupby("b")[["a"]].shift([0, 1], fill_value=-1)
expected_df = DataFrame(
{"a_0": [1, 2, 3, 4, 5], "a_1": [-1, 1, -1, 3, 2]},
)
tm.assert_frame_equal(shifted_df, expected_df)
def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated():
# GH#44424
df = DataFrame(
{"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
index=date_range("1/1/2000", periods=5, freq="h"),
)
msg = (
"Passing a 'freq' together with a 'fill_value' silently ignores the "
"fill_value"
)
with tm.assert_produces_warning(FutureWarning, match=msg):
df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="h")

View File

@ -0,0 +1,78 @@
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
Series,
)
import pandas._testing as tm
@pytest.mark.parametrize(
"in_vals, out_vals",
[
# Basics: strictly increasing (T), strictly decreasing (F),
# abs val increasing (F), non-strictly increasing (T)
([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]),
# Test with inf vals
(
[1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
[True, False, True, False],
),
# Test with nan vals; should always be False
(
[1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
[False, False, False, False],
),
],
)
def test_is_monotonic_increasing(in_vals, out_vals):
# GH 17015
source_dict = {
"A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
"B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
"C": in_vals,
}
df = DataFrame(source_dict)
result = df.groupby("B").C.is_monotonic_increasing
index = Index(list("abcd"), name="B")
expected = Series(index=index, data=out_vals, name="C")
tm.assert_series_equal(result, expected)
# Also check result equal to manually taking x.is_monotonic_increasing.
expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing)
tm.assert_series_equal(result, expected)
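# --- Illustrative sketch; not part of the original test file ---
# SeriesGroupBy.is_monotonic_increasing evaluates one boolean per group and
# matches applying Series.is_monotonic_increasing group by group; ties count
# as monotonic, NaNs do not. _demo_is_monotonic is a hypothetical helper.
def _demo_is_monotonic():
    ser = Series([1, 2, 2, 3, 1], index=["a", "a", "a", "b", "b"])
    out = ser.groupby(level=0).is_monotonic_increasing
    # group "a" = [1, 2, 2] is non-strictly increasing; group "b" = [3, 1] is not
    assert out.tolist() == [True, False]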
@pytest.mark.parametrize(
"in_vals, out_vals",
[
# Basics: strictly decreasing (T), strictly increasing (F),
        # abs val decreasing (F), non-strictly decreasing (T)
([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]),
# Test with inf vals
(
[np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
[True, True, False, True],
),
# Test with nan vals; should always be False
(
[1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
[False, False, False, False],
),
],
)
def test_is_monotonic_decreasing(in_vals, out_vals):
# GH 17015
source_dict = {
"A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
"B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
"C": in_vals,
}
df = DataFrame(source_dict)
result = df.groupby("B").C.is_monotonic_decreasing
index = Index(list("abcd"), name="B")
expected = Series(index=index, data=out_vals, name="C")
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,115 @@
import numpy as np
import pytest
from pandas import (
MultiIndex,
Series,
date_range,
)
import pandas._testing as tm
def test_nlargest():
a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
b = Series(list("a" * 5 + "b" * 5))
gb = a.groupby(b)
r = gb.nlargest(3)
e = Series(
[7, 5, 3, 10, 9, 6],
index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]),
)
tm.assert_series_equal(r, e)
a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
gb = a.groupby(b)
e = Series(
[3, 2, 1, 3, 3, 2],
index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]),
)
tm.assert_series_equal(gb.nlargest(3, keep="last"), e)
def test_nlargest_mi_grouper():
# see gh-21411
npr = np.random.default_rng(2)
dts = date_range("20180101", periods=10)
iterables = [dts, ["one", "two"]]
idx = MultiIndex.from_product(iterables, names=["first", "second"])
s = Series(npr.standard_normal(20), index=idx)
result = s.groupby("first").nlargest(1)
exp_idx = MultiIndex.from_tuples(
[
(dts[0], dts[0], "one"),
(dts[1], dts[1], "one"),
(dts[2], dts[2], "one"),
(dts[3], dts[3], "two"),
(dts[4], dts[4], "one"),
(dts[5], dts[5], "one"),
(dts[6], dts[6], "one"),
(dts[7], dts[7], "one"),
(dts[8], dts[8], "one"),
(dts[9], dts[9], "one"),
],
names=["first", "first", "second"],
)
exp_values = [
0.18905338179353307,
-0.41306354339189344,
1.799707382720902,
0.7738065867276614,
0.28121066979764925,
0.9775674511260357,
-0.3288239040579627,
0.45495807124085547,
0.5452887139646817,
0.12682784711186987,
]
expected = Series(exp_values, index=exp_idx)
tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3)
def test_nsmallest():
a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
b = Series(list("a" * 5 + "b" * 5))
gb = a.groupby(b)
r = gb.nsmallest(3)
e = Series(
[1, 2, 3, 0, 4, 6],
index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]),
)
tm.assert_series_equal(r, e)
a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
gb = a.groupby(b)
e = Series(
[0, 1, 1, 0, 1, 2],
index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]),
)
tm.assert_series_equal(gb.nsmallest(3, keep="last"), e)
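# --- Illustrative sketch; not part of the original test file ---
# groupby().nlargest/nsmallest return a MultiIndex of (group key, original
# position); keep="last" resolves ties in favor of later rows, which is why
# the tied positions differ between the two expectations above.
# _demo_nlargest is a hypothetical helper.
def _demo_nlargest():
    ser = Series([1, 3, 2, 5, 4])
    out = ser.groupby(["x", "x", "x", "y", "y"]).nlargest(1)
    assert out.tolist() == [3, 5]
    assert list(out.index) == [("x", 1), ("y", 3)]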
@pytest.mark.parametrize(
"data, groups",
[([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])],
)
@pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES])
@pytest.mark.parametrize("method", ["nlargest", "nsmallest"])
def test_nlargest_and_smallest_noop(data, groups, dtype, method):
# GH 15272, GH 16345, GH 29129
# Test nlargest/smallest when it results in a noop,
# i.e. input is sorted and group size <= n
if dtype is not None:
data = np.array(data, dtype=dtype)
if method == "nlargest":
data = list(reversed(data))
ser = Series(data, name="a")
result = getattr(ser.groupby(groups), method)(n=2)
expidx = np.array(groups, dtype=int) if isinstance(groups, list) else groups
expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a")
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,921 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
MultiIndex,
Series,
Timestamp,
isna,
)
import pandas._testing as tm
def test_first_last_nth(df):
# tests for first / last / nth
grouped = df.groupby("A")
first = grouped.first()
expected = df.loc[[1, 0], ["B", "C", "D"]]
expected.index = Index(["bar", "foo"], name="A")
expected = expected.sort_index()
tm.assert_frame_equal(first, expected)
nth = grouped.nth(0)
expected = df.loc[[0, 1]]
tm.assert_frame_equal(nth, expected)
last = grouped.last()
expected = df.loc[[5, 7], ["B", "C", "D"]]
expected.index = Index(["bar", "foo"], name="A")
tm.assert_frame_equal(last, expected)
nth = grouped.nth(-1)
expected = df.iloc[[5, 7]]
tm.assert_frame_equal(nth, expected)
nth = grouped.nth(1)
expected = df.iloc[[2, 3]]
tm.assert_frame_equal(nth, expected)
# it works!
grouped["B"].first()
grouped["B"].last()
grouped["B"].nth(0)
df = df.copy()
df.loc[df["A"] == "foo", "B"] = np.nan
grouped = df.groupby("A")
assert isna(grouped["B"].first()["foo"])
assert isna(grouped["B"].last()["foo"])
assert isna(grouped["B"].nth(0).iloc[0])
# v0.14.0 whatsnew
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
g = df.groupby("A")
result = g.first()
expected = df.iloc[[1, 2]].set_index("A")
tm.assert_frame_equal(result, expected)
expected = df.iloc[[1, 2]]
result = g.nth(0, dropna="any")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("method", ["first", "last"])
def test_first_last_with_na_object(method, nulls_fixture):
# https://github.com/pandas-dev/pandas/issues/32123
groups = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby("a")
result = getattr(groups, method)()
if method == "first":
values = [1, 3]
else:
values = [2, 3]
values = np.array(values, dtype=result["b"].dtype)
idx = Index([1, 2], name="a")
expected = DataFrame({"b": values}, index=idx)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("index", [0, -1])
def test_nth_with_na_object(index, nulls_fixture):
# https://github.com/pandas-dev/pandas/issues/32123
df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]})
groups = df.groupby("a")
result = groups.nth(index)
expected = df.iloc[[0, 2]] if index == 0 else df.iloc[[1, 3]]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("method", ["first", "last"])
def test_first_last_with_None(method):
# https://github.com/pandas-dev/pandas/issues/32800
# None should be preserved as object dtype
df = DataFrame.from_dict({"id": ["a"], "value": [None]})
groups = df.groupby("id", as_index=False)
result = getattr(groups, method)()
tm.assert_frame_equal(result, df)
@pytest.mark.parametrize("method", ["first", "last"])
@pytest.mark.parametrize(
"df, expected",
[
(
DataFrame({"id": "a", "value": [None, "foo", np.nan]}),
DataFrame({"value": ["foo"]}, index=Index(["a"], name="id")),
),
(
DataFrame({"id": "a", "value": [np.nan]}, dtype=object),
DataFrame({"value": [None]}, index=Index(["a"], name="id")),
),
],
)
def test_first_last_with_None_expanded(method, df, expected):
# GH 32800, 38286
result = getattr(df.groupby("id"), method)()
tm.assert_frame_equal(result, expected)
def test_first_last_nth_dtypes():
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "three", "two", "two", "one", "three"],
"C": np.random.default_rng(2).standard_normal(8),
"D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"),
}
)
df["E"] = True
df["F"] = 1
# tests for first / last / nth
grouped = df.groupby("A")
first = grouped.first()
expected = df.loc[[1, 0], ["B", "C", "D", "E", "F"]]
expected.index = Index(["bar", "foo"], name="A")
expected = expected.sort_index()
tm.assert_frame_equal(first, expected)
last = grouped.last()
expected = df.loc[[5, 7], ["B", "C", "D", "E", "F"]]
expected.index = Index(["bar", "foo"], name="A")
expected = expected.sort_index()
tm.assert_frame_equal(last, expected)
nth = grouped.nth(1)
expected = df.iloc[[2, 3]]
tm.assert_frame_equal(nth, expected)
def test_first_last_nth_dtypes2():
# GH 2763, first/last shifting dtypes
idx = list(range(10))
idx.append(9)
ser = Series(data=range(11), index=idx, name="IntCol")
assert ser.dtype == "int64"
f = ser.groupby(level=0).first()
assert f.dtype == "int64"
def test_first_last_nth_nan_dtype():
# GH 33591
df = DataFrame({"data": ["A"], "nans": Series([None], dtype=object)})
grouped = df.groupby("data")
expected = df.set_index("data").nans
tm.assert_series_equal(grouped.nans.first(), expected)
tm.assert_series_equal(grouped.nans.last(), expected)
expected = df.nans
tm.assert_series_equal(grouped.nans.nth(-1), expected)
tm.assert_series_equal(grouped.nans.nth(0), expected)
def test_first_strings_timestamps():
# GH 11244
test = DataFrame(
{
Timestamp("2012-01-01 00:00:00"): ["a", "b"],
Timestamp("2012-01-02 00:00:00"): ["c", "d"],
"name": ["e", "e"],
"aaaa": ["f", "g"],
}
)
result = test.groupby("name").first()
expected = DataFrame(
[["a", "c", "f"]],
columns=Index([Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]),
index=Index(["e"], name="name"),
)
tm.assert_frame_equal(result, expected)
def test_nth():
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
gb = df.groupby("A")
tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 2]])
tm.assert_frame_equal(gb.nth(1), df.iloc[[1]])
tm.assert_frame_equal(gb.nth(2), df.loc[[]])
tm.assert_frame_equal(gb.nth(-1), df.iloc[[1, 2]])
tm.assert_frame_equal(gb.nth(-2), df.iloc[[0]])
tm.assert_frame_equal(gb.nth(-3), df.loc[[]])
tm.assert_series_equal(gb.B.nth(0), df.B.iloc[[0, 2]])
tm.assert_series_equal(gb.B.nth(1), df.B.iloc[[1]])
tm.assert_frame_equal(gb[["B"]].nth(0), df[["B"]].iloc[[0, 2]])
tm.assert_frame_equal(gb.nth(0, dropna="any"), df.iloc[[1, 2]])
tm.assert_frame_equal(gb.nth(-1, dropna="any"), df.iloc[[1, 2]])
tm.assert_frame_equal(gb.nth(7, dropna="any"), df.iloc[:0])
tm.assert_frame_equal(gb.nth(2, dropna="any"), df.iloc[:0])
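# --- Illustrative sketch; not part of the original test file ---
# Since pandas 2.0, groupby().nth() acts as a filter: it returns the selected
# rows with their original index rather than aggregating to the group index,
# which is why the expectations above are plain .iloc selections.
# _demo_nth_is_a_filter is a hypothetical helper.
def _demo_nth_is_a_filter():
    df = DataFrame({"A": [1, 1, 5], "B": [7, 8, 9]})
    out = df.groupby("A").nth(0)
    # first row of each group, original row labels preserved
    tm.assert_frame_equal(out, df.iloc[[0, 2]])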
def test_nth2():
# out of bounds, regression from 0.13.1
# GH 6621
df = DataFrame(
{
"color": {0: "green", 1: "green", 2: "red", 3: "red", 4: "red"},
"food": {0: "ham", 1: "eggs", 2: "eggs", 3: "ham", 4: "pork"},
"two": {
0: 1.5456590000000001,
1: -0.070345000000000005,
2: -2.4004539999999999,
3: 0.46206000000000003,
4: 0.52350799999999997,
},
"one": {
0: 0.56573799999999996,
1: -0.9742360000000001,
2: 1.033801,
3: -0.78543499999999999,
4: 0.70422799999999997,
},
}
).set_index(["color", "food"])
result = df.groupby(level=0, as_index=False).nth(2)
expected = df.iloc[[-1]]
tm.assert_frame_equal(result, expected)
result = df.groupby(level=0, as_index=False).nth(3)
expected = df.loc[[]]
tm.assert_frame_equal(result, expected)
def test_nth3():
# GH 7559
# from the vbench
df = DataFrame(np.random.default_rng(2).integers(1, 10, (100, 2)), dtype="int64")
ser = df[1]
gb = df[0]
expected = ser.groupby(gb).first()
expected2 = ser.groupby(gb).apply(lambda x: x.iloc[0])
tm.assert_series_equal(expected2, expected, check_names=False)
assert expected.name == 1
assert expected2.name == 1
# validate first
v = ser[gb == 1].iloc[0]
assert expected.iloc[0] == v
assert expected2.iloc[0] == v
with pytest.raises(ValueError, match="For a DataFrame"):
ser.groupby(gb, sort=False).nth(0, dropna=True)
def test_nth4():
# doc example
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
gb = df.groupby("A")
result = gb.B.nth(0, dropna="all")
expected = df.B.iloc[[1, 2]]
tm.assert_series_equal(result, expected)
def test_nth5():
# test multiple nth values
df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"])
gb = df.groupby("A")
tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 3]])
tm.assert_frame_equal(gb.nth([0]), df.iloc[[0, 3]])
tm.assert_frame_equal(gb.nth([0, 1]), df.iloc[[0, 1, 3, 4]])
tm.assert_frame_equal(gb.nth([0, -1]), df.iloc[[0, 2, 3, 4]])
tm.assert_frame_equal(gb.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]])
tm.assert_frame_equal(gb.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]])
tm.assert_frame_equal(gb.nth([2]), df.iloc[[2]])
tm.assert_frame_equal(gb.nth([3, 4]), df.loc[[]])
def test_nth_bdays(unit):
business_dates = pd.date_range(
start="4/1/2014", end="6/30/2014", freq="B", unit=unit
)
df = DataFrame(1, index=business_dates, columns=["a", "b"])
# get the first, fourth and last two business days for each month
key = [df.index.year, df.index.month]
result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
expected_dates = pd.to_datetime(
[
"2014/4/1",
"2014/4/4",
"2014/4/29",
"2014/4/30",
"2014/5/1",
"2014/5/6",
"2014/5/29",
"2014/5/30",
"2014/6/2",
"2014/6/5",
"2014/6/27",
"2014/6/30",
]
).as_unit(unit)
expected = DataFrame(1, columns=["a", "b"], index=expected_dates)
tm.assert_frame_equal(result, expected)
def test_nth_multi_grouper(three_group):
# PR 9090, related to issue 8979
# test nth on multiple groupers
grouped = three_group.groupby(["A", "B"])
result = grouped.nth(0)
expected = three_group.iloc[[0, 3, 4, 7]]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"data, expected_first, expected_last",
[
(
{
"id": ["A"],
"time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
"foo": [1],
},
{
"id": ["A"],
"time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
"foo": [1],
},
{
"id": ["A"],
"time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
"foo": [1],
},
),
(
{
"id": ["A", "B", "A"],
"time": [
Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
Timestamp("2012-02-01 14:00:00", tz="US/Central"),
Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
],
"foo": [1, 2, 3],
},
{
"id": ["A", "B"],
"time": [
Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
Timestamp("2012-02-01 14:00:00", tz="US/Central"),
],
"foo": [1, 2],
},
{
"id": ["A", "B"],
"time": [
Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
Timestamp("2012-02-01 14:00:00", tz="US/Central"),
],
"foo": [3, 2],
},
),
],
)
def test_first_last_tz(data, expected_first, expected_last):
# GH15884
# Test that the timezone is retained when calling first
# or last on groupby with as_index=False
df = DataFrame(data)
result = df.groupby("id", as_index=False).first()
expected = DataFrame(expected_first)
cols = ["id", "time", "foo"]
tm.assert_frame_equal(result[cols], expected[cols])
result = df.groupby("id", as_index=False)["time"].first()
tm.assert_frame_equal(result, expected[["id", "time"]])
result = df.groupby("id", as_index=False).last()
expected = DataFrame(expected_last)
cols = ["id", "time", "foo"]
tm.assert_frame_equal(result[cols], expected[cols])
result = df.groupby("id", as_index=False)["time"].last()
tm.assert_frame_equal(result, expected[["id", "time"]])
@pytest.mark.parametrize(
"method, ts, alpha",
[
["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"],
["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"],
],
)
def test_first_last_tz_multi_column(method, ts, alpha, unit):
# GH 21603
category_string = Series(list("abc")).astype("category")
dti = pd.date_range("20130101", periods=3, tz="US/Eastern", unit=unit)
df = DataFrame(
{
"group": [1, 1, 2],
"category_string": category_string,
"datetimetz": dti,
}
)
result = getattr(df.groupby("group"), method)()
expected = DataFrame(
{
"category_string": pd.Categorical(
[alpha, "c"], dtype=category_string.dtype
),
"datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")],
},
index=Index([1, 2], name="group"),
)
expected["datetimetz"] = expected["datetimetz"].dt.as_unit(unit)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"values",
[
pd.array([True, False], dtype="boolean"),
pd.array([1, 2], dtype="Int64"),
pd.to_datetime(["2020-01-01", "2020-02-01"]),
pd.to_timedelta([1, 2], unit="D"),
],
)
@pytest.mark.parametrize("function", ["first", "last", "min", "max"])
def test_first_last_extension_array_keeps_dtype(values, function):
# https://github.com/pandas-dev/pandas/issues/33071
# https://github.com/pandas-dev/pandas/issues/32194
df = DataFrame({"a": [1, 2], "b": values})
grouped = df.groupby("a")
idx = Index([1, 2], name="a")
expected_series = Series(values, name="b", index=idx)
expected_frame = DataFrame({"b": values}, index=idx)
result_series = getattr(grouped["b"], function)()
tm.assert_series_equal(result_series, expected_series)
result_frame = grouped.agg({"b": function})
tm.assert_frame_equal(result_frame, expected_frame)
def test_nth_multi_index_as_expected():
# PR 9090, related to issue 8979
# test nth on MultiIndex
three_group = DataFrame(
{
"A": [
"foo",
"foo",
"foo",
"foo",
"bar",
"bar",
"bar",
"bar",
"foo",
"foo",
"foo",
],
"B": [
"one",
"one",
"one",
"two",
"one",
"one",
"one",
"two",
"two",
"two",
"one",
],
"C": [
"dull",
"dull",
"shiny",
"dull",
"dull",
"shiny",
"shiny",
"dull",
"shiny",
"shiny",
"shiny",
],
}
)
grouped = three_group.groupby(["A", "B"])
result = grouped.nth(0)
expected = three_group.iloc[[0, 3, 4, 7]]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"op, n, expected_rows",
[
("head", -1, [0]),
("head", 0, []),
("head", 1, [0, 2]),
("head", 7, [0, 1, 2]),
("tail", -1, [1]),
("tail", 0, []),
("tail", 1, [1, 2]),
("tail", 7, [0, 1, 2]),
],
)
@pytest.mark.parametrize("columns", [None, [], ["A"], ["B"], ["A", "B"]])
@pytest.mark.parametrize("as_index", [True, False])
def test_groupby_head_tail(op, n, expected_rows, columns, as_index):
df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
g = df.groupby("A", as_index=as_index)
expected = df.iloc[expected_rows]
if columns is not None:
g = g[columns]
expected = expected[columns]
result = getattr(g, op)(n)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"op, n, expected_cols",
[
("head", -1, [0]),
("head", 0, []),
("head", 1, [0, 2]),
("head", 7, [0, 1, 2]),
("tail", -1, [1]),
("tail", 0, []),
("tail", 1, [1, 2]),
("tail", 7, [0, 1, 2]),
],
)
def test_groupby_head_tail_axis_1(op, n, expected_cols):
# GH 9772
df = DataFrame(
[[1, 2, 3], [1, 4, 5], [2, 6, 7], [3, 8, 9]], columns=["A", "B", "C"]
)
msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
g = df.groupby([0, 0, 1], axis=1)
expected = df.iloc[:, expected_cols]
result = getattr(g, op)(n)
tm.assert_frame_equal(result, expected)
def test_group_selection_cache():
    # GH 12839: nth, head, and tail should consistently return the same result
df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
expected = df.iloc[[0, 2]]
g = df.groupby("A")
result1 = g.head(n=2)
result2 = g.nth(0)
tm.assert_frame_equal(result1, df)
tm.assert_frame_equal(result2, expected)
g = df.groupby("A")
result1 = g.tail(n=2)
result2 = g.nth(0)
tm.assert_frame_equal(result1, df)
tm.assert_frame_equal(result2, expected)
g = df.groupby("A")
result1 = g.nth(0)
result2 = g.head(n=2)
tm.assert_frame_equal(result1, expected)
tm.assert_frame_equal(result2, df)
g = df.groupby("A")
result1 = g.nth(0)
result2 = g.tail(n=2)
tm.assert_frame_equal(result1, expected)
tm.assert_frame_equal(result2, df)
def test_nth_empty():
# GH 16064
df = DataFrame(index=[0], columns=["a", "b", "c"])
result = df.groupby("a").nth(10)
expected = df.iloc[:0]
tm.assert_frame_equal(result, expected)
result = df.groupby(["a", "b"]).nth(10)
expected = df.iloc[:0]
tm.assert_frame_equal(result, expected)
def test_nth_column_order():
# GH 20760
# Check that nth preserves column order
df = DataFrame(
[[1, "b", 100], [1, "a", 50], [1, "a", np.nan], [2, "c", 200], [2, "d", 150]],
columns=["A", "C", "B"],
)
result = df.groupby("A").nth(0)
expected = df.iloc[[0, 3]]
tm.assert_frame_equal(result, expected)
result = df.groupby("A").nth(-1, dropna="any")
expected = df.iloc[[1, 4]]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dropna", [None, "any", "all"])
def test_nth_nan_in_grouper(dropna):
# GH 26011
df = DataFrame(
{
"a": [np.nan, "a", np.nan, "b", np.nan],
"b": [0, 2, 4, 6, 8],
"c": [1, 3, 5, 7, 9],
}
)
result = df.groupby("a").nth(0, dropna=dropna)
expected = df.iloc[[1, 3]]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dropna", [None, "any", "all"])
def test_nth_nan_in_grouper_series(dropna):
# GH 26454
df = DataFrame(
{
"a": [np.nan, "a", np.nan, "b", np.nan],
"b": [0, 2, 4, 6, 8],
}
)
result = df.groupby("a")["b"].nth(0, dropna=dropna)
expected = df["b"].iloc[[1, 3]]
tm.assert_series_equal(result, expected)
def test_first_categorical_and_datetime_data_nat():
# GH 20520
df = DataFrame(
{
"group": ["first", "first", "second", "third", "third"],
"time": 5 * [np.datetime64("NaT")],
"categories": Series(["a", "b", "c", "a", "b"], dtype="category"),
}
)
result = df.groupby("group").first()
expected = DataFrame(
{
"time": 3 * [np.datetime64("NaT")],
"categories": Series(["a", "c", "a"]).astype(
pd.CategoricalDtype(["a", "b", "c"])
),
}
)
expected.index = Index(["first", "second", "third"], name="group")
tm.assert_frame_equal(result, expected)
def test_first_multi_key_groupby_categorical():
# GH 22512
df = DataFrame(
{
"A": [1, 1, 1, 2, 2],
"B": [100, 100, 200, 100, 100],
"C": ["apple", "orange", "mango", "mango", "orange"],
"D": ["jupiter", "mercury", "mars", "venus", "venus"],
}
)
df = df.astype({"D": "category"})
result = df.groupby(by=["A", "B"]).first()
expected = DataFrame(
{
"C": ["apple", "mango", "mango"],
"D": Series(["jupiter", "mars", "venus"]).astype(
pd.CategoricalDtype(["jupiter", "mars", "mercury", "venus"])
),
}
)
expected.index = MultiIndex.from_tuples(
[(1, 100), (1, 200), (2, 100)], names=["A", "B"]
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("method", ["first", "last", "nth"])
def test_groupby_last_first_nth_with_none(method, nulls_fixture):
# GH29645
expected = Series(["y"])
data = Series(
[nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture],
index=[0, 0, 0, 0, 0],
).groupby(level=0)
if method == "nth":
result = getattr(data, method)(3)
else:
result = getattr(data, method)()
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"arg, expected_rows",
[
[slice(None, 3, 2), [0, 1, 4, 5]],
[slice(None, -2), [0, 2, 5]],
[[slice(None, 2), slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
[[0, 1, slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
],
)
def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows):
# Test slices GH #42947
result = slice_test_grouped.nth[arg]
equivalent = slice_test_grouped.nth(arg)
expected = slice_test_df.iloc[expected_rows]
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(equivalent, expected)
def test_nth_indexed(slice_test_df, slice_test_grouped):
# Test index notation GH #44688
result = slice_test_grouped.nth[0, 1, -2:]
equivalent = slice_test_grouped.nth([0, 1, slice(-2, None)])
expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(equivalent, expected)
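# --- Illustrative sketch; not part of the original test file ---
# GroupBy.nth also supports indexing syntax: gb.nth[a:b] and gb.nth[i, j, k:]
# are sugar for gb.nth(slice(a, b)) and gb.nth([i, j, slice(k, None)]).
# _demo_nth_indexing is a hypothetical helper.
def _demo_nth_indexing():
    df = DataFrame({"g": [1, 1, 1, 2, 2], "v": range(5)})
    gb = df.groupby("g")
    # both spellings select the first two rows of each group
    tm.assert_frame_equal(gb.nth[:2], gb.nth(slice(None, 2)))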
def test_invalid_argument(slice_test_grouped):
# Test for error on invalid argument
with pytest.raises(TypeError, match="Invalid index"):
slice_test_grouped.nth(3.14)
def test_negative_step(slice_test_grouped):
# Test for error on negative slice step
with pytest.raises(ValueError, match="Invalid step"):
slice_test_grouped.nth(slice(None, None, -1))
def test_np_ints(slice_test_df, slice_test_grouped):
# Test np ints work
result = slice_test_grouped.nth(np.array([0, 1]))
expected = slice_test_df.iloc[[0, 1, 2, 3, 4]]
tm.assert_frame_equal(result, expected)
def test_groupby_nth_with_column_axis():
# GH43926
df = DataFrame(
[
[4, 5, 6],
[8, 8, 7],
],
index=["z", "y"],
columns=["C", "B", "A"],
)
msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
gb = df.groupby(df.iloc[1], axis=1)
result = gb.nth(0)
expected = df.iloc[:, [0, 2]]
tm.assert_frame_equal(result, expected)
def test_groupby_nth_interval():
# GH#24205
idx_result = MultiIndex(
[
pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]),
pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]),
],
[[0, 0, 0, 1, 1], [0, 1, 1, 0, -1]],
)
df_result = DataFrame({"col": range(len(idx_result))}, index=idx_result)
result = df_result.groupby(level=[0, 1], observed=False).nth(0)
val_expected = [0, 1, 3]
idx_expected = MultiIndex(
[
pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]),
pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]),
],
[[0, 0, 1], [0, 1, 0]],
)
expected = DataFrame(val_expected, index=idx_expected, columns=["col"])
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"start, stop, expected_values, expected_columns",
[
(None, None, [0, 1, 2, 3, 4], list("ABCDE")),
(None, 1, [0, 3], list("AD")),
(None, 9, [0, 1, 2, 3, 4], list("ABCDE")),
(None, -1, [0, 1, 3], list("ABD")),
(1, None, [1, 2, 4], list("BCE")),
(1, -1, [1], list("B")),
(-1, None, [2, 4], list("CE")),
(-1, 2, [4], list("E")),
],
)
@pytest.mark.parametrize("method", ["call", "index"])
def test_nth_slices_with_column_axis(
start, stop, expected_values, expected_columns, method
):
df = DataFrame([range(5)], columns=[list("ABCDE")])
msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
gb = df.groupby([5, 5, 5, 6, 6], axis=1)
result = {
"call": lambda start, stop: gb.nth(slice(start, stop)),
"index": lambda start, stop: gb.nth[start:stop],
}[method](start, stop)
expected = DataFrame([expected_values], columns=[expected_columns])
tm.assert_frame_equal(result, expected)
@pytest.mark.filterwarnings(
"ignore:invalid value encountered in remainder:RuntimeWarning"
)
def test_head_tail_dropna_true():
# GH#45089
df = DataFrame(
[["a", "z"], ["b", np.nan], ["c", np.nan], ["c", np.nan]], columns=["X", "Y"]
)
expected = DataFrame([["a", "z"]], columns=["X", "Y"])
result = df.groupby(["X", "Y"]).head(n=1)
tm.assert_frame_equal(result, expected)
result = df.groupby(["X", "Y"]).tail(n=1)
tm.assert_frame_equal(result, expected)
result = df.groupby(["X", "Y"]).nth(n=0)
tm.assert_frame_equal(result, expected)
def test_head_tail_dropna_false():
# GH#45089
df = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"])
expected = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"])
result = df.groupby(["X", "Y"], dropna=False).head(n=1)
tm.assert_frame_equal(result, expected)
result = df.groupby(["X", "Y"], dropna=False).tail(n=1)
tm.assert_frame_equal(result, expected)
result = df.groupby(["X", "Y"], dropna=False).nth(n=0)
tm.assert_frame_equal(result, expected)
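# --- Illustrative sketch; not part of the original test file ---
# With the default dropna=True, rows whose key contains NaN belong to no
# group, so head/tail/nth never return them; dropna=False keeps them in a
# NaN-keyed group of their own, as the pair of tests above shows.
# _demo_head_dropna is a hypothetical helper.
def _demo_head_dropna():
    df = DataFrame({"X": ["a", "b"], "Y": ["z", np.nan]})
    assert len(df.groupby(["X", "Y"]).head(1)) == 1
    assert len(df.groupby(["X", "Y"], dropna=False).head(1)) == 2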
@pytest.mark.parametrize("selection", ("b", ["b"], ["b", "c"]))
@pytest.mark.parametrize("dropna", ["any", "all", None])
def test_nth_after_selection(selection, dropna):
# GH#11038, GH#53518
df = DataFrame(
{
"a": [1, 1, 2],
"b": [np.nan, 3, 4],
"c": [5, 6, 7],
}
)
gb = df.groupby("a")[selection]
result = gb.nth(0, dropna=dropna)
if dropna == "any" or (dropna == "all" and selection != ["b", "c"]):
locs = [1, 2]
else:
locs = [0, 2]
expected = df.loc[locs, selection]
tm.assert_equal(result, expected)
@pytest.mark.parametrize(
"data",
[
(
Timestamp("2011-01-15 12:50:28.502376"),
Timestamp("2011-01-20 12:50:28.593448"),
),
(24650000000000001, 24650000000000002),
],
)
def test_groupby_nth_int_like_precision(data):
# GH#6620, GH#9311
df = DataFrame({"a": [1, 1], "b": data})
grouped = df.groupby("a")
result = grouped.nth(0)
expected = DataFrame({"a": 1, "b": [data[0]]})
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,496 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm
@pytest.mark.parametrize(
"interpolation", ["linear", "lower", "higher", "nearest", "midpoint"]
)
@pytest.mark.parametrize(
"a_vals,b_vals",
[
# Ints
([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]),
([1, 2, 3, 4], [4, 3, 2, 1]),
([1, 2, 3, 4, 5], [4, 3, 2, 1]),
# Floats
([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 4.0, 3.0, 2.0, 1.0]),
# Missing data
([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]),
([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]),
# Timestamps
(
pd.date_range("1/1/18", freq="D", periods=5),
pd.date_range("1/1/18", freq="D", periods=5)[::-1],
),
(
pd.date_range("1/1/18", freq="D", periods=5).as_unit("s"),
pd.date_range("1/1/18", freq="D", periods=5)[::-1].as_unit("s"),
),
# All NA
([np.nan] * 5, [np.nan] * 5),
],
)
@pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1])
def test_quantile(interpolation, a_vals, b_vals, q, request):
if (
interpolation == "nearest"
and q == 0.5
and isinstance(b_vals, list)
and b_vals == [4, 3, 2, 1]
):
request.applymarker(
pytest.mark.xfail(
reason="Unclear numpy expectation for nearest "
"result with equidistant data"
)
)
all_vals = pd.concat([pd.Series(a_vals), pd.Series(b_vals)])
a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation)
b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation)
df = DataFrame({"key": ["a"] * len(a_vals) + ["b"] * len(b_vals), "val": all_vals})
expected = DataFrame(
[a_expected, b_expected], columns=["val"], index=Index(["a", "b"], name="key")
)
if all_vals.dtype.kind == "M" and expected.dtypes.values[0].kind == "M":
# TODO(non-nano): this should be unnecessary once array_to_datetime
# correctly infers non-nano from Timestamp.unit
expected = expected.astype(all_vals.dtype)
result = df.groupby("key").quantile(q, interpolation=interpolation)
tm.assert_frame_equal(result, expected)
def test_quantile_array():
# https://github.com/pandas-dev/pandas/issues/27526
df = DataFrame({"A": [0, 1, 2, 3, 4]})
key = np.array([0, 0, 1, 1, 1], dtype=np.int64)
result = df.groupby(key).quantile([0.25])
index = pd.MultiIndex.from_product([[0, 1], [0.25]])
expected = DataFrame({"A": [0.25, 2.50]}, index=index)
tm.assert_frame_equal(result, expected)
df = DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]})
index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]])
key = np.array([0, 0, 1, 1], dtype=np.int64)
result = df.groupby(key).quantile([0.25, 0.75])
expected = DataFrame(
{"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index
)
tm.assert_frame_equal(result, expected)
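# --- Illustrative sketch; not part of the original test file ---
# When q is list-like, groupby().quantile() returns one row per (group, q)
# pair, indexed by a MultiIndex whose innermost level holds the quantiles.
# _demo_quantile_array is a hypothetical helper.
def _demo_quantile_array():
    df = DataFrame({"g": [0, 0, 1, 1], "v": [1.0, 2.0, 3.0, 4.0]})
    out = df.groupby("g")["v"].quantile([0.0, 1.0])
    assert list(out.index) == [(0, 0.0), (0, 1.0), (1, 0.0), (1, 1.0)]
    assert out.tolist() == [1.0, 2.0, 3.0, 4.0]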
def test_quantile_array2():
# https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959
arr = np.random.default_rng(2).integers(0, 5, size=(10, 3), dtype=np.int64)
df = DataFrame(arr, columns=list("ABC"))
result = df.groupby("A").quantile([0.3, 0.7])
expected = DataFrame(
{
"B": [2.0, 2.0, 2.3, 2.7, 0.3, 0.7, 3.2, 4.0, 0.3, 0.7],
"C": [1.0, 1.0, 1.9, 3.0999999999999996, 0.3, 0.7, 2.6, 3.0, 1.2, 2.8],
},
index=pd.MultiIndex.from_product(
[[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None]
),
)
tm.assert_frame_equal(result, expected)
def test_quantile_array_no_sort():
df = DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]})
key = np.array([1, 0, 1], dtype=np.int64)
result = df.groupby(key, sort=False).quantile([0.25, 0.5, 0.75])
expected = DataFrame(
{"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]},
index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]),
)
tm.assert_frame_equal(result, expected)
result = df.groupby(key, sort=False).quantile([0.75, 0.25])
expected = DataFrame(
{"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]},
index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]),
)
tm.assert_frame_equal(result, expected)
def test_quantile_array_multiple_levels():
df = DataFrame(
{"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]}
)
result = df.groupby(["c", "d"]).quantile([0.25, 0.75])
index = pd.MultiIndex.from_tuples(
[("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)],
names=["c", "d", None],
)
expected = DataFrame(
{"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)])
@pytest.mark.parametrize("groupby", [[0], [0, 1]])
@pytest.mark.parametrize("q", [[0.5, 0.6]])
def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q):
# GH30289
nrow, ncol = frame_size
df = DataFrame(np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol))
idx_levels = [np.arange(min(nrow, 4))] * len(groupby) + [q]
idx_codes = [[x for x in range(min(nrow, 4)) for _ in q]] * len(groupby) + [
list(range(len(q))) * min(nrow, 4)
]
expected_index = pd.MultiIndex(
levels=idx_levels, codes=idx_codes, names=groupby + [None]
)
expected_values = [
[float(x)] * (ncol - len(groupby)) for x in range(min(nrow, 4)) for _ in q
]
expected_columns = [x for x in range(ncol) if x not in groupby]
expected = DataFrame(
expected_values, index=expected_index, columns=expected_columns
)
result = df.groupby(groupby).quantile(q)
tm.assert_frame_equal(result, expected)
def test_quantile_raises():
df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])
with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"):
df.groupby("key").quantile()
def test_quantile_out_of_bounds_q_raises():
# https://github.com/pandas-dev/pandas/issues/27470
df = DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)})
g = df.groupby([0, 0, 0, 1, 1, 1])
with pytest.raises(ValueError, match="Got '50.0' instead"):
g.quantile(50)
with pytest.raises(ValueError, match="Got '-1.0' instead"):
g.quantile(-1)
def test_quantile_missing_group_values_no_segfaults():
# GH 28662
data = np.array([1.0, np.nan, 1.0])
df = DataFrame({"key": data, "val": range(3)})
    # The segfault was intermittent; repeating the call makes it reproducible
grp = df.groupby("key")
for _ in range(100):
grp.quantile()
@pytest.mark.parametrize(
"key, val, expected_key, expected_val",
[
([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 2.0]),
([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]),
(["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]),
([0], [42], [0], [42.0]),
([], [], np.array([], dtype="float64"), np.array([], dtype="float64")),
],
)
def test_quantile_missing_group_values_correct_results(
key, val, expected_key, expected_val
):
# GH 28662, GH 33200, GH 33569
df = DataFrame({"key": key, "val": val})
expected = DataFrame(
expected_val, index=Index(expected_key, name="key"), columns=["val"]
)
grp = df.groupby("key")
result = grp.quantile(0.5)
tm.assert_frame_equal(result, expected)
result = grp.quantile()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"values",
[
pd.array([1, 0, None] * 2, dtype="Int64"),
pd.array([True, False, None] * 2, dtype="boolean"),
],
)
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
def test_groupby_quantile_nullable_array(values, q):
# https://github.com/pandas-dev/pandas/issues/33136
df = DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values})
result = df.groupby("a")["b"].quantile(q)
if isinstance(q, list):
idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None])
true_quantiles = [0.0, 0.5, 1.0]
else:
idx = Index(["x", "y"], name="a")
true_quantiles = [0.5]
expected = pd.Series(true_quantiles * 2, index=idx, name="b", dtype="Float64")
tm.assert_series_equal(result, expected)
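# --- Illustrative sketch; not part of the original test file ---
# Quantile on the nullable (masked) dtypes returns the masked "Float64"
# dtype rather than numpy float64, so pd.NA support is preserved, as the
# parametrized test above asserts. _demo_nullable_quantile is a
# hypothetical helper.
def _demo_nullable_quantile():
    ser = pd.Series([1, 2, None, 4], dtype="Int64")
    out = ser.groupby([0, 0, 1, 1]).quantile(0.5)
    assert out.dtype == "Float64"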
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
@pytest.mark.parametrize("numeric_only", [True, False])
def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]})
if numeric_only:
result = df.groupby("a").quantile(q, numeric_only=numeric_only)
expected = df.groupby("a")[["b"]].quantile(q)
tm.assert_frame_equal(result, expected)
else:
with pytest.raises(
TypeError, match="'quantile' cannot be performed against 'object' dtypes!"
):
df.groupby("a").quantile(q, numeric_only=numeric_only)
def test_groupby_quantile_NA_float(any_float_dtype):
# GH#42849
df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype)
result = df.groupby("x")["y"].quantile(0.5)
exp_index = Index([1.0], dtype=any_float_dtype, name="x")
if any_float_dtype in ["Float32", "Float64"]:
expected_dtype = any_float_dtype
else:
expected_dtype = None
expected = pd.Series([0.2], dtype=expected_dtype, index=exp_index, name="y")
tm.assert_series_equal(result, expected)
result = df.groupby("x")["y"].quantile([0.5, 0.75])
expected = pd.Series(
[0.2] * 2,
index=pd.MultiIndex.from_product((exp_index, [0.5, 0.75]), names=["x", None]),
name="y",
dtype=expected_dtype,
)
tm.assert_series_equal(result, expected)
def test_groupby_quantile_NA_int(any_int_ea_dtype):
# GH#42849
df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype)
result = df.groupby("x")["y"].quantile(0.5)
expected = pd.Series(
[3.5],
dtype="Float64",
index=Index([1], name="x", dtype=any_int_ea_dtype),
name="y",
)
tm.assert_series_equal(expected, result)
result = df.groupby("x").quantile(0.5)
expected = DataFrame(
{"y": 3.5}, dtype="Float64", index=Index([1], name="x", dtype=any_int_ea_dtype)
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"interpolation, val1, val2", [("lower", 2, 2), ("higher", 2, 3), ("nearest", 2, 2)]
)
def test_groupby_quantile_all_na_group_masked(
interpolation, val1, val2, any_numeric_ea_dtype
):
# GH#37493
df = DataFrame(
{"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype
)
result = df.groupby("a").quantile(q=[0.5, 0.7], interpolation=interpolation)
expected = DataFrame(
{"b": [val1, val2, pd.NA, pd.NA]},
dtype=any_numeric_ea_dtype,
index=pd.MultiIndex.from_arrays(
[pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype), [0.5, 0.7, 0.5, 0.7]],
names=["a", None],
),
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("interpolation", ["midpoint", "linear"])
def test_groupby_quantile_all_na_group_masked_interp(
interpolation, any_numeric_ea_dtype
):
# GH#37493
df = DataFrame(
{"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype
)
result = df.groupby("a").quantile(q=[0.5, 0.75], interpolation=interpolation)
if any_numeric_ea_dtype == "Float32":
expected_dtype = any_numeric_ea_dtype
else:
expected_dtype = "Float64"
expected = DataFrame(
{"b": [2.0, 2.5, pd.NA, pd.NA]},
dtype=expected_dtype,
index=pd.MultiIndex.from_arrays(
[
pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype),
[0.5, 0.75, 0.5, 0.75],
],
names=["a", None],
),
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("dtype", ["Float64", "Float32"])
def test_groupby_quantile_allNA_column(dtype):
# GH#42849
df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype)
result = df.groupby("x")["y"].quantile(0.5)
expected = pd.Series(
[np.nan], dtype=dtype, index=Index([1.0], dtype=dtype), name="y"
)
expected.index.name = "x"
tm.assert_series_equal(expected, result)
def test_groupby_timedelta_quantile():
# GH: 29485
df = DataFrame(
{"value": pd.to_timedelta(np.arange(4), unit="s"), "group": [1, 1, 2, 2]}
)
result = df.groupby("group").quantile(0.99)
expected = DataFrame(
{
"value": [
pd.Timedelta("0 days 00:00:00.990000"),
pd.Timedelta("0 days 00:00:02.990000"),
]
},
index=Index([1, 2], name="group"),
)
tm.assert_frame_equal(result, expected)
def test_columns_groupby_quantile():
# GH 33795
df = DataFrame(
np.arange(12).reshape(3, -1),
index=list("XYZ"),
columns=pd.Series(list("ABAB"), name="col"),
)
msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
gb = df.groupby("col", axis=1)
result = gb.quantile(q=[0.8, 0.2])
expected = DataFrame(
[
[1.6, 0.4, 2.6, 1.4],
[5.6, 4.4, 6.6, 5.4],
[9.6, 8.4, 10.6, 9.4],
],
index=list("XYZ"),
columns=pd.MultiIndex.from_tuples(
[("A", 0.8), ("A", 0.2), ("B", 0.8), ("B", 0.2)], names=["col", None]
),
)
tm.assert_frame_equal(result, expected)
def test_timestamp_groupby_quantile(unit):
# GH 33168
dti = pd.date_range(
start="2020-04-19 00:00:00", freq="1min", periods=100, tz="UTC", unit=unit
).floor("1h")
df = DataFrame(
{
"timestamp": dti,
"category": list(range(1, 101)),
"value": list(range(101, 201)),
}
)
result = df.groupby("timestamp").quantile([0.2, 0.8])
mi = pd.MultiIndex.from_product([dti[::99], [0.2, 0.8]], names=("timestamp", None))
expected = DataFrame(
[
{"category": 12.8, "value": 112.8},
{"category": 48.2, "value": 148.2},
{"category": 68.8, "value": 168.8},
{"category": 92.2, "value": 192.2},
],
index=mi,
)
tm.assert_frame_equal(result, expected)
def test_groupby_quantile_dt64tz_period():
# GH#51373
dti = pd.date_range("2016-01-01", periods=1000)
df = pd.Series(dti).to_frame().copy()
df[1] = dti.tz_localize("US/Pacific")
df[2] = dti.to_period("D")
df[3] = dti - dti[0]
df.iloc[-1] = pd.NaT
by = np.tile(np.arange(5), 200)
gb = df.groupby(by)
result = gb.quantile(0.5)
# Check that we match the group-by-group result
exp = {i: df.iloc[i::5].quantile(0.5) for i in range(5)}
expected = DataFrame(exp).T.infer_objects()
expected.index = expected.index.astype(int)
tm.assert_frame_equal(result, expected)
def test_groupby_quantile_nonmulti_levels_order():
# Non-regression test for GH #53009
ind = pd.MultiIndex.from_tuples(
[
(0, "a", "B"),
(0, "a", "A"),
(0, "b", "B"),
(0, "b", "A"),
(1, "a", "B"),
(1, "a", "A"),
(1, "b", "B"),
(1, "b", "A"),
],
names=["sample", "cat0", "cat1"],
)
ser = pd.Series(range(8), index=ind)
result = ser.groupby(level="cat1", sort=False).quantile([0.2, 0.8])
qind = pd.MultiIndex.from_tuples(
[("B", 0.2), ("B", 0.8), ("A", 0.2), ("A", 0.8)], names=["cat1", None]
)
expected = pd.Series([1.2, 4.8, 2.2, 5.8], index=qind)
tm.assert_series_equal(result, expected)
# We need to check that index levels are not sorted
expected_levels = pd.core.indexes.frozen.FrozenList([["B", "A"], [0.2, 0.8]])
tm.assert_equal(result.index.levels, expected_levels)

View File

@ -0,0 +1,721 @@
from datetime import datetime
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
NaT,
Series,
concat,
)
import pandas._testing as tm
def test_rank_unordered_categorical_typeerror():
# GH#51034 should be TypeError, not NotImplementedError
cat = pd.Categorical([], ordered=False)
ser = Series(cat)
df = ser.to_frame()
msg = "Cannot perform rank with non-ordered Categorical"
gb = ser.groupby(cat, observed=False)
with pytest.raises(TypeError, match=msg):
gb.rank()
gb2 = df.groupby(cat, observed=False)
with pytest.raises(TypeError, match=msg):
gb2.rank()
def test_rank_apply():
lev1 = np.array(["a" * 10] * 100, dtype=object)
lev2 = np.array(["b" * 10] * 130, dtype=object)
lab1 = np.random.default_rng(2).integers(0, 100, size=500, dtype=int)
lab2 = np.random.default_rng(2).integers(0, 130, size=500, dtype=int)
df = DataFrame(
{
"value": np.random.default_rng(2).standard_normal(500),
"key1": lev1.take(lab1),
"key2": lev2.take(lab2),
}
)
result = df.groupby(["key1", "key2"]).value.rank()
expected = [piece.value.rank() for key, piece in df.groupby(["key1", "key2"])]
expected = concat(expected, axis=0)
expected = expected.reindex(result.index)
tm.assert_series_equal(result, expected)
result = df.groupby(["key1", "key2"]).value.rank(pct=True)
expected = [
piece.value.rank(pct=True) for key, piece in df.groupby(["key1", "key2"])
]
expected = concat(expected, axis=0)
expected = expected.reindex(result.index)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
"vals",
[
np.array([2, 2, 8, 2, 6], dtype=dtype)
for dtype in ["i8", "i4", "i2", "i1", "u8", "u4", "u2", "u1", "f8", "f4", "f2"]
]
+ [
[
pd.Timestamp("2018-01-02"),
pd.Timestamp("2018-01-02"),
pd.Timestamp("2018-01-08"),
pd.Timestamp("2018-01-02"),
pd.Timestamp("2018-01-06"),
],
[
pd.Timestamp("2018-01-02", tz="US/Pacific"),
pd.Timestamp("2018-01-02", tz="US/Pacific"),
pd.Timestamp("2018-01-08", tz="US/Pacific"),
pd.Timestamp("2018-01-02", tz="US/Pacific"),
pd.Timestamp("2018-01-06", tz="US/Pacific"),
],
[
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
pd.Timestamp("2018-01-08") - pd.Timestamp(0),
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
pd.Timestamp("2018-01-06") - pd.Timestamp(0),
],
[
pd.Timestamp("2018-01-02").to_period("D"),
pd.Timestamp("2018-01-02").to_period("D"),
pd.Timestamp("2018-01-08").to_period("D"),
pd.Timestamp("2018-01-02").to_period("D"),
pd.Timestamp("2018-01-06").to_period("D"),
],
],
ids=lambda x: type(x[0]),
)
@pytest.mark.parametrize(
"ties_method,ascending,pct,exp",
[
("average", True, False, [2.0, 2.0, 5.0, 2.0, 4.0]),
("average", True, True, [0.4, 0.4, 1.0, 0.4, 0.8]),
("average", False, False, [4.0, 4.0, 1.0, 4.0, 2.0]),
("average", False, True, [0.8, 0.8, 0.2, 0.8, 0.4]),
("min", True, False, [1.0, 1.0, 5.0, 1.0, 4.0]),
("min", True, True, [0.2, 0.2, 1.0, 0.2, 0.8]),
("min", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
("min", False, True, [0.6, 0.6, 0.2, 0.6, 0.4]),
("max", True, False, [3.0, 3.0, 5.0, 3.0, 4.0]),
("max", True, True, [0.6, 0.6, 1.0, 0.6, 0.8]),
("max", False, False, [5.0, 5.0, 1.0, 5.0, 2.0]),
("max", False, True, [1.0, 1.0, 0.2, 1.0, 0.4]),
("first", True, False, [1.0, 2.0, 5.0, 3.0, 4.0]),
("first", True, True, [0.2, 0.4, 1.0, 0.6, 0.8]),
("first", False, False, [3.0, 4.0, 1.0, 5.0, 2.0]),
("first", False, True, [0.6, 0.8, 0.2, 1.0, 0.4]),
("dense", True, False, [1.0, 1.0, 3.0, 1.0, 2.0]),
("dense", True, True, [1.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0]),
("dense", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
("dense", False, True, [3.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0]),
],
)
def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
key = np.repeat(grps, len(vals))
orig_vals = vals
vals = list(vals) * len(grps)
if isinstance(orig_vals, np.ndarray):
vals = np.array(vals, dtype=orig_vals.dtype)
df = DataFrame({"key": key, "val": vals})
result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct)
exp_df = DataFrame(exp * len(grps), columns=["val"])
tm.assert_frame_equal(result, exp_df)
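# --- Illustrative sketch; not part of the original test file ---
# groupby().rank() ranks within each group independently; the default
# method="average" gives ties the mean of their positions, and pct=True
# divides by the group size, not the frame length. _demo_groupby_rank is a
# hypothetical helper.
def _demo_groupby_rank():
    df = DataFrame({"key": ["a", "a", "b", "b"], "val": [2, 2, 1, 3]})
    out = df.groupby("key")["val"].rank()
    # group "a": tied at positions 1 and 2 -> 1.5 each; group "b": 1.0, 2.0
    assert out.tolist() == [1.5, 1.5, 1.0, 2.0]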
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
"vals", [[-np.inf, -np.inf, np.nan, 1.0, np.nan, np.inf, np.inf]]
)
@pytest.mark.parametrize(
"ties_method,ascending,na_option,exp",
[
("average", True, "keep", [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]),
("average", True, "top", [3.5, 3.5, 1.5, 5.0, 1.5, 6.5, 6.5]),
("average", True, "bottom", [1.5, 1.5, 6.5, 3.0, 6.5, 4.5, 4.5]),
("average", False, "keep", [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]),
("average", False, "top", [6.5, 6.5, 1.5, 5.0, 1.5, 3.5, 3.5]),
("average", False, "bottom", [4.5, 4.5, 6.5, 3.0, 6.5, 1.5, 1.5]),
("min", True, "keep", [1.0, 1.0, np.nan, 3.0, np.nan, 4.0, 4.0]),
("min", True, "top", [3.0, 3.0, 1.0, 5.0, 1.0, 6.0, 6.0]),
("min", True, "bottom", [1.0, 1.0, 6.0, 3.0, 6.0, 4.0, 4.0]),
("min", False, "keep", [4.0, 4.0, np.nan, 3.0, np.nan, 1.0, 1.0]),
("min", False, "top", [6.0, 6.0, 1.0, 5.0, 1.0, 3.0, 3.0]),
("min", False, "bottom", [4.0, 4.0, 6.0, 3.0, 6.0, 1.0, 1.0]),
("max", True, "keep", [2.0, 2.0, np.nan, 3.0, np.nan, 5.0, 5.0]),
("max", True, "top", [4.0, 4.0, 2.0, 5.0, 2.0, 7.0, 7.0]),
("max", True, "bottom", [2.0, 2.0, 7.0, 3.0, 7.0, 5.0, 5.0]),
("max", False, "keep", [5.0, 5.0, np.nan, 3.0, np.nan, 2.0, 2.0]),
("max", False, "top", [7.0, 7.0, 2.0, 5.0, 2.0, 4.0, 4.0]),
("max", False, "bottom", [5.0, 5.0, 7.0, 3.0, 7.0, 2.0, 2.0]),
("first", True, "keep", [1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0]),
("first", True, "top", [3.0, 4.0, 1.0, 5.0, 2.0, 6.0, 7.0]),
("first", True, "bottom", [1.0, 2.0, 6.0, 3.0, 7.0, 4.0, 5.0]),
("first", False, "keep", [4.0, 5.0, np.nan, 3.0, np.nan, 1.0, 2.0]),
("first", False, "top", [6.0, 7.0, 1.0, 5.0, 2.0, 3.0, 4.0]),
("first", False, "bottom", [4.0, 5.0, 6.0, 3.0, 7.0, 1.0, 2.0]),
("dense", True, "keep", [1.0, 1.0, np.nan, 2.0, np.nan, 3.0, 3.0]),
("dense", True, "top", [2.0, 2.0, 1.0, 3.0, 1.0, 4.0, 4.0]),
("dense", True, "bottom", [1.0, 1.0, 4.0, 2.0, 4.0, 3.0, 3.0]),
("dense", False, "keep", [3.0, 3.0, np.nan, 2.0, np.nan, 1.0, 1.0]),
("dense", False, "top", [4.0, 4.0, 1.0, 3.0, 1.0, 2.0, 2.0]),
("dense", False, "bottom", [3.0, 3.0, 4.0, 2.0, 4.0, 1.0, 1.0]),
],
)
def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
# GH 20561
key = np.repeat(grps, len(vals))
vals = vals * len(grps)
df = DataFrame({"key": key, "val": vals})
result = df.groupby("key").rank(
method=ties_method, ascending=ascending, na_option=na_option
)
exp_df = DataFrame(exp * len(grps), columns=["val"])
tm.assert_frame_equal(result, exp_df)
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
"vals",
[
np.array([2, 2, np.nan, 8, 2, 6, np.nan, np.nan], dtype=dtype)
for dtype in ["f8", "f4", "f2"]
]
+ [
[
pd.Timestamp("2018-01-02"),
pd.Timestamp("2018-01-02"),
np.nan,
pd.Timestamp("2018-01-08"),
pd.Timestamp("2018-01-02"),
pd.Timestamp("2018-01-06"),
np.nan,
np.nan,
],
[
pd.Timestamp("2018-01-02", tz="US/Pacific"),
pd.Timestamp("2018-01-02", tz="US/Pacific"),
np.nan,
pd.Timestamp("2018-01-08", tz="US/Pacific"),
pd.Timestamp("2018-01-02", tz="US/Pacific"),
pd.Timestamp("2018-01-06", tz="US/Pacific"),
np.nan,
np.nan,
],
[
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
np.nan,
pd.Timestamp("2018-01-08") - pd.Timestamp(0),
pd.Timestamp("2018-01-02") - pd.Timestamp(0),
pd.Timestamp("2018-01-06") - pd.Timestamp(0),
np.nan,
np.nan,
],
[
pd.Timestamp("2018-01-02").to_period("D"),
pd.Timestamp("2018-01-02").to_period("D"),
np.nan,
pd.Timestamp("2018-01-08").to_period("D"),
pd.Timestamp("2018-01-02").to_period("D"),
pd.Timestamp("2018-01-06").to_period("D"),
np.nan,
np.nan,
],
],
ids=lambda x: type(x[0]),
)
@pytest.mark.parametrize(
"ties_method,ascending,na_option,pct,exp",
[
(
"average",
True,
"keep",
False,
[2.0, 2.0, np.nan, 5.0, 2.0, 4.0, np.nan, np.nan],
),
(
"average",
True,
"keep",
True,
[0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan],
),
(
"average",
False,
"keep",
False,
[4.0, 4.0, np.nan, 1.0, 4.0, 2.0, np.nan, np.nan],
),
(
"average",
False,
"keep",
True,
[0.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan],
),
("min", True, "keep", False, [1.0, 1.0, np.nan, 5.0, 1.0, 4.0, np.nan, np.nan]),
("min", True, "keep", True, [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]),
(
"min",
False,
"keep",
False,
[3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
),
("min", False, "keep", True, [0.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
("max", True, "keep", False, [3.0, 3.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan]),
("max", True, "keep", True, [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
(
"max",
False,
"keep",
False,
[5.0, 5.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
),
("max", False, "keep", True, [1.0, 1.0, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan]),
(
"first",
True,
"keep",
False,
[1.0, 2.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan],
),
(
"first",
True,
"keep",
True,
[0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan],
),
(
"first",
False,
"keep",
False,
[3.0, 4.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
),
(
"first",
False,
"keep",
True,
[0.6, 0.8, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan],
),
(
"dense",
True,
"keep",
False,
[1.0, 1.0, np.nan, 3.0, 1.0, 2.0, np.nan, np.nan],
),
(
"dense",
True,
"keep",
True,
[
1.0 / 3.0,
1.0 / 3.0,
np.nan,
3.0 / 3.0,
1.0 / 3.0,
2.0 / 3.0,
np.nan,
np.nan,
],
),
(
"dense",
False,
"keep",
False,
[3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
),
(
"dense",
False,
"keep",
True,
[
3.0 / 3.0,
3.0 / 3.0,
np.nan,
1.0 / 3.0,
3.0 / 3.0,
2.0 / 3.0,
np.nan,
np.nan,
],
),
("average", True, "bottom", False, [2.0, 2.0, 7.0, 5.0, 2.0, 4.0, 7.0, 7.0]),
(
"average",
True,
"bottom",
True,
[0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875],
),
("average", False, "bottom", False, [4.0, 4.0, 7.0, 1.0, 4.0, 2.0, 7.0, 7.0]),
(
"average",
False,
"bottom",
True,
[0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875],
),
("min", True, "bottom", False, [1.0, 1.0, 6.0, 5.0, 1.0, 4.0, 6.0, 6.0]),
(
"min",
True,
"bottom",
True,
[0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75],
),
("min", False, "bottom", False, [3.0, 3.0, 6.0, 1.0, 3.0, 2.0, 6.0, 6.0]),
(
"min",
False,
"bottom",
True,
[0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75],
),
("max", True, "bottom", False, [3.0, 3.0, 8.0, 5.0, 3.0, 4.0, 8.0, 8.0]),
("max", True, "bottom", True, [0.375, 0.375, 1.0, 0.625, 0.375, 0.5, 1.0, 1.0]),
("max", False, "bottom", False, [5.0, 5.0, 8.0, 1.0, 5.0, 2.0, 8.0, 8.0]),
(
"max",
False,
"bottom",
True,
[0.625, 0.625, 1.0, 0.125, 0.625, 0.25, 1.0, 1.0],
),
("first", True, "bottom", False, [1.0, 2.0, 6.0, 5.0, 3.0, 4.0, 7.0, 8.0]),
(
"first",
True,
"bottom",
True,
[0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.0],
),
("first", False, "bottom", False, [3.0, 4.0, 6.0, 1.0, 5.0, 2.0, 7.0, 8.0]),
(
"first",
False,
"bottom",
True,
[0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.0],
),
("dense", True, "bottom", False, [1.0, 1.0, 4.0, 3.0, 1.0, 2.0, 4.0, 4.0]),
("dense", True, "bottom", True, [0.25, 0.25, 1.0, 0.75, 0.25, 0.5, 1.0, 1.0]),
("dense", False, "bottom", False, [3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 4.0, 4.0]),
("dense", False, "bottom", True, [0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 1.0, 1.0]),
],
)
def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp):
key = np.repeat(grps, len(vals))
orig_vals = vals
vals = list(vals) * len(grps)
if isinstance(orig_vals, np.ndarray):
vals = np.array(vals, dtype=orig_vals.dtype)
df = DataFrame({"key": key, "val": vals})
result = df.groupby("key").rank(
method=ties_method, ascending=ascending, na_option=na_option, pct=pct
)
exp_df = DataFrame(exp * len(grps), columns=["val"])
tm.assert_frame_equal(result, exp_df)
@pytest.mark.parametrize(
"pct,exp", [(False, [3.0, 3.0, 3.0, 3.0, 3.0]), (True, [0.6, 0.6, 0.6, 0.6, 0.6])]
)
def test_rank_resets_each_group(pct, exp):
df = DataFrame(
{"key": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"], "val": [1] * 10}
)
result = df.groupby("key").rank(pct=pct)
exp_df = DataFrame(exp * 2, columns=["val"])
tm.assert_frame_equal(result, exp_df)
@pytest.mark.parametrize(
"dtype", ["int64", "int32", "uint64", "uint32", "float64", "float32"]
)
@pytest.mark.parametrize("upper", [True, False])
def test_rank_avg_even_vals(dtype, upper):
if upper:
# use IntegerDtype/FloatingDtype
dtype = dtype[0].upper() + dtype[1:]
dtype = dtype.replace("Ui", "UI")
df = DataFrame({"key": ["a"] * 4, "val": [1] * 4})
df["val"] = df["val"].astype(dtype)
assert df["val"].dtype == dtype
result = df.groupby("key").rank()
exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=["val"])
if upper:
exp_df = exp_df.astype("Float64")
tm.assert_frame_equal(result, exp_df)
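# --- Illustrative note (added for exposition): with four identical values the
# "average" method assigns each the mean of ranks 1 through 4, i.e.
# (1 + 2 + 3 + 4) / 4 = 2.5, which is exactly what exp_df above encodes.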
@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
"vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]]
)
def test_rank_object_dtype(ties_method, ascending, na_option, pct, vals):
df = DataFrame({"key": ["foo"] * 5, "val": vals})
mask = df["val"].isna()
gb = df.groupby("key")
res = gb.rank(method=ties_method, ascending=ascending, na_option=na_option, pct=pct)
# construct our expected by using numeric values with the same ordering
if mask.any():
df2 = DataFrame({"key": ["foo"] * 5, "val": [0, np.nan, 2, np.nan, 1]})
else:
df2 = DataFrame({"key": ["foo"] * 5, "val": [0, 0, 2, 0, 1]})
gb2 = df2.groupby("key")
alt = gb2.rank(
method=ties_method, ascending=ascending, na_option=na_option, pct=pct
)
tm.assert_frame_equal(res, alt)
@pytest.mark.parametrize("na_option", [True, "bad", 1])
@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
"vals",
[
["bar", "bar", "foo", "bar", "baz"],
["bar", np.nan, "foo", np.nan, "baz"],
[1, np.nan, 2, np.nan, 3],
],
)
def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals):
df = DataFrame({"key": ["foo"] * 5, "val": vals})
msg = "na_option must be one of 'keep', 'top', or 'bottom'"
with pytest.raises(ValueError, match=msg):
df.groupby("key").rank(
method=ties_method, ascending=ascending, na_option=na_option, pct=pct
)
def test_rank_empty_group():
# see gh-22519
column = "A"
df = DataFrame({"A": [0, 1, 0], "B": [1.0, np.nan, 2.0]})
result = df.groupby(column).B.rank(pct=True)
expected = Series([0.5, np.nan, 1.0], name="B")
tm.assert_series_equal(result, expected)
result = df.groupby(column).rank(pct=True)
expected = DataFrame({"B": [0.5, np.nan, 1.0]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"input_key,input_value,output_value",
[
([1, 2], [1, 1], [1.0, 1.0]),
([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]),
([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]),
([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]),
],
)
def test_rank_zero_div(input_key, input_value, output_value):
# GH 23666
df = DataFrame({"A": input_key, "B": input_value})
result = df.groupby("A").rank(method="dense", pct=True)
expected = DataFrame({"B": output_value})
tm.assert_frame_equal(result, expected)
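# --- Illustrative sketch (added for exposition): with method="dense" and
# pct=True the denominator is the number of *distinct* values in the group, so
# a group holding a single repeated value ranks as 1.0 instead of dividing by
# zero (the GH 23666 regression exercised above).
def _example_dense_pct_single_value():
    df = DataFrame({"A": [1, 1], "B": [7, 7]})
    # one distinct value in the group -> dense rank 1 of 1 -> pct 1.0
    return df.groupby("A").rank(method="dense", pct=True)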
def test_rank_min_int():
# GH-32859
df = DataFrame(
{
"grp": [1, 1, 2],
"int_col": [
np.iinfo(np.int64).min,
np.iinfo(np.int64).max,
np.iinfo(np.int64).min,
],
"datetimelike": [NaT, datetime(2001, 1, 1), NaT],
}
)
result = df.groupby("grp").rank()
expected = DataFrame(
{"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.nan, 1.0, np.nan]}
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("use_nan", [True, False])
def test_rank_pct_equal_values_on_group_transition(use_nan):
# GH#40518
fill_value = np.nan if use_nan else 3
df = DataFrame(
[
[-1, 1],
[-1, 2],
[1, fill_value],
[-1, fill_value],
],
columns=["group", "val"],
)
result = df.groupby(["group"])["val"].rank(
method="dense",
pct=True,
)
if use_nan:
expected = Series([0.5, 1, np.nan, np.nan], name="val")
else:
expected = Series([1 / 3, 2 / 3, 1, 1], name="val")
tm.assert_series_equal(result, expected)
def test_rank_multiindex():
# GH27721
df = concat(
{
"a": DataFrame({"col1": [3, 4], "col2": [1, 2]}),
"b": DataFrame({"col3": [5, 6], "col4": [7, 8]}),
},
axis=1,
)
msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
gb = df.groupby(level=0, axis=1)
msg = "DataFrameGroupBy.rank with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = gb.rank(axis=1)
expected = concat(
[
df["a"].rank(axis=1),
df["b"].rank(axis=1),
],
axis=1,
keys=["a", "b"],
)
tm.assert_frame_equal(result, expected)
def test_groupby_axis0_rank_axis1():
# GH#41320
df = DataFrame(
{0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]},
index=["a", "a", "b", "b"],
)
msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
gb = df.groupby(level=0, axis=0)
msg = "DataFrameGroupBy.rank with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
res = gb.rank(axis=1)
# This should match what we get when "manually" operating group-by-group
expected = concat([df.loc["a"].rank(axis=1), df.loc["b"].rank(axis=1)], axis=0)
tm.assert_frame_equal(res, expected)
# check that we haven't accidentally written a case that coincidentally
# matches rank(axis=0)
msg = "The 'axis' keyword in DataFrameGroupBy.rank"
with tm.assert_produces_warning(FutureWarning, match=msg):
alt = gb.rank(axis=0)
assert not alt.equals(expected)
def test_groupby_axis0_cummax_axis1():
# case where groupby axis is 0 and axis keyword in transform is 1
# df has mixed dtype -> multiple blocks
df = DataFrame(
{0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]},
index=["a", "a", "b", "b"],
)
msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
gb = df.groupby(level=0, axis=0)
msg = "DataFrameGroupBy.cummax with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
cmax = gb.cummax(axis=1)
expected = df[[0, 1]].astype(np.float64)
expected[2] = expected[1]
tm.assert_frame_equal(cmax, expected)
def test_non_unique_index():
# GH 16577
df = DataFrame(
{"A": [1.0, 2.0, 3.0, np.nan], "value": 1.0},
index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
)
result = df.groupby([df.index, "A"]).value.rank(ascending=True, pct=True)
expected = Series(
[1.0, 1.0, 1.0, np.nan],
index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
name="value",
)
tm.assert_series_equal(result, expected)
def test_rank_categorical():
cat = pd.Categorical(["a", "a", "b", np.nan, "c", "b"], ordered=True)
cat2 = pd.Categorical([1, 2, 3, np.nan, 4, 5], ordered=True)
df = DataFrame({"col1": [0, 1, 0, 1, 0, 1], "col2": cat, "col3": cat2})
gb = df.groupby("col1")
res = gb.rank()
expected = df.astype(object).groupby("col1").rank()
tm.assert_frame_equal(res, expected)
@pytest.mark.parametrize("na_option", ["top", "bottom"])
def test_groupby_op_with_nullables(na_option):
# GH 54206
df = DataFrame({"x": [None]}, dtype="Float64")
result = df.groupby("x", dropna=False)["x"].rank(method="min", na_option=na_option)
expected = Series([1.0], dtype="Float64", name=result.name)
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,154 @@
import pytest
from pandas import (
DataFrame,
Index,
Series,
)
import pandas._testing as tm
@pytest.mark.parametrize("n, frac", [(2, None), (None, 0.2)])
def test_groupby_sample_balanced_groups_shape(n, frac):
values = [1] * 10 + [2] * 10
df = DataFrame({"a": values, "b": values})
result = df.groupby("a").sample(n=n, frac=frac)
values = [1] * 2 + [2] * 2
expected = DataFrame({"a": values, "b": values}, index=result.index)
tm.assert_frame_equal(result, expected)
result = df.groupby("a")["b"].sample(n=n, frac=frac)
expected = Series(values, name="b", index=result.index)
tm.assert_series_equal(result, expected)
def test_groupby_sample_unbalanced_groups_shape():
values = [1] * 10 + [2] * 20
df = DataFrame({"a": values, "b": values})
result = df.groupby("a").sample(n=5)
values = [1] * 5 + [2] * 5
expected = DataFrame({"a": values, "b": values}, index=result.index)
tm.assert_frame_equal(result, expected)
result = df.groupby("a")["b"].sample(n=5)
expected = Series(values, name="b", index=result.index)
tm.assert_series_equal(result, expected)
def test_groupby_sample_index_value_spans_groups():
values = [1] * 3 + [2] * 3
df = DataFrame({"a": values, "b": values}, index=[1, 2, 2, 2, 2, 2])
result = df.groupby("a").sample(n=2)
values = [1] * 2 + [2] * 2
expected = DataFrame({"a": values, "b": values}, index=result.index)
tm.assert_frame_equal(result, expected)
result = df.groupby("a")["b"].sample(n=2)
expected = Series(values, name="b", index=result.index)
tm.assert_series_equal(result, expected)
def test_groupby_sample_n_and_frac_raises():
df = DataFrame({"a": [1, 2], "b": [1, 2]})
msg = "Please enter a value for `frac` OR `n`, not both"
with pytest.raises(ValueError, match=msg):
df.groupby("a").sample(n=1, frac=1.0)
with pytest.raises(ValueError, match=msg):
df.groupby("a")["b"].sample(n=1, frac=1.0)
def test_groupby_sample_frac_gt_one_without_replacement_raises():
df = DataFrame({"a": [1, 2], "b": [1, 2]})
msg = "Replace has to be set to `True` when upsampling the population `frac` > 1."
with pytest.raises(ValueError, match=msg):
df.groupby("a").sample(frac=1.5, replace=False)
with pytest.raises(ValueError, match=msg):
df.groupby("a")["b"].sample(frac=1.5, replace=False)
@pytest.mark.parametrize("n", [-1, 1.5])
def test_groupby_sample_invalid_n_raises(n):
df = DataFrame({"a": [1, 2], "b": [1, 2]})
if n < 0:
msg = "A negative number of rows requested. Please provide `n` >= 0."
else:
msg = "Only integers accepted as `n` values"
with pytest.raises(ValueError, match=msg):
df.groupby("a").sample(n=n)
with pytest.raises(ValueError, match=msg):
df.groupby("a")["b"].sample(n=n)
def test_groupby_sample_oversample():
values = [1] * 10 + [2] * 10
df = DataFrame({"a": values, "b": values})
result = df.groupby("a").sample(frac=2.0, replace=True)
values = [1] * 20 + [2] * 20
expected = DataFrame({"a": values, "b": values}, index=result.index)
tm.assert_frame_equal(result, expected)
result = df.groupby("a")["b"].sample(frac=2.0, replace=True)
expected = Series(values, name="b", index=result.index)
tm.assert_series_equal(result, expected)
def test_groupby_sample_without_n_or_frac():
values = [1] * 10 + [2] * 10
df = DataFrame({"a": values, "b": values})
result = df.groupby("a").sample(n=None, frac=None)
expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=result.index)
tm.assert_frame_equal(result, expected)
result = df.groupby("a")["b"].sample(n=None, frac=None)
expected = Series([1, 2], name="b", index=result.index)
tm.assert_series_equal(result, expected)
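# --- Illustrative sketch (added for exposition): with neither ``n`` nor
# ``frac`` supplied, GroupBy.sample draws a single row per group, which is why
# the expected objects above have exactly one row per group.
def _example_sample_default():
    df = DataFrame({"a": [1, 1, 2, 2], "b": range(4)})
    out = df.groupby("a").sample(random_state=0)
    assert len(out) == 2  # one row from each of the two groups
    return out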
@pytest.mark.parametrize(
"index, expected_index",
[(["w", "x", "y", "z"], ["w", "w", "y", "y"]), ([3, 4, 5, 6], [3, 3, 5, 5])],
)
def test_groupby_sample_with_weights(index, expected_index):
# GH 39927 - tests for integer index needed
values = [1] * 2 + [2] * 2
df = DataFrame({"a": values, "b": values}, index=Index(index))
result = df.groupby("a").sample(n=2, replace=True, weights=[1, 0, 1, 0])
expected = DataFrame({"a": values, "b": values}, index=Index(expected_index))
tm.assert_frame_equal(result, expected)
result = df.groupby("a")["b"].sample(n=2, replace=True, weights=[1, 0, 1, 0])
expected = Series(values, name="b", index=Index(expected_index))
tm.assert_series_equal(result, expected)
def test_groupby_sample_with_selections():
# GH 39928
values = [1] * 10 + [2] * 10
df = DataFrame({"a": values, "b": values, "c": values})
result = df.groupby("a")[["b", "c"]].sample(n=None, frac=None)
expected = DataFrame({"b": [1, 2], "c": [1, 2]}, index=result.index)
tm.assert_frame_equal(result, expected)
def test_groupby_sample_with_empty_inputs():
# GH48459
df = DataFrame({"a": [], "b": []})
groupby_df = df.groupby("a")
result = groupby_df.sample()
expected = df
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,130 @@
import numpy as np
import pytest
import pandas.util._test_decorators as td
from pandas.core.dtypes.common import is_integer_dtype
from pandas import (
DataFrame,
Index,
PeriodIndex,
Series,
)
import pandas._testing as tm
@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
def test_size(df, by):
grouped = df.groupby(by=by)
result = grouped.size()
for key, group in grouped:
assert result[key] == len(group)
@pytest.mark.parametrize(
"by",
[
[0, 0, 0, 0],
[0, 1, 1, 1],
[1, 0, 1, 1],
[0, None, None, None],
pytest.param([None, None, None, None], marks=pytest.mark.xfail),
],
)
def test_size_axis_1(df, axis_1, by, sort, dropna):
# GH#45715
counts = {key: sum(value == key for value in by) for key in dict.fromkeys(by)}
if dropna:
counts = {key: value for key, value in counts.items() if key is not None}
expected = Series(counts, dtype="int64")
if sort:
expected = expected.sort_index()
if is_integer_dtype(expected.index.dtype) and not any(x is None for x in by):
expected.index = expected.index.astype(int)
msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
grouped = df.groupby(by=by, axis=axis_1, sort=sort, dropna=dropna)
result = grouped.size()
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
@pytest.mark.parametrize("sort", [True, False])
def test_size_sort(sort, by):
df = DataFrame(np.random.default_rng(2).choice(20, (1000, 3)), columns=list("ABC"))
left = df.groupby(by=by, sort=sort).size()
right = df.groupby(by=by, sort=sort)["C"].apply(lambda a: a.shape[0])
tm.assert_series_equal(left, right, check_names=False)
def test_size_series_dataframe():
# https://github.com/pandas-dev/pandas/issues/11699
df = DataFrame(columns=["A", "B"])
out = Series(dtype="int64", index=Index([], name="A"))
tm.assert_series_equal(df.groupby("A").size(), out)
def test_size_groupby_all_null():
# https://github.com/pandas-dev/pandas/issues/23050
# Assert no 'ValueError: Length of passed values is 2, index implies 0'
df = DataFrame({"A": [None, None]}) # all-null groups
result = df.groupby("A").size()
expected = Series(dtype="int64", index=Index([], name="A"))
tm.assert_series_equal(result, expected)
def test_size_period_index():
# https://github.com/pandas-dev/pandas/issues/34010
ser = Series([1], index=PeriodIndex(["2000"], name="A", freq="D"))
grp = ser.groupby(level="A")
result = grp.size()
tm.assert_series_equal(result, ser)
@pytest.mark.parametrize("as_index", [True, False])
def test_size_on_categorical(as_index):
df = DataFrame([[1, 1], [2, 2]], columns=["A", "B"])
df["A"] = df["A"].astype("category")
result = df.groupby(["A", "B"], as_index=as_index, observed=False).size()
expected = DataFrame(
[[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"]
)
expected["A"] = expected["A"].astype("category")
if as_index:
expected = expected.set_index(["A", "B"])["size"].rename(None)
tm.assert_equal(result, expected)
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
def test_size_series_masked_type_returns_Int64(dtype):
# GH 54132
ser = Series([1, 1, 1], index=["a", "a", "b"], dtype=dtype)
result = ser.groupby(level=0).size()
expected = Series([2, 1], dtype="Int64", index=["a", "b"])
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"dtype",
[
object,
pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
],
)
def test_size_strings(dtype):
# GH#55627
df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype)
result = df.groupby("a")["b"].size()
exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64"
expected = Series(
[2, 1],
index=Index(["a", "b"], name="a", dtype=dtype),
name="b",
dtype=exp_dtype,
)
tm.assert_series_equal(result, expected)
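# --- Illustrative sketch (added for exposition): the difference between the
# two counting reductions -- ``size`` counts rows per group, NaN included,
# while ``count`` counts only non-null entries per column.
def _example_size_vs_count():
    df = DataFrame({"a": ["x", "x", "y"], "b": [1.0, np.nan, 2.0]})
    sizes = df.groupby("a").size()         # x -> 2, y -> 1
    counts = df.groupby("a")["b"].count()  # x -> 1, y -> 1
    return sizes, counts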

View File

@ -0,0 +1,27 @@
import numpy as np
import pandas as pd
import pandas._testing as tm
def test_groupby_skew_equivalence():
# Test that the groupby skew method (which uses libgroupby.group_skew)
# matches the results of operating group-by-group (which uses nanops.nanskew)
nrows = 1000
ngroups = 3
ncols = 2
nan_frac = 0.05
arr = np.random.default_rng(2).standard_normal((nrows, ncols))
arr[np.random.default_rng(2).random(nrows) < nan_frac] = np.nan
df = pd.DataFrame(arr)
grps = np.random.default_rng(2).integers(0, ngroups, size=nrows)
gb = df.groupby(grps)
result = gb.skew()
grpwise = [grp.skew().to_frame(i).T for i, grp in gb]
expected = pd.concat(grpwise, axis=0)
expected.index = expected.index.astype(result.index.dtype) # 32bit builds
tm.assert_frame_equal(result, expected)
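# --- Illustrative sketch (added for exposition): the equivalence under test,
# reduced to one tiny group; Series.skew goes through the same nanskew kernel
# as the group-by-group path above.
def _example_skew_single_group():
    ser = pd.Series([1.0, 2.0, 4.0])
    grouped = ser.groupby([0, 0, 0]).skew()  # a single group labelled 0
    assert np.isclose(grouped.loc[0], ser.skew())
    return grouped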

View File

@ -0,0 +1,83 @@
"""
Tests that apply to all groupby operation methods.
The only tests that should appear here are those that use the `groupby_func` fixture.
Even if a test uses that fixture, prefer a more specific test file if one is
available, such as:
- test_categorical
- test_groupby_dropna
- test_groupby_subclass
- test_raises
"""
import pytest
import pandas as pd
from pandas import DataFrame
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args
def test_multiindex_group_all_columns_when_empty(groupby_func):
# GH 32464
df = DataFrame({"a": [], "b": [], "c": []}).set_index(["a", "b", "c"])
gb = df.groupby(["a", "b", "c"], group_keys=False)
method = getattr(gb, groupby_func)
args = get_groupby_method_args(groupby_func, df)
warn = FutureWarning if groupby_func == "fillna" else None
warn_msg = "DataFrameGroupBy.fillna is deprecated"
with tm.assert_produces_warning(warn, match=warn_msg):
result = method(*args).index
expected = df.index
tm.assert_index_equal(result, expected)
def test_duplicate_columns(request, groupby_func, as_index):
# GH#50806
if groupby_func == "corrwith":
msg = "GH#50845 - corrwith fails when there are duplicate columns"
request.applymarker(pytest.mark.xfail(reason=msg))
df = DataFrame([[1, 3, 6], [1, 4, 7], [2, 5, 8]], columns=list("abb"))
args = get_groupby_method_args(groupby_func, df)
gb = df.groupby("a", as_index=as_index)
warn = FutureWarning if groupby_func == "fillna" else None
warn_msg = "DataFrameGroupBy.fillna is deprecated"
with tm.assert_produces_warning(warn, match=warn_msg):
result = getattr(gb, groupby_func)(*args)
expected_df = df.set_axis(["a", "b", "c"], axis=1)
expected_args = get_groupby_method_args(groupby_func, expected_df)
expected_gb = expected_df.groupby("a", as_index=as_index)
warn = FutureWarning if groupby_func == "fillna" else None
warn_msg = "DataFrameGroupBy.fillna is deprecated"
with tm.assert_produces_warning(warn, match=warn_msg):
expected = getattr(expected_gb, groupby_func)(*expected_args)
if groupby_func not in ("size", "ngroup", "cumcount"):
expected = expected.rename(columns={"c": "b"})
tm.assert_equal(result, expected)
@pytest.mark.parametrize(
"idx",
[
pd.Index(["a", "a"], name="foo"),
pd.MultiIndex.from_tuples((("a", "a"), ("a", "a")), names=["foo", "bar"]),
],
)
def test_dup_labels_output_shape(groupby_func, idx):
if groupby_func in {"size", "ngroup", "cumcount"}:
pytest.skip(f"Not applicable for {groupby_func}")
df = DataFrame([[1, 1]], columns=idx)
grp_by = df.groupby([0])
args = get_groupby_method_args(groupby_func, df)
warn = FutureWarning if groupby_func == "fillna" else None
warn_msg = "DataFrameGroupBy.fillna is deprecated"
with tm.assert_produces_warning(warn, match=warn_msg):
result = getattr(grp_by, groupby_func)(*args)
assert result.shape == (1, 2)
tm.assert_index_equal(result.columns, idx)
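# --- Illustrative sketch (added for exposition): the dispatch pattern these
# tests rely on -- look the method up by name on the GroupBy object and call
# it with whatever required arguments the helper supplies.
def _example_dynamic_dispatch():
    df = DataFrame({"a": [1, 1, 2], "b": [3.0, 4.0, 5.0]})
    gb = df.groupby("a")
    name = "quantile"  # stands in for the parametrized groupby_func
    args = get_groupby_method_args(name, df)
    return getattr(gb, name)(*args)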

View File

@ -0,0 +1,265 @@
"""
Tests of the groupby API, including internal consistency and with other pandas objects.
Tests in this file should only check the existence, names, and arguments of groupby
methods. It should not test the results of any groupby operation.
"""
import inspect
import pytest
from pandas import (
DataFrame,
Series,
)
from pandas.core.groupby.base import (
groupby_other_methods,
reduction_kernels,
transformation_kernels,
)
from pandas.core.groupby.generic import (
DataFrameGroupBy,
SeriesGroupBy,
)
def test_tab_completion(multiindex_dataframe_random_data):
grp = multiindex_dataframe_random_data.groupby(level="second")
results = {v for v in dir(grp) if not v.startswith("_")}
expected = {
"A",
"B",
"C",
"agg",
"aggregate",
"apply",
"boxplot",
"filter",
"first",
"get_group",
"groups",
"hist",
"indices",
"last",
"max",
"mean",
"median",
"min",
"ngroups",
"nth",
"ohlc",
"plot",
"prod",
"size",
"std",
"sum",
"transform",
"var",
"sem",
"count",
"nunique",
"head",
"describe",
"cummax",
"quantile",
"rank",
"cumprod",
"tail",
"resample",
"cummin",
"fillna",
"cumsum",
"cumcount",
"ngroup",
"all",
"shift",
"skew",
"take",
"pct_change",
"any",
"corr",
"corrwith",
"cov",
"dtypes",
"ndim",
"diff",
"idxmax",
"idxmin",
"ffill",
"bfill",
"rolling",
"expanding",
"pipe",
"sample",
"ewm",
"value_counts",
}
assert results == expected
def test_all_methods_categorized(multiindex_dataframe_random_data):
grp = multiindex_dataframe_random_data.groupby(
multiindex_dataframe_random_data.iloc[:, 0]
)
names = {_ for _ in dir(grp) if not _.startswith("_")} - set(
multiindex_dataframe_random_data.columns
)
new_names = set(names)
new_names -= reduction_kernels
new_names -= transformation_kernels
new_names -= groupby_other_methods
assert not reduction_kernels & transformation_kernels
assert not reduction_kernels & groupby_other_methods
assert not transformation_kernels & groupby_other_methods
# new public method?
if new_names:
msg = f"""
There are uncategorized methods defined on the Grouper class:
{new_names}.
Was a new method recently added?
Every public method on Grouper must appear in exactly one of the
following three lists defined in pandas.core.groupby.base:
- `reduction_kernels`
- `transformation_kernels`
- `groupby_other_methods`
see the comments in pandas/core/groupby/base.py for guidance on
how to fix this test.
"""
raise AssertionError(msg)
# removed a public method?
all_categorized = reduction_kernels | transformation_kernels | groupby_other_methods
if names != all_categorized:
msg = f"""
Some methods which are supposed to be on the Grouper class
are missing:
{all_categorized - names}.
They're still defined in one of the lists that live in pandas/core/groupby/base.py.
If you removed a method, you should update those lists as well.
"""
raise AssertionError(msg)
def test_frame_consistency(groupby_func):
# GH#48028
if groupby_func in ("first", "last"):
msg = "first and last are entirely different between frame and groupby"
pytest.skip(reason=msg)
if groupby_func in ("cumcount", "ngroup"):
assert not hasattr(DataFrame, groupby_func)
return
frame_method = getattr(DataFrame, groupby_func)
gb_method = getattr(DataFrameGroupBy, groupby_func)
result = set(inspect.signature(gb_method).parameters)
if groupby_func == "size":
# "size" is a method on GroupBy but property on DataFrame:
expected = {"self"}
else:
expected = set(inspect.signature(frame_method).parameters)
# Exclude certain arguments from result and expected depending on the operation
# Some of these may be purposeful inconsistencies between the APIs
exclude_expected, exclude_result = set(), set()
if groupby_func in ("any", "all"):
exclude_expected = {"kwargs", "bool_only", "axis"}
elif groupby_func in ("count",):
exclude_expected = {"numeric_only", "axis"}
elif groupby_func in ("nunique",):
exclude_expected = {"axis"}
elif groupby_func in ("max", "min"):
exclude_expected = {"axis", "kwargs", "skipna"}
exclude_result = {"min_count", "engine", "engine_kwargs"}
elif groupby_func in ("mean", "std", "sum", "var"):
exclude_expected = {"axis", "kwargs", "skipna"}
exclude_result = {"engine", "engine_kwargs"}
elif groupby_func in ("median", "prod", "sem"):
exclude_expected = {"axis", "kwargs", "skipna"}
elif groupby_func in ("backfill", "bfill", "ffill", "pad"):
exclude_expected = {"downcast", "inplace", "axis", "limit_area"}
elif groupby_func in ("cummax", "cummin"):
exclude_expected = {"skipna", "args"}
exclude_result = {"numeric_only"}
elif groupby_func in ("cumprod", "cumsum"):
exclude_expected = {"skipna"}
elif groupby_func in ("pct_change",):
exclude_expected = {"kwargs"}
exclude_result = {"axis"}
elif groupby_func in ("rank",):
exclude_expected = {"numeric_only"}
elif groupby_func in ("quantile",):
exclude_expected = {"method", "axis"}
# Ensure excluded arguments are actually in the signatures
assert result & exclude_result == exclude_result
assert expected & exclude_expected == exclude_expected
result -= exclude_result
expected -= exclude_expected
assert result == expected
def test_series_consistency(request, groupby_func):
# GH#48028
if groupby_func in ("first", "last"):
pytest.skip("first and last are entirely different between Series and groupby")
if groupby_func in ("cumcount", "corrwith", "ngroup"):
assert not hasattr(Series, groupby_func)
return
series_method = getattr(Series, groupby_func)
gb_method = getattr(SeriesGroupBy, groupby_func)
result = set(inspect.signature(gb_method).parameters)
if groupby_func == "size":
# "size" is a method on GroupBy but property on Series
expected = {"self"}
else:
expected = set(inspect.signature(series_method).parameters)
# Exclude certain arguments from result and expected depending on the operation
# Some of these may be purposeful inconsistencies between the APIs
exclude_expected, exclude_result = set(), set()
if groupby_func in ("any", "all"):
exclude_expected = {"kwargs", "bool_only", "axis"}
elif groupby_func in ("diff",):
exclude_result = {"axis"}
elif groupby_func in ("max", "min"):
exclude_expected = {"axis", "kwargs", "skipna"}
exclude_result = {"min_count", "engine", "engine_kwargs"}
elif groupby_func in ("mean", "std", "sum", "var"):
exclude_expected = {"axis", "kwargs", "skipna"}
exclude_result = {"engine", "engine_kwargs"}
elif groupby_func in ("median", "prod", "sem"):
exclude_expected = {"axis", "kwargs", "skipna"}
elif groupby_func in ("backfill", "bfill", "ffill", "pad"):
exclude_expected = {"downcast", "inplace", "axis", "limit_area"}
elif groupby_func in ("cummax", "cummin"):
exclude_expected = {"skipna", "args"}
exclude_result = {"numeric_only"}
elif groupby_func in ("cumprod", "cumsum"):
exclude_expected = {"skipna"}
elif groupby_func in ("pct_change",):
exclude_expected = {"kwargs"}
exclude_result = {"axis"}
elif groupby_func in ("rank",):
exclude_expected = {"numeric_only"}
elif groupby_func in ("idxmin", "idxmax"):
exclude_expected = {"args", "kwargs"}
elif groupby_func in ("quantile",):
exclude_result = {"numeric_only"}
# Ensure excluded arguments are actually in the signatures
assert result & exclude_result == exclude_result
assert expected & exclude_expected == exclude_expected
result -= exclude_result
expected -= exclude_expected
assert result == expected
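# --- Illustrative sketch (added for exposition): the signature comparison the
# two tests above perform, reduced to a single method; the symmetric
# difference is what the exclude sets are there to absorb.
def _example_signature_comparison():
    gb_params = set(inspect.signature(DataFrameGroupBy.rank).parameters)
    frame_params = set(inspect.signature(DataFrame.rank).parameters)
    return gb_params ^ frame_params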

File diff suppressed because it is too large

View File

@ -0,0 +1,163 @@
import numpy as np
import pandas as pd
import pandas._testing as tm
def test_group_by_copy():
# GH#44803
df = pd.DataFrame(
{
"name": ["Alice", "Bob", "Carl"],
"age": [20, 21, 20],
}
).set_index("name")
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
grp_by_same_value = df.groupby(["age"], group_keys=False).apply(
lambda group: group
)
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
grp_by_copy = df.groupby(["age"], group_keys=False).apply(
lambda group: group.copy()
)
tm.assert_frame_equal(grp_by_same_value, grp_by_copy)
def test_mutate_groups():
# GH3380
df = pd.DataFrame(
{
"cat1": ["a"] * 8 + ["b"] * 6,
"cat2": ["c"] * 2
+ ["d"] * 2
+ ["e"] * 2
+ ["f"] * 2
+ ["c"] * 2
+ ["d"] * 2
+ ["e"] * 2,
"cat3": [f"g{x}" for x in range(1, 15)],
"val": np.random.default_rng(2).integers(100, size=14),
}
)
def f_copy(x):
x = x.copy()
x["rank"] = x.val.rank(method="min")
return x.groupby("cat2")["rank"].min()
def f_no_copy(x):
x["rank"] = x.val.rank(method="min")
return x.groupby("cat2")["rank"].min()
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
grpby_copy = df.groupby("cat1").apply(f_copy)
with tm.assert_produces_warning(DeprecationWarning, match=msg):
grpby_no_copy = df.groupby("cat1").apply(f_no_copy)
tm.assert_series_equal(grpby_copy, grpby_no_copy)
def test_no_mutate_but_looks_like():
# GH 8467
# the first spelling looks like it could mutate (it slices the group before
# returning); the second does not, but both should yield the same results
df = pd.DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].key)
with tm.assert_produces_warning(DeprecationWarning, match=msg):
result2 = df.groupby("key", group_keys=True).apply(lambda x: x.key)
tm.assert_series_equal(result1, result2)
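# --- Illustrative note (added for exposition): ``x[:]`` builds a new object
# with the same contents as ``x``, so both lambdas above return the ``key``
# column of an unmutated group; the test asserts the two spellings agree.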
def test_apply_function_with_indexing(warn_copy_on_write):
# GH: 33058
df = pd.DataFrame(
{"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]}
)
def fn(x):
x.loc[x.index[-1], "col2"] = 0
return x.col2
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(
DeprecationWarning, match=msg, raise_on_extra_warnings=not warn_copy_on_write
):
result = df.groupby(["col1"], as_index=False).apply(fn)
expected = pd.Series(
[1, 2, 0, 4, 5, 0],
index=pd.MultiIndex.from_tuples(
[(0, 0), (0, 1), (0, 2), (1, 3), (1, 4), (1, 5)]
),
name="col2",
)
tm.assert_series_equal(result, expected)
def test_apply_mutate_columns_multiindex():
# GH 12652
df = pd.DataFrame(
{
("C", "julian"): [1, 2, 3],
("B", "geoffrey"): [1, 2, 3],
("A", "julian"): [1, 2, 3],
("B", "julian"): [1, 2, 3],
("A", "geoffrey"): [1, 2, 3],
("C", "geoffrey"): [1, 2, 3],
},
columns=pd.MultiIndex.from_tuples(
[
("A", "julian"),
("A", "geoffrey"),
("B", "julian"),
("B", "geoffrey"),
("C", "julian"),
("C", "geoffrey"),
]
),
)
def add_column(grouped):
name = grouped.columns[0][1]
grouped["sum", name] = grouped.sum(axis=1)
return grouped
msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
gb = df.groupby(level=1, axis=1)
result = gb.apply(add_column)
expected = pd.DataFrame(
[
[1, 1, 1, 3, 1, 1, 1, 3],
[2, 2, 2, 6, 2, 2, 2, 6],
[
3,
3,
3,
9,
3,
3,
3,
9,
],
],
columns=pd.MultiIndex.from_tuples(
[
("geoffrey", "A", "geoffrey"),
("geoffrey", "B", "geoffrey"),
("geoffrey", "C", "geoffrey"),
("geoffrey", "sum", "geoffrey"),
("julian", "A", "julian"),
("julian", "B", "julian"),
("julian", "C", "julian"),
("julian", "sum", "julian"),
]
),
)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,65 @@
import numpy as np
import pytest
from pandas._libs import lib
import pandas.util._test_decorators as td
import pandas as pd
import pandas._testing as tm
def assert_block_lengths(x):
assert len(x) == len(x._mgr.blocks[0].mgr_locs)
return 0
def cumsum_max(x):
x.cumsum().max()
return 0
@pytest.mark.parametrize(
"func",
[
cumsum_max,
pytest.param(assert_block_lengths, marks=td.skip_array_manager_invalid_test),
],
)
def test_mgr_locs_updated(func):
# https://github.com/pandas-dev/pandas/issues/31802
# Some operations may require creating new blocks, which requires
# valid mgr_locs
df = pd.DataFrame({"A": ["a", "a", "a"], "B": ["a", "b", "b"], "C": [1, 1, 1]})
result = df.groupby(["A", "B"]).agg(func)
expected = pd.DataFrame(
{"C": [0, 0]},
index=pd.MultiIndex.from_product([["a"], ["a", "b"]], names=["A", "B"]),
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"binner,closed,expected",
[
(
np.array([0, 3, 6, 9], dtype=np.int64),
"left",
np.array([2, 5, 6], dtype=np.int64),
),
(
np.array([0, 3, 6, 9], dtype=np.int64),
"right",
np.array([3, 6, 6], dtype=np.int64),
),
(np.array([0, 3, 6], dtype=np.int64), "left", np.array([2, 5], dtype=np.int64)),
(
np.array([0, 3, 6], dtype=np.int64),
"right",
np.array([3, 6], dtype=np.int64),
),
],
)
def test_generate_bins(binner, closed, expected):
values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64)
result = lib.generate_bins_dt64(values, binner, closed=closed)
tm.assert_numpy_array_equal(result, expected)
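# --- Illustrative sketch (added for exposition, under the assumption that
# ``generate_bins_dt64`` returns, for each right bin edge, the position of the
# first value falling outside that bin): the expected arrays above can be
# reproduced with ``np.searchsorted``.
def _example_bins_via_searchsorted(values, binner, closed):
    side = "right" if closed == "right" else "left"
    return np.searchsorted(values, binner[1:], side=side)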

File diff suppressed because it is too large

View File

@ -0,0 +1,394 @@
from itertools import product
from string import ascii_lowercase
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
MultiIndex,
Period,
Series,
Timedelta,
Timestamp,
date_range,
)
import pandas._testing as tm
class TestCounting:
def test_cumcount(self):
df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"])
g = df.groupby("A")
sg = g.A
expected = Series([0, 1, 2, 0, 3])
tm.assert_series_equal(expected, g.cumcount())
tm.assert_series_equal(expected, sg.cumcount())
def test_cumcount_empty(self):
ge = DataFrame().groupby(level=0)
se = Series(dtype=object).groupby(level=0)
# edge case, as this is usually considered float
e = Series(dtype="int64")
tm.assert_series_equal(e, ge.cumcount())
tm.assert_series_equal(e, se.cumcount())
def test_cumcount_dupe_index(self):
df = DataFrame(
[["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
)
g = df.groupby("A")
sg = g.A
expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
tm.assert_series_equal(expected, g.cumcount())
tm.assert_series_equal(expected, sg.cumcount())
def test_cumcount_mi(self):
mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=mi)
g = df.groupby("A")
sg = g.A
expected = Series([0, 1, 2, 0, 3], index=mi)
tm.assert_series_equal(expected, g.cumcount())
tm.assert_series_equal(expected, sg.cumcount())
def test_cumcount_groupby_not_col(self):
df = DataFrame(
[["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
)
g = df.groupby([0, 0, 0, 1, 0])
sg = g.A
expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
tm.assert_series_equal(expected, g.cumcount())
tm.assert_series_equal(expected, sg.cumcount())
def test_ngroup(self):
df = DataFrame({"A": list("aaaba")})
g = df.groupby("A")
sg = g.A
expected = Series([0, 0, 0, 1, 0])
tm.assert_series_equal(expected, g.ngroup())
tm.assert_series_equal(expected, sg.ngroup())
def test_ngroup_distinct(self):
df = DataFrame({"A": list("abcde")})
g = df.groupby("A")
sg = g.A
expected = Series(range(5), dtype="int64")
tm.assert_series_equal(expected, g.ngroup())
tm.assert_series_equal(expected, sg.ngroup())
def test_ngroup_one_group(self):
df = DataFrame({"A": [0] * 5})
g = df.groupby("A")
sg = g.A
expected = Series([0] * 5)
tm.assert_series_equal(expected, g.ngroup())
tm.assert_series_equal(expected, sg.ngroup())
def test_ngroup_empty(self):
ge = DataFrame().groupby(level=0)
se = Series(dtype=object).groupby(level=0)
# edge case, as this is usually considered float
e = Series(dtype="int64")
tm.assert_series_equal(e, ge.ngroup())
tm.assert_series_equal(e, se.ngroup())
def test_ngroup_series_matches_frame(self):
df = DataFrame({"A": list("aaaba")})
s = Series(list("aaaba"))
tm.assert_series_equal(df.groupby(s).ngroup(), s.groupby(s).ngroup())
def test_ngroup_dupe_index(self):
df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
g = df.groupby("A")
sg = g.A
expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
tm.assert_series_equal(expected, g.ngroup())
tm.assert_series_equal(expected, sg.ngroup())
def test_ngroup_mi(self):
mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
df = DataFrame({"A": list("aaaba")}, index=mi)
g = df.groupby("A")
sg = g.A
expected = Series([0, 0, 0, 1, 0], index=mi)
tm.assert_series_equal(expected, g.ngroup())
tm.assert_series_equal(expected, sg.ngroup())
def test_ngroup_groupby_not_col(self):
df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
g = df.groupby([0, 0, 0, 1, 0])
sg = g.A
expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
tm.assert_series_equal(expected, g.ngroup())
tm.assert_series_equal(expected, sg.ngroup())
def test_ngroup_descending(self):
df = DataFrame(["a", "a", "b", "a", "b"], columns=["A"])
g = df.groupby(["A"])
ascending = Series([0, 0, 1, 0, 1])
descending = Series([1, 1, 0, 1, 0])
tm.assert_series_equal(descending, (g.ngroups - 1) - ascending)
tm.assert_series_equal(ascending, g.ngroup(ascending=True))
tm.assert_series_equal(descending, g.ngroup(ascending=False))
def test_ngroup_matches_cumcount(self):
# verify one manually-worked out case works
df = DataFrame(
[["a", "x"], ["a", "y"], ["b", "x"], ["a", "x"], ["b", "y"]],
columns=["A", "X"],
)
g = df.groupby(["A", "X"])
g_ngroup = g.ngroup()
g_cumcount = g.cumcount()
expected_ngroup = Series([0, 1, 2, 0, 3])
expected_cumcount = Series([0, 0, 0, 1, 0])
tm.assert_series_equal(g_ngroup, expected_ngroup)
tm.assert_series_equal(g_cumcount, expected_cumcount)
def test_ngroup_cumcount_pair(self):
# brute force comparison for all small series
for p in product(range(3), repeat=4):
df = DataFrame({"a": p})
g = df.groupby(["a"])
order = sorted(set(p))
ngroupd = [order.index(val) for val in p]
cumcounted = [p[:i].count(val) for i, val in enumerate(p)]
tm.assert_series_equal(g.ngroup(), Series(ngroupd))
tm.assert_series_equal(g.cumcount(), Series(cumcounted))
def test_ngroup_respects_groupby_order(self, sort):
df = DataFrame({"a": np.random.default_rng(2).choice(list("abcdef"), 100)})
g = df.groupby("a", sort=sort)
df["group_id"] = -1
df["group_index"] = -1
for i, (_, group) in enumerate(g):
df.loc[group.index, "group_id"] = i
for j, ind in enumerate(group.index):
df.loc[ind, "group_index"] = j
tm.assert_series_equal(Series(df["group_id"].values), g.ngroup())
tm.assert_series_equal(Series(df["group_index"].values), g.cumcount())
@pytest.mark.parametrize(
"datetimelike",
[
[Timestamp(f"2016-05-{i:02d} 20:09:25+00:00") for i in range(1, 4)],
[Timestamp(f"2016-05-{i:02d} 20:09:25") for i in range(1, 4)],
[Timestamp(f"2016-05-{i:02d} 20:09:25", tz="UTC") for i in range(1, 4)],
[Timedelta(x, unit="h") for x in range(1, 4)],
[Period(freq="2W", year=2017, month=x) for x in range(1, 4)],
],
)
def test_count_with_datetimelike(self, datetimelike):
# test for #13393, where DataFrameGroupBy.count() fails
# when counting a datetimelike column.
df = DataFrame({"x": ["a", "a", "b"], "y": datetimelike})
res = df.groupby("x").count()
expected = DataFrame({"y": [2, 1]}, index=["a", "b"])
expected.index.name = "x"
tm.assert_frame_equal(expected, res)
def test_count_with_only_nans_in_first_group(self):
# GH21956
df = DataFrame({"A": [np.nan, np.nan], "B": ["a", "b"], "C": [1, 2]})
result = df.groupby(["A", "B"]).C.count()
mi = MultiIndex(levels=[[], ["a", "b"]], codes=[[], []], names=["A", "B"])
expected = Series([], index=mi, dtype=np.int64, name="C")
tm.assert_series_equal(result, expected, check_index_type=False)
def test_count_groupby_column_with_nan_in_groupby_column(self):
# https://github.com/pandas-dev/pandas/issues/32841
df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.nan, 3, 0]})
res = df.groupby(["B"]).count()
expected = DataFrame(
index=Index([0.0, 3.0, 4.0, 5.0], name="B"), data={"A": [1, 1, 1, 1]}
)
tm.assert_frame_equal(expected, res)
def test_groupby_count_dateparseerror(self):
dr = date_range(start="1/1/2012", freq="5min", periods=10)
# BAD Example, datetimes first
ser = Series(np.arange(10), index=[dr, np.arange(10)])
grouped = ser.groupby(lambda x: x[1] % 2 == 0)
result = grouped.count()
ser = Series(np.arange(10), index=[np.arange(10), dr])
grouped = ser.groupby(lambda x: x[0] % 2 == 0)
expected = grouped.count()
tm.assert_series_equal(result, expected)
def test_groupby_timedelta_cython_count():
df = DataFrame(
{"g": list("ab" * 2), "delta": np.arange(4).astype("timedelta64[ns]")}
)
expected = Series([2, 2], index=Index(["a", "b"], name="g"), name="delta")
result = df.groupby("g").delta.count()
tm.assert_series_equal(expected, result)
def test_count():
n = 1 << 15
dr = date_range("2015-08-30", periods=n // 10, freq="min")
df = DataFrame(
{
"1st": np.random.default_rng(2).choice(list(ascii_lowercase), n),
"2nd": np.random.default_rng(2).integers(0, 5, n),
"3rd": np.random.default_rng(2).standard_normal(n).round(3),
"4th": np.random.default_rng(2).integers(-10, 10, n),
"5th": np.random.default_rng(2).choice(dr, n),
"6th": np.random.default_rng(2).standard_normal(n).round(3),
"7th": np.random.default_rng(2).standard_normal(n).round(3),
"8th": np.random.default_rng(2).choice(dr, n)
- np.random.default_rng(2).choice(dr, 1),
"9th": np.random.default_rng(2).choice(list(ascii_lowercase), n),
}
)
for col in df.columns.drop(["1st", "2nd", "4th"]):
df.loc[np.random.default_rng(2).choice(n, n // 10), col] = np.nan
df["9th"] = df["9th"].astype("category")
for key in ["1st", "2nd", ["1st", "2nd"]]:
left = df.groupby(key).count()
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1)
tm.assert_frame_equal(left, right)
def test_count_non_nulls():
# GH#5610
# count counts non-nulls
df = DataFrame(
[[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, np.nan]],
columns=["A", "B", "C"],
)
count_as = df.groupby("A").count()
count_not_as = df.groupby("A", as_index=False).count()
expected = DataFrame([[1, 2], [0, 0]], columns=["B", "C"], index=[1, 3])
expected.index.name = "A"
tm.assert_frame_equal(count_not_as, expected.reset_index())
tm.assert_frame_equal(count_as, expected)
count_B = df.groupby("A")["B"].count()
tm.assert_series_equal(count_B, expected["B"])
def test_count_object():
df = DataFrame({"a": ["a"] * 3 + ["b"] * 3, "c": [2] * 3 + [3] * 3})
result = df.groupby("c").a.count()
expected = Series([3, 3], index=Index([2, 3], name="c"), name="a")
tm.assert_series_equal(result, expected)
df = DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3})
result = df.groupby("c").a.count()
expected = Series([1, 3], index=Index([2, 3], name="c"), name="a")
tm.assert_series_equal(result, expected)
def test_count_cross_type():
# GH8169
# Set float64 dtype to avoid upcast when setting nan below
vals = np.hstack(
(
np.random.default_rng(2).integers(0, 5, (100, 2)),
np.random.default_rng(2).integers(0, 2, (100, 2)),
)
).astype("float64")
df = DataFrame(vals, columns=["a", "b", "c", "d"])
df[df == 2] = np.nan
expected = df.groupby(["c", "d"]).count()
for t in ["float32", "object"]:
df["a"] = df["a"].astype(t)
df["b"] = df["b"].astype(t)
result = df.groupby(["c", "d"]).count()
tm.assert_frame_equal(result, expected)
def test_lower_int_prec_count():
df = DataFrame(
{
"a": np.array([0, 1, 2, 100], np.int8),
"b": np.array([1, 2, 3, 6], np.uint32),
"c": np.array([4, 5, 6, 8], np.int16),
"grp": list("ab" * 2),
}
)
result = df.groupby("grp").count()
expected = DataFrame(
{"a": [2, 2], "b": [2, 2], "c": [2, 2]}, index=Index(list("ab"), name="grp")
)
tm.assert_frame_equal(result, expected)
def test_count_uses_size_on_exception():
class RaisingObjectException(Exception):
pass
class RaisingObject:
def __init__(self, msg="I will raise inside Cython") -> None:
super().__init__()
self.msg = msg
def __eq__(self, other):
# gets called in Cython to check that raising calls the method
raise RaisingObjectException(self.msg)
df = DataFrame({"a": [RaisingObject() for _ in range(4)], "grp": list("ab" * 2)})
result = df.groupby("grp").count()
expected = DataFrame({"a": [2, 2]}, index=Index(list("ab"), name="grp"))
tm.assert_frame_equal(result, expected)
def test_count_arrow_string_array(any_string_dtype):
# GH#54751
pytest.importorskip("pyarrow")
df = DataFrame(
{"a": [1, 2, 3], "b": Series(["a", "b", "a"], dtype=any_string_dtype)}
)
result = df.groupby("a").count()
expected = DataFrame({"b": 1}, index=Index([1, 2, 3], name="a"))
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,319 @@
import numpy as np
import pytest
from pandas.errors import UnsupportedFunctionCall
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
@pytest.fixture(
params=[np.int32, np.int64, np.float32, np.float64, "Int64", "Float64"],
ids=["np.int32", "np.int64", "np.float32", "np.float64", "Int64", "Float64"],
)
def dtypes_for_minmax(request):
"""
Fixture of dtypes with min and max values used for testing
cummin and cummax
"""
dtype = request.param
np_type = dtype
if dtype == "Int64":
np_type = np.int64
elif dtype == "Float64":
np_type = np.float64
min_val = (
np.iinfo(np_type).min
if np.dtype(np_type).kind == "i"
else np.finfo(np_type).min
)
max_val = (
np.iinfo(np_type).max
if np.dtype(np_type).kind == "i"
else np.finfo(np_type).max
)
return (dtype, min_val, max_val)
def test_groupby_cumprod():
# GH 4095
df = DataFrame({"key": ["b"] * 10, "value": 2})
actual = df.groupby("key")["value"].cumprod()
expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod())
expected.name = "value"
tm.assert_series_equal(actual, expected)
df = DataFrame({"key": ["b"] * 100, "value": 2})
df["value"] = df["value"].astype(float)
actual = df.groupby("key")["value"].cumprod()
expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod())
expected.name = "value"
tm.assert_series_equal(actual, expected)
@pytest.mark.skip_ubsan
def test_groupby_cumprod_overflow():
# GH#37493 if we overflow we return garbage consistent with numpy
df = DataFrame({"key": ["b"] * 4, "value": 100_000})
actual = df.groupby("key")["value"].cumprod()
expected = Series(
[100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920],
name="value",
)
tm.assert_series_equal(actual, expected)
numpy_result = df.groupby("key", group_keys=False)["value"].apply(
lambda x: x.cumprod()
)
numpy_result.name = "value"
tm.assert_series_equal(actual, numpy_result)
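# --- Illustrative note (added for exposition): the "garbage" value above is
# deterministic int64 wrap-around, i.e. the product reduced modulo 2**64:
# 100_000 ** 4 == 10 ** 20, and 10 ** 20 % 2 ** 64 == 7766279631452241920.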
def test_groupby_cumprod_nan_influences_other_columns():
# GH#48064
df = DataFrame(
{
"a": 1,
"b": [1, np.nan, 2],
"c": [1, 2, 3.0],
}
)
result = df.groupby("a").cumprod(numeric_only=True, skipna=False)
expected = DataFrame({"b": [1, np.nan, np.nan], "c": [1, 2, 6.0]})
tm.assert_frame_equal(result, expected)
def test_cummin(dtypes_for_minmax):
dtype = dtypes_for_minmax[0]
min_val = dtypes_for_minmax[1]
# GH 15048
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
df = base_df.astype(dtype)
expected = DataFrame({"B": expected_mins}).astype(dtype)
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)
# Test w/ min value for dtype
df.loc[[2, 6], "B"] = min_val
df.loc[[1, 5], "B"] = min_val + 1
expected.loc[[2, 3, 6, 7], "B"] = min_val
expected.loc[[1, 5], "B"] = min_val + 1 # should not be rounded to min_val
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected, check_exact=True)
expected = (
df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
)
tm.assert_frame_equal(result, expected, check_exact=True)
# Test nan in some values
# Explicit cast to float to avoid implicit cast when setting nan
base_df = base_df.astype({"B": "float"})
base_df.loc[[0, 2, 4, 6], "B"] = np.nan
expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]})
result = base_df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
expected = (
base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
)
tm.assert_frame_equal(result, expected)
# GH 15561
df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
expected = Series(pd.to_datetime("2001"), index=[0], name="b")
result = df.groupby("a")["b"].cummin()
tm.assert_series_equal(expected, result)
# GH 15635
df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]})
result = df.groupby("a").b.cummin()
expected = Series([1, 2, 1], name="b")
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize("dtype", ["UInt64", "Int64", "Float64", "float", "boolean"])
def test_cummin_max_all_nan_column(method, dtype):
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8})
base_df["B"] = base_df["B"].astype(dtype)
grouped = base_df.groupby("A")
expected = DataFrame({"B": [np.nan] * 8}, dtype=dtype)
result = getattr(grouped, method)()
tm.assert_frame_equal(expected, result)
result = getattr(grouped["B"], method)().to_frame()
tm.assert_frame_equal(expected, result)
def test_cummax(dtypes_for_minmax):
dtype = dtypes_for_minmax[0]
max_val = dtypes_for_minmax[2]
# GH 15048
base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]
df = base_df.astype(dtype)
expected = DataFrame({"B": expected_maxs}).astype(dtype)
result = df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)
# Test w/ max value for dtype
df.loc[[2, 6], "B"] = max_val
expected.loc[[2, 3, 6, 7], "B"] = max_val
result = df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
expected = (
df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
)
tm.assert_frame_equal(result, expected)
# Test nan in some values
# Explicit cast to float to avoid implicit cast when setting nan
base_df = base_df.astype({"B": "float"})
base_df.loc[[0, 2, 4, 6], "B"] = np.nan
expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
result = base_df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
expected = (
base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
)
tm.assert_frame_equal(result, expected)
# GH 15561
df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
expected = Series(pd.to_datetime("2001"), index=[0], name="b")
result = df.groupby("a")["b"].cummax()
tm.assert_series_equal(expected, result)
# GH 15635
df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]})
result = df.groupby("a").b.cummax()
expected = Series([2, 1, 2], name="b")
tm.assert_series_equal(result, expected)
def test_cummax_i8_at_implementation_bound():
# the minimum value used to be treated as NPY_NAT+1 instead of NPY_NAT
# for int64 dtype GH#46382
ser = Series([pd.NaT._value + n for n in range(5)])
df = DataFrame({"A": 1, "B": ser, "C": ser._values.view("M8[ns]")})
gb = df.groupby("A")
res = gb.cummax()
exp = df[["B", "C"]]
tm.assert_frame_equal(res, exp)
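# --- Illustrative note (added for exposition): pd.NaT._value is
# np.iinfo(np.int64).min, the sentinel numpy's datetime64 uses for NaT. The
# test above checks that this exact integer still participates in cummax as an
# ordinary value when the dtype is int64, while the datetime64 view of the
# same data treats it as missing.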
@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"])
@pytest.mark.parametrize(
"groups,expected_data",
[
([1, 1, 1], [1, None, None]),
([1, 2, 3], [1, None, 2]),
([1, 3, 3], [1, None, None]),
],
)
def test_cummin_max_skipna(method, dtype, groups, expected_data):
# GH-34047
df = DataFrame({"a": Series([1, None, 2], dtype=dtype)})
orig = df.copy()
gb = df.groupby(groups)["a"]
result = getattr(gb, method)(skipna=False)
expected = Series(expected_data, dtype=dtype, name="a")
# check we didn't accidentally alter df
tm.assert_frame_equal(df, orig)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("method", ["cummin", "cummax"])
def test_cummin_max_skipna_multiple_cols(method):
# Ensure missing value in "a" doesn't cause "b" to be nan-filled
df = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]})
gb = df.groupby([1, 1, 1])[["a", "b"]]
result = getattr(gb, method)(skipna=False)
expected = DataFrame({"a": [np.nan, np.nan, np.nan], "b": [2.0, 2.0, 2.0]})
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("func", ["cumprod", "cumsum"])
def test_numpy_compat(func):
# see gh-12811
df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]})
g = df.groupby("A")
msg = "numpy operations are not valid with groupby"
with pytest.raises(UnsupportedFunctionCall, match=msg):
getattr(g, func)(1, 2, 3)
with pytest.raises(UnsupportedFunctionCall, match=msg):
getattr(g, func)(foo=1)
@td.skip_if_32bit
@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize(
"dtype,val", [("UInt64", np.iinfo("uint64").max), ("Int64", 2**53 + 1)]
)
def test_nullable_int_not_cast_as_float(method, dtype, val):
data = [val, pd.NA]
df = DataFrame({"grp": [1, 1], "b": data}, dtype=dtype)
grouped = df.groupby("grp")
result = grouped.transform(method)
expected = DataFrame({"b": data}, dtype=dtype)
tm.assert_frame_equal(result, expected)
def test_cython_api2():
# this takes the fast apply path
# cumsum (GH5614)
df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"])
result = df.groupby("A").cumsum()
tm.assert_frame_equal(result, expected)
# GH 5755 - cumsum is a transformer and should ignore as_index
result = df.groupby("A", as_index=False).cumsum()
tm.assert_frame_equal(result, expected)
# GH 13994
msg = "DataFrameGroupBy.cumsum with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.groupby("A").cumsum(axis=1)
expected = df.cumsum(axis=1)
tm.assert_frame_equal(result, expected)
msg = "DataFrameGroupBy.cumprod with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.groupby("A").cumprod(axis=1)
expected = df.cumprod(axis=1)
tm.assert_frame_equal(result, expected)

View File

@ -0,0 +1,636 @@
from string import ascii_lowercase
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Series,
Timestamp,
)
import pandas._testing as tm
def test_filter_series():
s = Series([1, 3, 20, 5, 22, 24, 7])
expected_odd = Series([1, 3, 5, 7], index=[0, 1, 3, 6])
expected_even = Series([20, 22, 24], index=[2, 4, 5])
grouper = s.apply(lambda x: x % 2)
grouped = s.groupby(grouper)
tm.assert_series_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd)
tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 10), expected_even)
# Test dropna=False.
tm.assert_series_equal(
grouped.filter(lambda x: x.mean() < 10, dropna=False),
expected_odd.reindex(s.index),
)
tm.assert_series_equal(
grouped.filter(lambda x: x.mean() > 10, dropna=False),
expected_even.reindex(s.index),
)
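# --- Illustrative sketch (added for exposition): what ``dropna=False`` does --
# it preserves the original shape, filling rows of filtered-out groups with
# NaN instead of dropping them.
def _example_filter_dropna():
    s = Series([1, 3, 20])
    grouped = s.groupby(s % 2)  # odd group [1, 3], even group [20]
    # the even group fails the predicate, so its row becomes NaN
    return grouped.filter(lambda x: x.mean() < 10, dropna=False)  # [1.0, 3.0, NaN]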
def test_filter_single_column_df():
df = DataFrame([1, 3, 20, 5, 22, 24, 7])
expected_odd = DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6])
expected_even = DataFrame([20, 22, 24], index=[2, 4, 5])
grouper = df[0].apply(lambda x: x % 2)
grouped = df.groupby(grouper)
tm.assert_frame_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd)
tm.assert_frame_equal(grouped.filter(lambda x: x.mean() > 10), expected_even)
# Test dropna=False.
tm.assert_frame_equal(
grouped.filter(lambda x: x.mean() < 10, dropna=False),
expected_odd.reindex(df.index),
)
tm.assert_frame_equal(
grouped.filter(lambda x: x.mean() > 10, dropna=False),
expected_even.reindex(df.index),
)
def test_filter_multi_column_df():
df = DataFrame({"A": [1, 12, 12, 1], "B": [1, 1, 1, 1]})
grouper = df["A"].apply(lambda x: x % 2)
grouped = df.groupby(grouper)
expected = DataFrame({"A": [12, 12], "B": [1, 1]}, index=[1, 2])
tm.assert_frame_equal(
grouped.filter(lambda x: x["A"].sum() - x["B"].sum() > 10), expected
)
def test_filter_mixed_df():
df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
grouper = df["A"].apply(lambda x: x % 2)
grouped = df.groupby(grouper)
expected = DataFrame({"A": [12, 12], "B": ["b", "c"]}, index=[1, 2])
tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 10), expected)
def test_filter_out_all_groups():
s = Series([1, 3, 20, 5, 22, 24, 7])
grouper = s.apply(lambda x: x % 2)
grouped = s.groupby(grouper)
tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]])
df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
grouper = df["A"].apply(lambda x: x % 2)
grouped = df.groupby(grouper)
tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 1000), df.loc[[]])
def test_filter_out_no_groups():
s = Series([1, 3, 20, 5, 22, 24, 7])
grouper = s.apply(lambda x: x % 2)
grouped = s.groupby(grouper)
filtered = grouped.filter(lambda x: x.mean() > 0)
tm.assert_series_equal(filtered, s)
df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
grouper = df["A"].apply(lambda x: x % 2)
grouped = df.groupby(grouper)
filtered = grouped.filter(lambda x: x["A"].mean() > 0)
tm.assert_frame_equal(filtered, df)
def test_filter_out_all_groups_in_df():
# GH12768
df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
res = df.groupby("a")
res = res.filter(lambda x: x["b"].sum() > 5, dropna=False)
expected = DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3})
tm.assert_frame_equal(expected, res)
df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
res = df.groupby("a")
res = res.filter(lambda x: x["b"].sum() > 5, dropna=True)
expected = DataFrame({"a": [], "b": []}, dtype="int64")
tm.assert_frame_equal(expected, res)
def test_filter_condition_raises():
def raise_if_sum_is_zero(x):
if x.sum() == 0:
raise ValueError
return x.sum() > 0
s = Series([-1, 0, 1, 2])
grouper = s.apply(lambda x: x % 2)
grouped = s.groupby(grouper)
msg = "the filter must return a boolean result"
with pytest.raises(TypeError, match=msg):
grouped.filter(raise_if_sum_is_zero)
def test_filter_with_axis_in_groupby():
# issue 11041
index = pd.MultiIndex.from_product([range(10), [0, 1]])
data = DataFrame(np.arange(100).reshape(-1, 20), columns=index, dtype="int64")
msg = "DataFrame.groupby with axis=1"
with tm.assert_produces_warning(FutureWarning, match=msg):
gb = data.groupby(level=0, axis=1)
result = gb.filter(lambda x: x.iloc[0, 0] > 10)
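# each level-0 group spans two columns and its first-row value is 2 * g,
# so x.iloc[0, 0] > 10 keeps groups 6-9, i.e. columns 12 through 19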
expected = data.iloc[:, 12:20]
tm.assert_frame_equal(result, expected)
def test_filter_bad_shapes():
df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
s = df["B"]
g_df = df.groupby("B")
g_s = s.groupby(s)
f = lambda x: x
msg = "filter function returned a DataFrame, but expected a scalar bool"
with pytest.raises(TypeError, match=msg):
g_df.filter(f)
msg = "the filter must return a boolean result"
with pytest.raises(TypeError, match=msg):
g_s.filter(f)
f = lambda x: x == 1
msg = "filter function returned a DataFrame, but expected a scalar bool"
with pytest.raises(TypeError, match=msg):
g_df.filter(f)
msg = "the filter must return a boolean result"
with pytest.raises(TypeError, match=msg):
g_s.filter(f)
f = lambda x: np.outer(x, x)
msg = "can't multiply sequence by non-int of type 'str'"
with pytest.raises(TypeError, match=msg):
g_df.filter(f)
msg = "the filter must return a boolean result"
with pytest.raises(TypeError, match=msg):
g_s.filter(f)
def test_filter_nan_is_false():
df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
s = df["B"]
g_df = df.groupby(df["B"])
g_s = s.groupby(s)
f = lambda x: np.nan
tm.assert_frame_equal(g_df.filter(f), df.loc[[]])
tm.assert_series_equal(g_s.filter(f), s[[]])
def test_filter_pdna_is_false():
# in particular, don't raise in filter when trying to call bool(pd.NA)
df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
ser = df["B"]
g_df = df.groupby(df["B"])
g_s = ser.groupby(ser)
func = lambda x: pd.NA
res = g_df.filter(func)
tm.assert_frame_equal(res, df.loc[[]])
res = g_s.filter(func)
tm.assert_series_equal(res, ser[[]])
def test_filter_against_workaround_ints():
# Series of ints
s = Series(np.random.default_rng(2).integers(0, 100, 100))
grouper = s.apply(lambda x: np.round(x, -1))
grouped = s.groupby(grouper)
f = lambda x: x.mean() > 10
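# transform broadcasts each group's boolean back to every row of the group,
# so masking with it was the pre-.filter() workaround being compared against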
old_way = s[grouped.transform(f).astype("bool")]
new_way = grouped.filter(f)
tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())
def test_filter_against_workaround_floats():
# Series of floats
s = 100 * Series(np.random.default_rng(2).random(100))
grouper = s.apply(lambda x: np.round(x, -1))
grouped = s.groupby(grouper)
f = lambda x: x.mean() > 10
old_way = s[grouped.transform(f).astype("bool")]
new_way = grouped.filter(f)
tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())
def test_filter_against_workaround_dataframe():
# Set up DataFrame of ints, floats, strings.
letters = np.array(list(ascii_lowercase))
N = 100
random_letters = letters.take(
np.random.default_rng(2).integers(0, 26, N, dtype=int)
)
df = DataFrame(
{
"ints": Series(np.random.default_rng(2).integers(0, 100, N)),
"floats": N / 10 * Series(np.random.default_rng(2).random(N)),
"letters": Series(random_letters),
}
)
# Group by ints; filter on floats.
grouped = df.groupby("ints")
old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 20).astype("bool")]
new_way = grouped.filter(lambda x: x["floats"].mean() > N / 20)
tm.assert_frame_equal(new_way, old_way)
# Group by floats (rounded); filter on strings.
grouper = df.floats.apply(lambda x: np.round(x, -1))
grouped = df.groupby(grouper)
old_way = df[grouped.letters.transform(lambda x: len(x) < N / 10).astype("bool")]
new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
tm.assert_frame_equal(new_way, old_way)
# Group by strings; filter on ints.
grouped = df.groupby("letters")
old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 20).astype("bool")]
new_way = grouped.filter(lambda x: x["ints"].mean() > N / 20)
tm.assert_frame_equal(new_way, old_way)
def test_filter_using_len():
# BUG GH4447
df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
grouped = df.groupby("B")
actual = grouped.filter(lambda x: len(x) > 2)
expected = DataFrame(
{"A": np.arange(2, 6), "B": list("bbbb"), "C": np.arange(2, 6)},
index=np.arange(2, 6, dtype=np.int64),
)
tm.assert_frame_equal(actual, expected)
actual = grouped.filter(lambda x: len(x) > 4)
expected = df.loc[[]]
tm.assert_frame_equal(actual, expected)
# Series have always worked properly, but we'll test anyway.
s = df["B"]
grouped = s.groupby(s)
actual = grouped.filter(lambda x: len(x) > 2)
expected = Series(4 * ["b"], index=np.arange(2, 6, dtype=np.int64), name="B")
tm.assert_series_equal(actual, expected)
actual = grouped.filter(lambda x: len(x) > 4)
expected = s[[]]
tm.assert_series_equal(actual, expected)
def test_filter_maintains_ordering():
# Simple case: index is sequential. #4621
df = DataFrame(
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]}
)
s = df["pid"]
grouped = df.groupby("tag")
actual = grouped.filter(lambda x: len(x) > 1)
expected = df.iloc[[1, 2, 4, 7]]
tm.assert_frame_equal(actual, expected)
grouped = s.groupby(df["tag"])
actual = grouped.filter(lambda x: len(x) > 1)
expected = s.iloc[[1, 2, 4, 7]]
tm.assert_series_equal(actual, expected)
# Now index is sequentially decreasing.
df.index = np.arange(len(df) - 1, -1, -1)
s = df["pid"]
grouped = df.groupby("tag")
actual = grouped.filter(lambda x: len(x) > 1)
expected = df.iloc[[1, 2, 4, 7]]
tm.assert_frame_equal(actual, expected)
grouped = s.groupby(df["tag"])
actual = grouped.filter(lambda x: len(x) > 1)
expected = s.iloc[[1, 2, 4, 7]]
tm.assert_series_equal(actual, expected)
# Index is shuffled.
SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3]
df.index = df.index[SHUFFLED]
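# this relabels the rows in place (positions are unchanged), so the
# positional expectations below still hold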
s = df["pid"]
grouped = df.groupby("tag")
actual = grouped.filter(lambda x: len(x) > 1)
expected = df.iloc[[1, 2, 4, 7]]
tm.assert_frame_equal(actual, expected)
grouped = s.groupby(df["tag"])
actual = grouped.filter(lambda x: len(x) > 1)
expected = s.iloc[[1, 2, 4, 7]]
tm.assert_series_equal(actual, expected)
def test_filter_multiple_timestamp():
# GH 10114
df = DataFrame(
{
"A": np.arange(5, dtype="int64"),
"B": ["foo", "bar", "foo", "bar", "bar"],
"C": Timestamp("20130101"),
}
)
grouped = df.groupby(["B", "C"])
result = grouped["A"].filter(lambda x: True)
tm.assert_series_equal(df["A"], result)
result = grouped["A"].transform(len)
expected = Series([2, 3, 2, 3, 3], name="A")
tm.assert_series_equal(result, expected)
result = grouped.filter(lambda x: True)
tm.assert_frame_equal(df, result)
result = grouped.transform("sum")
expected = DataFrame({"A": [2, 8, 2, 8, 8]})
tm.assert_frame_equal(result, expected)
result = grouped.transform(len)
expected = DataFrame({"A": [2, 3, 2, 3, 3]})
tm.assert_frame_equal(result, expected)
def test_filter_and_transform_with_non_unique_int_index():
# GH4620
index = [1, 1, 1, 2, 1, 1, 0, 1]
df = DataFrame(
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
index=index,
)
grouped_df = df.groupby("tag")
ser = df["pid"]
grouped_ser = ser.groupby(df["tag"])
expected_indexes = [1, 2, 4, 7]
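# tags 45 (rows 1 and 4) and 62 (rows 2 and 7) are the only tags that
# appear more than once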
# Filter DataFrame
actual = grouped_df.filter(lambda x: len(x) > 1)
expected = df.iloc[expected_indexes]
tm.assert_frame_equal(actual, expected)
actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
# Cast to avoid upcast when setting nan below
expected = df.copy().astype("float64")
expected.iloc[[0, 3, 5, 6]] = np.nan
tm.assert_frame_equal(actual, expected)
# Filter Series
actual = grouped_ser.filter(lambda x: len(x) > 1)
expected = ser.take(expected_indexes)
tm.assert_series_equal(actual, expected)
actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
# ^ made manually because this can get confusing!
tm.assert_series_equal(actual, expected)
# Transform Series
actual = grouped_ser.transform(len)
expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
tm.assert_series_equal(actual, expected)
# Transform (a column from) DataFrameGroupBy
actual = grouped_df.pid.transform(len)
tm.assert_series_equal(actual, expected)
def test_filter_and_transform_with_multiple_non_unique_int_index():
# GH4620
index = [1, 1, 1, 2, 0, 0, 0, 1]
df = DataFrame(
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
index=index,
)
grouped_df = df.groupby("tag")
ser = df["pid"]
grouped_ser = ser.groupby(df["tag"])
expected_indexes = [1, 2, 4, 7]
# Filter DataFrame
actual = grouped_df.filter(lambda x: len(x) > 1)
expected = df.iloc[expected_indexes]
tm.assert_frame_equal(actual, expected)
actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
# Cast to avoid upcast when setting nan below
expected = df.copy().astype("float64")
expected.iloc[[0, 3, 5, 6]] = np.nan
tm.assert_frame_equal(actual, expected)
# Filter Series
actual = grouped_ser.filter(lambda x: len(x) > 1)
expected = ser.take(expected_indexes)
tm.assert_series_equal(actual, expected)
actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
# ^ made manually because this can get confusing!
tm.assert_series_equal(actual, expected)
# Transform Series
actual = grouped_ser.transform(len)
expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
tm.assert_series_equal(actual, expected)
# Transform (a column from) DataFrameGroupBy
actual = grouped_df.pid.transform(len)
tm.assert_series_equal(actual, expected)
def test_filter_and_transform_with_non_unique_float_index():
# GH4620
index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float)
df = DataFrame(
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
index=index,
)
grouped_df = df.groupby("tag")
ser = df["pid"]
grouped_ser = ser.groupby(df["tag"])
expected_indexes = [1, 2, 4, 7]
# Filter DataFrame
actual = grouped_df.filter(lambda x: len(x) > 1)
expected = df.iloc[expected_indexes]
tm.assert_frame_equal(actual, expected)
actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
# Cast to avoid upcast when setting nan below
expected = df.copy().astype("float64")
expected.iloc[[0, 3, 5, 6]] = np.nan
tm.assert_frame_equal(actual, expected)
# Filter Series
actual = grouped_ser.filter(lambda x: len(x) > 1)
expected = ser.take(expected_indexes)
tm.assert_series_equal(actual, expected)
actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
# ^ made manually because this can get confusing!
tm.assert_series_equal(actual, expected)
# Transform Series
actual = grouped_ser.transform(len)
expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
tm.assert_series_equal(actual, expected)
# Transform (a column from) DataFrameGroupBy
actual = grouped_df.pid.transform(len)
tm.assert_series_equal(actual, expected)
def test_filter_and_transform_with_non_unique_timestamp_index():
# GH4620
t0 = Timestamp("2013-09-30 00:05:00")
t1 = Timestamp("2013-10-30 00:05:00")
t2 = Timestamp("2013-11-30 00:05:00")
index = [t1, t1, t1, t2, t1, t1, t0, t1]
df = DataFrame(
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
index=index,
)
grouped_df = df.groupby("tag")
ser = df["pid"]
grouped_ser = ser.groupby(df["tag"])
expected_indexes = [1, 2, 4, 7]
# Filter DataFrame
actual = grouped_df.filter(lambda x: len(x) > 1)
expected = df.iloc[expected_indexes]
tm.assert_frame_equal(actual, expected)
actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
# Cast to avoid upcast when setting nan below
expected = df.copy().astype("float64")
expected.iloc[[0, 3, 5, 6]] = np.nan
tm.assert_frame_equal(actual, expected)
# Filter Series
actual = grouped_ser.filter(lambda x: len(x) > 1)
expected = ser.take(expected_indexes)
tm.assert_series_equal(actual, expected)
actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
# ^ made manually because this can get confusing!
tm.assert_series_equal(actual, expected)
# Transform Series
actual = grouped_ser.transform(len)
expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
tm.assert_series_equal(actual, expected)
# Transform (a column from) DataFrameGroupBy
actual = grouped_df.pid.transform(len)
tm.assert_series_equal(actual, expected)
def test_filter_and_transform_with_non_unique_string_index():
# GH4620
index = list("bbbcbbab")
df = DataFrame(
{"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
index=index,
)
grouped_df = df.groupby("tag")
ser = df["pid"]
grouped_ser = ser.groupby(df["tag"])
expected_indexes = [1, 2, 4, 7]
# Filter DataFrame
actual = grouped_df.filter(lambda x: len(x) > 1)
expected = df.iloc[expected_indexes]
tm.assert_frame_equal(actual, expected)
actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
# Cast to avoid upcast when setting nan below
expected = df.copy().astype("float64")
expected.iloc[[0, 3, 5, 6]] = np.nan
tm.assert_frame_equal(actual, expected)
# Filter Series
actual = grouped_ser.filter(lambda x: len(x) > 1)
expected = ser.take(expected_indexes)
tm.assert_series_equal(actual, expected)
actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
# ^ made manually because this can get confusing!
tm.assert_series_equal(actual, expected)
# Transform Series
actual = grouped_ser.transform(len)
expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
tm.assert_series_equal(actual, expected)
# Transform (a column from) DataFrameGroupBy
actual = grouped_df.pid.transform(len)
tm.assert_series_equal(actual, expected)
def test_filter_has_access_to_grouped_cols():
df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=["A", "B"])
g = df.groupby("A")
# previously didn't have access to col A #????
filt = g.filter(lambda x: x["A"].sum() == 2)
tm.assert_frame_equal(filt, df.iloc[[0, 1]])
def test_filter_enforces_scalarness():
df = DataFrame(
[
["best", "a", "x"],
["worst", "b", "y"],
["best", "c", "x"],
["best", "d", "y"],
["worst", "d", "y"],
["worst", "d", "y"],
["best", "d", "z"],
],
columns=["a", "b", "c"],
)
with pytest.raises(TypeError, match="filter function returned a.*"):
df.groupby("c").filter(lambda g: g["a"] == "best")
def test_filter_non_bool_raises():
df = DataFrame(
[
["best", "a", 1],
["worst", "b", 1],
["best", "c", 1],
["best", "d", 1],
["worst", "d", 1],
["worst", "d", 1],
["best", "d", 1],
],
columns=["a", "b", "c"],
)
with pytest.raises(TypeError, match="filter function returned a.*"):
df.groupby("a").filter(lambda g: g.c.mean())
def test_filter_dropna_with_empty_groups():
# GH 10780
data = Series(np.random.default_rng(2).random(9), index=np.repeat([1, 2, 3], 3))
grouped = data.groupby(level=0)
result_false = grouped.filter(lambda x: x.mean() > 1, dropna=False)
expected_false = Series([np.nan] * 9, index=np.repeat([1, 2, 3], 3))
tm.assert_series_equal(result_false, expected_false)
result_true = grouped.filter(lambda x: x.mean() > 1, dropna=True)
expected_true = Series(index=pd.Index([], dtype=int), dtype=np.float64)
tm.assert_series_equal(result_true, expected_true)
def test_filter_consistent_result_before_after_agg_func():
# GH 17091
df = DataFrame({"data": range(6), "key": list("ABCABC")})
grouper = df.groupby("key")
result = grouper.filter(lambda x: True)
expected = DataFrame({"data": range(6), "key": list("ABCABC")})
tm.assert_frame_equal(result, expected)
grouper.sum()
result = grouper.filter(lambda x: True)
tm.assert_frame_equal(result, expected)

File diff suppressed because it is too large


@@ -0,0 +1,696 @@
import numpy as np
import pytest
from pandas.compat.pyarrow import pa_version_under10p1
from pandas.core.dtypes.missing import na_value_for_dtype
import pandas as pd
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args
@pytest.mark.parametrize(
"dropna, tuples, outputs",
[
(
True,
[["A", "B"], ["B", "A"]],
{"c": [13.0, 123.23], "d": [13.0, 123.0], "e": [13.0, 1.0]},
),
(
False,
[["A", "B"], ["A", np.nan], ["B", "A"]],
{
"c": [13.0, 12.3, 123.23],
"d": [13.0, 233.0, 123.0],
"e": [13.0, 12.0, 1.0],
},
),
],
)
def test_groupby_dropna_multi_index_dataframe_nan_in_one_group(
dropna, tuples, outputs, nulls_fixture
):
# GH 3729: test the case where NA appears in only one group
df_list = [
["A", "B", 12, 12, 12],
["A", nulls_fixture, 12.3, 233.0, 12],
["B", "A", 123.23, 123, 1],
["A", "B", 1, 1, 1.0],
]
df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
grouped = df.groupby(["a", "b"], dropna=dropna).sum()
mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))
# MultiIndex.from_* constructors currently drop NA from the levels by
# default, so we need to add NA back to the level manually afterwards.
if not dropna:
mi = mi.set_levels(["A", "B", np.nan], level="b")
expected = pd.DataFrame(outputs, index=mi)
tm.assert_frame_equal(grouped, expected)
@pytest.mark.parametrize(
"dropna, tuples, outputs",
[
(
True,
[["A", "B"], ["B", "A"]],
{"c": [12.0, 123.23], "d": [12.0, 123.0], "e": [12.0, 1.0]},
),
(
False,
[["A", "B"], ["A", np.nan], ["B", "A"], [np.nan, "B"]],
{
"c": [12.0, 13.3, 123.23, 1.0],
"d": [12.0, 234.0, 123.0, 1.0],
"e": [12.0, 13.0, 1.0, 1.0],
},
),
],
)
def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
dropna, tuples, outputs, nulls_fixture, nulls_fixture2
):
# GH 3729: test NA appearing in two different groups, with different null representations
df_list = [
["A", "B", 12, 12, 12],
["A", nulls_fixture, 12.3, 233.0, 12],
["B", "A", 123.23, 123, 1],
[nulls_fixture2, "B", 1, 1, 1.0],
["A", nulls_fixture2, 1, 1, 1.0],
]
df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
grouped = df.groupby(["a", "b"], dropna=dropna).sum()
mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))
# MultiIndex.from_* constructors currently drop NA from the levels by
# default, so we need to add NA back to the level manually afterwards.
if not dropna:
mi = mi.set_levels([["A", "B", np.nan], ["A", "B", np.nan]])
expected = pd.DataFrame(outputs, index=mi)
tm.assert_frame_equal(grouped, expected)
@pytest.mark.parametrize(
"dropna, idx, outputs",
[
(True, ["A", "B"], {"b": [123.23, 13.0], "c": [123.0, 13.0], "d": [1.0, 13.0]}),
(
False,
["A", "B", np.nan],
{
"b": [123.23, 13.0, 12.3],
"c": [123.0, 13.0, 233.0],
"d": [1.0, 13.0, 12.0],
},
),
],
)
def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs):
# GH 3729
df_list = [
["B", 12, 12, 12],
[None, 12.3, 233.0, 12],
["A", 123.23, 123, 1],
["B", 1, 1, 1.0],
]
df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"])
grouped = df.groupby("a", dropna=dropna).sum()
expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a"))
tm.assert_frame_equal(grouped, expected)
@pytest.mark.parametrize(
"dropna, idx, expected",
[
(True, ["a", "a", "b", np.nan], pd.Series([3, 3], index=["a", "b"])),
(
False,
["a", "a", "b", np.nan],
pd.Series([3, 3, 3], index=["a", "b", np.nan]),
),
],
)
def test_groupby_dropna_series_level(dropna, idx, expected):
ser = pd.Series([1, 2, 3, 3], index=idx)
result = ser.groupby(level=0, dropna=dropna).sum()
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"dropna, expected",
[
(True, pd.Series([210.0, 350.0], index=["a", "b"], name="Max Speed")),
(
False,
pd.Series([210.0, 350.0, 20.0], index=["a", "b", np.nan], name="Max Speed"),
),
],
)
def test_groupby_dropna_series_by(dropna, expected):
ser = pd.Series(
[390.0, 350.0, 30.0, 20.0],
index=["Falcon", "Falcon", "Parrot", "Parrot"],
name="Max Speed",
)
result = ser.groupby(["a", "b", "a", np.nan], dropna=dropna).mean()
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("dropna", (False, True))
def test_grouper_dropna_propagation(dropna):
# GH 36604
df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]})
gb = df.groupby("A", dropna=dropna)
assert gb._grouper.dropna == dropna
@pytest.mark.parametrize(
"index",
[
pd.RangeIndex(0, 4),
list("abcd"),
pd.MultiIndex.from_product([(1, 2), ("R", "B")], names=["num", "col"]),
],
)
def test_groupby_dataframe_slice_then_transform(dropna, index):
# GH35014 & GH35612
expected_data = {"B": [2, 2, 1, np.nan if dropna else 1]}
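# group A=0 has two rows and A=1 has one; the null key forms its own group
# of size 1 when dropna=False and is excluded (NaN) when dropna=True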
df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=index)
gb = df.groupby("A", dropna=dropna)
result = gb.transform(len)
expected = pd.DataFrame(expected_data, index=index)
tm.assert_frame_equal(result, expected)
result = gb[["B"]].transform(len)
expected = pd.DataFrame(expected_data, index=index)
tm.assert_frame_equal(result, expected)
result = gb["B"].transform(len)
expected = pd.Series(expected_data["B"], index=index, name="B")
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"dropna, tuples, outputs",
[
(
True,
[["A", "B"], ["B", "A"]],
{"c": [13.0, 123.23], "d": [12.0, 123.0], "e": [1.0, 1.0]},
),
(
False,
[["A", "B"], ["A", np.nan], ["B", "A"]],
{
"c": [13.0, 12.3, 123.23],
"d": [12.0, 233.0, 123.0],
"e": [1.0, 12.0, 1.0],
},
),
],
)
def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs):
# GH 3729
df_list = [
["A", "B", 12, 12, 12],
["A", None, 12.3, 233.0, 12],
["B", "A", 123.23, 123, 1],
["A", "B", 1, 1, 1.0],
]
df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
agg_dict = {"c": "sum", "d": "max", "e": "min"}
grouped = df.groupby(["a", "b"], dropna=dropna).agg(agg_dict)
mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))
# MultiIndex.from_* constructors currently drop NA from the levels by
# default, so we need to add NA back to the level manually afterwards.
if not dropna:
mi = mi.set_levels(["A", "B", np.nan], level="b")
expected = pd.DataFrame(outputs, index=mi)
tm.assert_frame_equal(grouped, expected)
@pytest.mark.arm_slow
@pytest.mark.parametrize(
"datetime1, datetime2",
[
(pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")),
(pd.Timedelta("-2 days"), pd.Timedelta("-1 days")),
(pd.Period("2020-01-01"), pd.Period("2020-02-01")),
],
)
@pytest.mark.parametrize("dropna, values", [(True, [12, 3]), (False, [12, 3, 6])])
def test_groupby_dropna_datetime_like_data(
dropna, values, datetime1, datetime2, unique_nulls_fixture, unique_nulls_fixture2
):
# GH 3729
df = pd.DataFrame(
{
"values": [1, 2, 3, 4, 5, 6],
"dt": [
datetime1,
unique_nulls_fixture,
datetime2,
unique_nulls_fixture2,
datetime1,
datetime1,
],
}
)
if dropna:
indexes = [datetime1, datetime2]
else:
indexes = [datetime1, datetime2, np.nan]
grouped = df.groupby("dt", dropna=dropna).agg({"values": "sum"})
expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt"))
tm.assert_frame_equal(grouped, expected)
@pytest.mark.parametrize(
"dropna, data, selected_data, levels",
[
pytest.param(
False,
{"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
{"values": [0, 1, 0, 0]},
["a", "b", np.nan],
id="dropna_false_has_nan",
),
pytest.param(
True,
{"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
{"values": [0, 1, 0]},
None,
id="dropna_true_has_nan",
),
pytest.param(
# no nan in "groups"; dropna=True|False should give the same result.
False,
{"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
{"values": [0, 1, 0, 0]},
None,
id="dropna_false_no_nan",
),
pytest.param(
# no nan in "groups"; dropna=True|False should give the same result.
True,
{"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
{"values": [0, 1, 0, 0]},
None,
id="dropna_true_no_nan",
),
],
)
def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, levels):
# GH 35889
df = pd.DataFrame(data)
gb = df.groupby("groups", dropna=dropna)
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))}))
mi_tuples = tuple(zip(data["groups"], selected_data["values"]))
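# pair each group label with its within-group position; when dropna=True the
# shorter values list makes zip drop the null-key row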
mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None])
# MultiIndex.from_* constructors currently drop NA from the levels by
# default, so we need to add NA back to the level manually afterwards.
if not dropna and levels:
mi = mi.set_levels(levels, level="groups")
expected = pd.DataFrame(selected_data, index=mi)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("input_index", [None, ["a"], ["a", "b"]])
@pytest.mark.parametrize("keys", [["a"], ["a", "b"]])
@pytest.mark.parametrize("series", [True, False])
def test_groupby_dropna_with_multiindex_input(input_index, keys, series):
# GH#46783
obj = pd.DataFrame(
{
"a": [1, np.nan],
"b": [1, 1],
"c": [2, 3],
}
)
expected = obj.set_index(keys)
if series:
expected = expected["c"]
elif input_index == ["a", "b"] and keys == ["a"]:
# Column b should not be aggregated
expected = expected[["c"]]
if input_index is not None:
obj = obj.set_index(input_index)
gb = obj.groupby(keys, dropna=False)
if series:
gb = gb["c"]
result = gb.sum()
tm.assert_equal(result, expected)
def test_groupby_nan_included():
# GH 35646
data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
df = pd.DataFrame(data)
grouped = df.groupby("group", dropna=False)
result = grouped.indices
dtype = np.intp
expected = {
"g1": np.array([0, 2], dtype=dtype),
"g2": np.array([3], dtype=dtype),
np.nan: np.array([1, 4], dtype=dtype),
}
for result_values, expected_values in zip(result.values(), expected.values()):
tm.assert_numpy_array_equal(result_values, expected_values)
assert np.isnan(list(result.keys())[2])
assert list(result.keys())[0:2] == ["g1", "g2"]
def test_groupby_drop_nan_with_multi_index():
# GH 39895
df = pd.DataFrame([[np.nan, 0, 1]], columns=["a", "b", "c"])
df = df.set_index(["a", "b"])
result = df.groupby(["a", "b"], dropna=False).first()
expected = df
tm.assert_frame_equal(result, expected)
# sequence_index enumerates all strings made up of x, y, z of length 4
@pytest.mark.parametrize("sequence_index", range(3**4))
@pytest.mark.parametrize(
"dtype",
[
None,
"UInt8",
"Int8",
"UInt16",
"Int16",
"UInt32",
"Int32",
"UInt64",
"Int64",
"Float32",
"Int64",
"Float64",
"category",
"string",
pytest.param(
"string[pyarrow]",
marks=pytest.mark.skipif(
pa_version_under10p1, reason="pyarrow is not installed"
),
),
"datetime64[ns]",
"period[d]",
"Sparse[float]",
],
)
@pytest.mark.parametrize("test_series", [True, False])
def test_no_sort_keep_na(sequence_index, dtype, test_series, as_index):
# GH#46584, GH#48794
# Convert sequence_index into a string sequence over "xyz" (base-3 digits,
# least significant first), e.g. 5 becomes "zyxx"
# This sequence is used for the grouper.
sequence = "".join(
[{0: "x", 1: "y", 2: "z"}[sequence_index // (3**k) % 3] for k in range(4)]
)
# Unique values to use for grouper, depends on dtype
if dtype in ("string", "string[pyarrow]"):
uniques = {"x": "x", "y": "y", "z": pd.NA}
elif dtype in ("datetime64[ns]", "period[d]"):
uniques = {"x": "2016-01-01", "y": "2017-01-01", "z": pd.NA}
else:
uniques = {"x": 1, "y": 2, "z": np.nan}
df = pd.DataFrame(
{
"key": pd.Series([uniques[label] for label in sequence], dtype=dtype),
"a": [0, 1, 2, 3],
}
)
gb = df.groupby("key", dropna=False, sort=False, as_index=as_index, observed=False)
if test_series:
gb = gb["a"]
result = gb.sum()
# Manually compute the groupby sum, using the labels "x", "y", and "z" to
# avoid issues with hashing np.nan
summed = {}
for idx, label in enumerate(sequence):
summed[label] = summed.get(label, 0) + idx
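# column "a" holds the row positions 0-3, so summing positions per label
# reproduces the grouped sum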
if dtype == "category":
index = pd.CategoricalIndex(
[uniques[e] for e in summed],
df["key"].cat.categories,
name="key",
)
elif isinstance(dtype, str) and dtype.startswith("Sparse"):
index = pd.Index(
pd.array([uniques[label] for label in summed], dtype=dtype), name="key"
)
else:
index = pd.Index([uniques[label] for label in summed], dtype=dtype, name="key")
expected = pd.Series(summed.values(), index=index, name="a", dtype=None)
if not test_series:
expected = expected.to_frame()
if not as_index:
expected = expected.reset_index()
if dtype is not None and dtype.startswith("Sparse"):
expected["key"] = expected["key"].astype(dtype)
tm.assert_equal(result, expected)
@pytest.mark.parametrize("test_series", [True, False])
@pytest.mark.parametrize("dtype", [object, None])
def test_null_is_null_for_dtype(
sort, dtype, nulls_fixture, nulls_fixture2, test_series
):
# GH#48506 - groups should always result in using the null for the dtype
df = pd.DataFrame({"a": [1, 2]})
groups = pd.Series([nulls_fixture, nulls_fixture2], dtype=dtype)
obj = df["a"] if test_series else df
gb = obj.groupby(groups, dropna=False, sort=sort)
result = gb.sum()
index = pd.Index([na_value_for_dtype(groups.dtype)])
expected = pd.DataFrame({"a": [3]}, index=index)
if test_series:
tm.assert_series_equal(result, expected["a"])
else:
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
def test_categorical_reducers(reduction_func, observed, sort, as_index, index_kind):
# Ensure there is at least one null value by appending to the end
values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None)
df = pd.DataFrame(
{"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(20)}
)
# Strategy: Compare to dropna=True by filling null values with a new code
df_filled = df.copy()
df_filled["x"] = pd.Categorical(values, categories=[1, 2, 3, 4]).fillna(4)
if index_kind == "range":
keys = ["x"]
elif index_kind == "single":
keys = ["x"]
df = df.set_index("x")
df_filled = df_filled.set_index("x")
else:
keys = ["x", "x2"]
df["x2"] = df["x"]
df = df.set_index(["x", "x2"])
df_filled["x2"] = df_filled["x"]
df_filled = df_filled.set_index(["x", "x2"])
args = get_groupby_method_args(reduction_func, df)
args_filled = get_groupby_method_args(reduction_func, df_filled)
if reduction_func == "corrwith" and index_kind == "range":
# Don't include the grouping columns so we can call reset_index
args = (args[0].drop(columns=keys),)
args_filled = (args_filled[0].drop(columns=keys),)
gb_keepna = df.groupby(
keys, dropna=False, observed=observed, sort=sort, as_index=as_index
)
if not observed and reduction_func in ["idxmin", "idxmax"]:
with pytest.raises(
ValueError, match="empty group due to unobserved categories"
):
getattr(gb_keepna, reduction_func)(*args)
return
gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True)
expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index()
expected["x"] = expected["x"].cat.remove_categories([4])
if index_kind == "multi":
expected["x2"] = expected["x2"].cat.remove_categories([4])
if as_index:
if index_kind == "multi":
expected = expected.set_index(["x", "x2"])
else:
expected = expected.set_index("x")
elif index_kind != "range" and reduction_func != "size":
# size, unlike other methods, has the desired behavior in GH#49519
expected = expected.drop(columns="x")
if index_kind == "multi":
expected = expected.drop(columns="x2")
if reduction_func in ("idxmax", "idxmin") and index_kind != "range":
# expected was computed with a RangeIndex; need to translate to index values
values = expected["y"].values.tolist()
if index_kind == "single":
values = [np.nan if e == 4 else e for e in values]
expected["y"] = pd.Categorical(values, categories=[1, 2, 3])
else:
values = [(np.nan, np.nan) if e == (4, 4) else e for e in values]
expected["y"] = values
if reduction_func == "size":
# size, unlike other methods, has the desired behavior in GH#49519
expected = expected.rename(columns={0: "size"})
if as_index:
expected = expected["size"].rename(None)
if as_index or index_kind == "range" or reduction_func == "size":
warn = None
else:
warn = FutureWarning
msg = "A grouping .* was excluded from the result"
with tm.assert_produces_warning(warn, match=msg):
result = getattr(gb_keepna, reduction_func)(*args)
# size will return a Series, others are DataFrame
tm.assert_equal(result, expected)
def test_categorical_transformers(
request, transformation_func, observed, sort, as_index
):
# GH#36327
if transformation_func == "fillna":
msg = "GH#49651 fillna may incorrectly reorders results when dropna=False"
request.applymarker(pytest.mark.xfail(reason=msg, strict=False))
values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None)
df = pd.DataFrame(
{"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(20)}
)
args = get_groupby_method_args(transformation_func, df)
# Compute result for null group
null_group_values = df[df["x"].isnull()]["y"]
if transformation_func == "cumcount":
null_group_data = list(range(len(null_group_values)))
elif transformation_func == "ngroup":
if sort:
if observed:
na_group = df["x"].nunique(dropna=False) - 1
else:
# TODO: Should this be 3?
na_group = df["x"].nunique(dropna=False) - 1
else:
na_group = df.iloc[: null_group_values.index[0]]["x"].nunique()
null_group_data = len(null_group_values) * [na_group]
else:
null_group_data = getattr(null_group_values, transformation_func)(*args)
null_group_result = pd.DataFrame({"y": null_group_data})
gb_keepna = df.groupby(
"x", dropna=False, observed=observed, sort=sort, as_index=as_index
)
gb_dropna = df.groupby("x", dropna=True, observed=observed, sort=sort)
msg = "The default fill_method='ffill' in DataFrameGroupBy.pct_change is deprecated"
if transformation_func == "pct_change":
with tm.assert_produces_warning(FutureWarning, match=msg):
result = getattr(gb_keepna, "pct_change")(*args)
else:
result = getattr(gb_keepna, transformation_func)(*args)
expected = getattr(gb_dropna, transformation_func)(*args)
for iloc, value in zip(
df[df["x"].isnull()].index.tolist(), null_group_result.values.ravel()
):
if expected.ndim == 1:
expected.iloc[iloc] = value
else:
expected.iloc[iloc, 0] = value
if transformation_func == "ngroup":
expected[df["x"].notnull() & expected.ge(na_group)] += 1
if transformation_func not in ("rank", "diff", "pct_change", "shift"):
expected = expected.astype("int64")
tm.assert_equal(result, expected)
@pytest.mark.parametrize("method", ["head", "tail"])
def test_categorical_head_tail(method, observed, sort, as_index):
# GH#36327
values = np.random.default_rng(2).choice([1, 2, None], 30)
df = pd.DataFrame(
{"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
)
gb = df.groupby("x", dropna=False, observed=observed, sort=sort, as_index=as_index)
result = getattr(gb, method)()
if method == "tail":
values = values[::-1]
# Take the top 5 values from each group
mask = (
((values == 1) & ((values == 1).cumsum() <= 5))
| ((values == 2) & ((values == 2).cumsum() <= 5))
# flake8 doesn't like the vectorized check for None, thinks we should use `is`
| ((values == None) & ((values == None).cumsum() <= 5)) # noqa: E711
)
if method == "tail":
mask = mask[::-1]
expected = df[mask]
tm.assert_frame_equal(result, expected)
def test_categorical_agg():
# GH#36327
values = np.random.default_rng(2).choice([1, 2, None], 30)
df = pd.DataFrame(
{"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
)
gb = df.groupby("x", dropna=False, observed=False)
result = gb.agg(lambda x: x.sum())
expected = gb.sum()
tm.assert_frame_equal(result, expected)
def test_categorical_transform():
# GH#36327
values = np.random.default_rng(2).choice([1, 2, None], 30)
df = pd.DataFrame(
{"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
)
gb = df.groupby("x", dropna=False, observed=False)
result = gb.transform(lambda x: x.sum())
expected = gb.transform("sum")
tm.assert_frame_equal(result, expected)


@@ -0,0 +1,135 @@
from datetime import datetime
import numpy as np
import pytest
from pandas import (
DataFrame,
Index,
Series,
)
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning"
)
@pytest.mark.parametrize(
"obj",
[
tm.SubclassedDataFrame({"A": np.arange(0, 10)}),
tm.SubclassedSeries(np.arange(0, 10), name="A"),
],
)
def test_groupby_preserves_subclass(obj, groupby_func):
# GH28330 -- preserve subclass through groupby operations
if isinstance(obj, Series) and groupby_func in {"corrwith"}:
pytest.skip(f"Not applicable for Series and {groupby_func}")
grouped = obj.groupby(np.arange(0, 10))
# Groups should preserve subclass type
assert isinstance(grouped.get_group(0), type(obj))
args = get_groupby_method_args(groupby_func, obj)
warn = FutureWarning if groupby_func == "fillna" else None
msg = f"{type(grouped).__name__}.fillna is deprecated"
with tm.assert_produces_warning(warn, match=msg, raise_on_extra_warnings=False):
result1 = getattr(grouped, groupby_func)(*args)
with tm.assert_produces_warning(warn, match=msg, raise_on_extra_warnings=False):
result2 = grouped.agg(groupby_func, *args)
# Reduction or transformation kernels should preserve type
slices = {"ngroup", "cumcount", "size"}
if isinstance(obj, DataFrame) and groupby_func in slices:
assert isinstance(result1, tm.SubclassedSeries)
else:
assert isinstance(result1, type(obj))
# Confirm .agg() groupby operations return same results
if isinstance(result1, DataFrame):
tm.assert_frame_equal(result1, result2)
else:
tm.assert_series_equal(result1, result2)
def test_groupby_preserves_metadata():
# GH-37343
custom_df = tm.SubclassedDataFrame({"a": [1, 2, 3], "b": [1, 1, 2], "c": [7, 8, 9]})
assert "testattr" in custom_df._metadata
custom_df.testattr = "hello"
for _, group_df in custom_df.groupby("c"):
assert group_df.testattr == "hello"
# GH-45314
def func(group):
assert isinstance(group, tm.SubclassedDataFrame)
assert hasattr(group, "testattr")
assert group.testattr == "hello"
return group.testattr
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(
DeprecationWarning,
match=msg,
raise_on_extra_warnings=False,
check_stacklevel=False,
):
result = custom_df.groupby("c").apply(func)
expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c"))
tm.assert_series_equal(result, expected)
result = custom_df.groupby("c").apply(func, include_groups=False)
tm.assert_series_equal(result, expected)
# https://github.com/pandas-dev/pandas/pull/56761
result = custom_df.groupby("c")[["a", "b"]].apply(func)
tm.assert_series_equal(result, expected)
def func2(group):
assert isinstance(group, tm.SubclassedSeries)
assert hasattr(group, "testattr")
return group.testattr
custom_series = tm.SubclassedSeries([1, 2, 3])
custom_series.testattr = "hello"
result = custom_series.groupby(custom_df["c"]).apply(func2)
tm.assert_series_equal(result, expected)
result = custom_series.groupby(custom_df["c"]).agg(func2)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("obj", [DataFrame, tm.SubclassedDataFrame])
def test_groupby_resample_preserves_subclass(obj):
# GH28330 -- preserve subclass through groupby.resample()
df = obj(
{
"Buyer": "Carl Carl Carl Carl Joe Carl".split(),
"Quantity": [18, 3, 5, 1, 9, 3],
"Date": [
datetime(2013, 9, 1, 13, 0),
datetime(2013, 9, 1, 13, 5),
datetime(2013, 10, 1, 20, 0),
datetime(2013, 10, 3, 10, 0),
datetime(2013, 12, 2, 12, 0),
datetime(2013, 9, 2, 14, 0),
],
}
)
df = df.set_index("Date")
# Confirm groupby.resample() preserves dataframe type
msg = "DataFrameGroupBy.resample operated on the grouping columns"
with tm.assert_produces_warning(
DeprecationWarning,
match=msg,
raise_on_extra_warnings=False,
check_stacklevel=False,
):
result = df.groupby("Buyer").resample("5D").sum()
assert isinstance(result, obj)

File diff suppressed because it is too large


@@ -0,0 +1,85 @@
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.fixture(params=[["inner"], ["inner", "outer"]])
def frame(request):
levels = request.param
df = pd.DataFrame(
{
"outer": ["a", "a", "a", "b", "b", "b"],
"inner": [1, 2, 3, 1, 2, 3],
"A": np.arange(6),
"B": ["one", "one", "two", "two", "one", "one"],
}
)
if levels:
df = df.set_index(levels)
return df
@pytest.fixture()
def series():
df = pd.DataFrame(
{
"outer": ["a", "a", "a", "b", "b", "b"],
"inner": [1, 2, 3, 1, 2, 3],
"A": np.arange(6),
"B": ["one", "one", "two", "two", "one", "one"],
}
)
s = df.set_index(["outer", "inner", "B"])["A"]
return s
@pytest.mark.parametrize(
"key_strs,groupers",
[
("inner", pd.Grouper(level="inner")), # Index name
(["inner"], [pd.Grouper(level="inner")]), # List of index name
(["B", "inner"], ["B", pd.Grouper(level="inner")]), # Column and index
(["inner", "B"], [pd.Grouper(level="inner"), "B"]), # Index and column
],
)
def test_grouper_index_level_as_string(frame, key_strs, groupers):
if "B" not in key_strs or "outer" in frame.columns:
result = frame.groupby(key_strs).mean(numeric_only=True)
expected = frame.groupby(groupers).mean(numeric_only=True)
else:
result = frame.groupby(key_strs).mean()
expected = frame.groupby(groupers).mean()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"levels",
[
"inner",
"outer",
"B",
["inner"],
["outer"],
["B"],
["inner", "outer"],
["outer", "inner"],
["inner", "outer", "B"],
["B", "outer", "inner"],
],
)
def test_grouper_index_level_as_string_series(series, levels):
# Compute expected result
if isinstance(levels, list):
groupers = [pd.Grouper(level=lv) for lv in levels]
else:
groupers = pd.Grouper(level=levels)
expected = series.groupby(groupers).mean()
# Compute and check result
result = series.groupby(levels).mean()
tm.assert_series_equal(result, expected)


@@ -0,0 +1,333 @@
# Tests for positional grouped indexing with GroupBy._positional_selector (GH#42864)
import numpy as np
import pytest
import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize(
"arg, expected_rows",
[
[0, [0, 1, 4]],
[2, [5]],
[5, []],
[-1, [3, 4, 7]],
[-2, [1, 6]],
[-6, []],
],
)
def test_int(slice_test_df, slice_test_grouped, arg, expected_rows):
# Test single integer
result = slice_test_grouped._positional_selector[arg]
expected = slice_test_df.iloc[expected_rows]
tm.assert_frame_equal(result, expected)
def test_slice(slice_test_df, slice_test_grouped):
# Test single slice
result = slice_test_grouped._positional_selector[0:3:2]
expected = slice_test_df.iloc[[0, 1, 4, 5]]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"arg, expected_rows",
[
[[0, 2], [0, 1, 4, 5]],
[[0, 2, -1], [0, 1, 3, 4, 5, 7]],
[range(0, 3, 2), [0, 1, 4, 5]],
[{0, 2}, [0, 1, 4, 5]],
],
ids=[
"list",
"negative",
"range",
"set",
],
)
def test_list(slice_test_df, slice_test_grouped, arg, expected_rows):
# Test lists of integers and integer valued iterables
result = slice_test_grouped._positional_selector[arg]
expected = slice_test_df.iloc[expected_rows]
tm.assert_frame_equal(result, expected)
def test_ints(slice_test_df, slice_test_grouped):
# Test tuple of ints
result = slice_test_grouped._positional_selector[0, 2, -1]
expected = slice_test_df.iloc[[0, 1, 3, 4, 5, 7]]
tm.assert_frame_equal(result, expected)
def test_slices(slice_test_df, slice_test_grouped):
# Test tuple of slices
result = slice_test_grouped._positional_selector[:2, -2:]
expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]
tm.assert_frame_equal(result, expected)
def test_mix(slice_test_df, slice_test_grouped):
# Test mixed tuple of ints and slices
result = slice_test_grouped._positional_selector[0, 1, -2:]
expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
"arg, expected_rows",
[
[0, [0, 1, 4]],
[[0, 2, -1], [0, 1, 3, 4, 5, 7]],
[(slice(None, 2), slice(-2, None)), [0, 1, 2, 3, 4, 6, 7]],
],
)
def test_as_index(slice_test_df, arg, expected_rows):
# Test the default as_index behaviour
result = slice_test_df.groupby("Group", sort=False)._positional_selector[arg]
expected = slice_test_df.iloc[expected_rows]
tm.assert_frame_equal(result, expected)
def test_doc_examples():
# Test the examples in the documentation
df = pd.DataFrame(
[["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], columns=["A", "B"]
)
grouped = df.groupby("A", as_index=False)
result = grouped._positional_selector[1:2]
expected = pd.DataFrame([["a", 2], ["b", 5]], columns=["A", "B"], index=[1, 4])
tm.assert_frame_equal(result, expected)
result = grouped._positional_selector[1, -1]
expected = pd.DataFrame(
[["a", 2], ["a", 3], ["b", 5]], columns=["A", "B"], index=[1, 2, 4]
)
tm.assert_frame_equal(result, expected)
@pytest.fixture()
def multiindex_data():
rng = np.random.default_rng(2)
ndates = 100
nitems = 20
dates = pd.date_range("20130101", periods=ndates, freq="D")
items = [f"item {i}" for i in range(nitems)]
data = {}
for date in dates:
nitems_for_date = nitems - rng.integers(0, 12)
levels = [
(item, rng.integers(0, 10000) / 100, rng.integers(0, 10000) / 100)
for item in items[:nitems_for_date]
]
levels.sort(key=lambda x: x[1])
data[date] = levels
return data
def _make_df_from_data(data):
rows = {}
for date in data:
for level in data[date]:
rows[(date, level[0])] = {"A": level[1], "B": level[2]}
df = pd.DataFrame.from_dict(rows, orient="index")
df.index.names = ("Date", "Item")
return df
def test_multiindex(multiindex_data):
# Test the MultiIndex use case mentioned in the documentation
df = _make_df_from_data(multiindex_data)
result = df.groupby("Date", as_index=False).nth(slice(3, -3))
sliced = {date: multiindex_data[date][3:-3] for date in multiindex_data}
expected = _make_df_from_data(sliced)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("arg", [1, 5, 30, 1000, -1, -5, -30, -1000])
@pytest.mark.parametrize("method", ["head", "tail"])
@pytest.mark.parametrize("simulated", [True, False])
def test_against_head_and_tail(arg, method, simulated):
# Test that _positional_selector gives the same results as grouped head and tail
n_groups = 100
n_rows_per_group = 30
data = {
"group": [
f"group {g}" for j in range(n_rows_per_group) for g in range(n_groups)
],
"value": [
f"group {g} row {j}"
for j in range(n_rows_per_group)
for g in range(n_groups)
],
}
df = pd.DataFrame(data)
grouped = df.groupby("group", as_index=False)
size = arg if arg >= 0 else n_rows_per_group + arg
if method == "head":
result = grouped._positional_selector[:arg]
if simulated:
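# rows are laid out group-major: row j of group i sits at position
# j * n_groups + i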
indices = [
j * n_groups + i
for j in range(size)
for i in range(n_groups)
if j * n_groups + i < n_groups * n_rows_per_group
]
expected = df.iloc[indices]
else:
expected = grouped.head(arg)
else:
result = grouped._positional_selector[-arg:]
if simulated:
indices = [
(n_rows_per_group + j - size) * n_groups + i
for j in range(size)
for i in range(n_groups)
if (n_rows_per_group + j - size) * n_groups + i >= 0
]
expected = df.iloc[indices]
else:
expected = grouped.tail(arg)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("start", [None, 0, 1, 10, -1, -10])
@pytest.mark.parametrize("stop", [None, 0, 1, 10, -1, -10])
@pytest.mark.parametrize("step", [None, 1, 5])
def test_against_df_iloc(start, stop, step):
# Test that a single group gives the same results as DataFrame.iloc
n_rows = 30
data = {
"group": ["group 0"] * n_rows,
"value": list(range(n_rows)),
}
df = pd.DataFrame(data)
grouped = df.groupby("group", as_index=False)
result = grouped._positional_selector[start:stop:step]
expected = df.iloc[start:stop:step]
tm.assert_frame_equal(result, expected)
def test_series():
# Test grouped Series
ser = pd.Series([1, 2, 3, 4, 5], index=["a", "a", "a", "b", "b"])
grouped = ser.groupby(level=0)
result = grouped._positional_selector[1:2]
expected = pd.Series([2, 5], index=["a", "b"])
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("step", [1, 2, 3, 4, 5])
def test_step(step):
# Test slice with various step values
data = [["x", f"x{i}"] for i in range(5)]
data += [["y", f"y{i}"] for i in range(4)]
data += [["z", f"z{i}"] for i in range(3)]
df = pd.DataFrame(data, columns=["A", "B"])
grouped = df.groupby("A", as_index=False)
result = grouped._positional_selector[::step]
data = [["x", f"x{i}"] for i in range(0, 5, step)]
data += [["y", f"y{i}"] for i in range(0, 4, step)]
data += [["z", f"z{i}"] for i in range(0, 3, step)]
index = [0 + i for i in range(0, 5, step)]
index += [5 + i for i in range(0, 4, step)]
index += [9 + i for i in range(0, 3, step)]
expected = pd.DataFrame(data, columns=["A", "B"], index=index)
tm.assert_frame_equal(result, expected)
@pytest.fixture()
def column_group_df():
return pd.DataFrame(
[[0, 1, 2, 3, 4, 5, 6], [0, 0, 1, 0, 1, 0, 2]],
columns=["A", "B", "C", "D", "E", "F", "G"],
)
def test_column_axis(column_group_df):
msg = "DataFrame.groupby with axis=1"
with tm.assert_produces_warning(FutureWarning, match=msg):
g = column_group_df.groupby(column_group_df.iloc[1], axis=1)
result = g._positional_selector[1:-1]
expected = column_group_df.iloc[:, [1, 3]]
tm.assert_frame_equal(result, expected)
def test_columns_on_iter():
# GitHub issue #44821
df = pd.DataFrame({k: range(10) for k in "ABC"})
# Group-by and select columns
cols = ["A", "B"]
for _, dg in df.groupby(df.A < 4)[cols]:
tm.assert_index_equal(dg.columns, pd.Index(cols))
assert "C" not in dg.columns
@pytest.mark.parametrize("func", [list, pd.Index, pd.Series, np.array])
def test_groupby_duplicated_columns(func):
# GH#44924
df = pd.DataFrame(
{
"A": [1, 2],
"B": [3, 3],
"C": ["G", "G"],
}
)
result = df.groupby("C")[func(["A", "B", "A"])].mean()
expected = pd.DataFrame(
[[1.5, 3.0, 1.5]], columns=["A", "B", "A"], index=pd.Index(["G"], name="C")
)
tm.assert_frame_equal(result, expected)
def test_groupby_get_nonexisting_groups():
# GH#32492
df = pd.DataFrame(
data={
"A": ["a1", "a2", None],
"B": ["b1", "b2", "b1"],
"val": [1, 2, 3],
}
)
grps = df.groupby(by=["A", "B"])
msg = "('a2', 'b1')"
with pytest.raises(KeyError, match=msg):
grps.get_group(("a2", "b1"))


@@ -0,0 +1,331 @@
import numpy as np
import pytest
from pandas._libs import groupby as libgroupby
from pandas._libs.groupby import (
group_cumprod,
group_cumsum,
group_mean,
group_sum,
group_var,
)
from pandas.core.dtypes.common import ensure_platform_int
from pandas import isna
import pandas._testing as tm
class GroupVarTestMixin:
def test_group_var_generic_1d(self):
prng = np.random.default_rng(2)
out = (np.nan * np.ones((5, 1))).astype(self.dtype)
counts = np.zeros(5, dtype="int64")
values = 10 * prng.random((15, 1)).astype(self.dtype)
labels = np.tile(np.arange(5), (3,)).astype("intp")
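# labels tile 0..4 three times, so a Fortran-order reshape to (5, 3)
# collects each label's three observations into one row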
expected_out = (
np.squeeze(values).reshape((5, 3), order="F").std(axis=1, ddof=1) ** 2
)[:, np.newaxis]
expected_counts = counts + 3
self.algo(out, counts, values, labels)
assert np.allclose(out, expected_out, self.rtol)
tm.assert_numpy_array_equal(counts, expected_counts)
def test_group_var_generic_1d_flat_labels(self):
prng = np.random.default_rng(2)
out = (np.nan * np.ones((1, 1))).astype(self.dtype)
counts = np.zeros(1, dtype="int64")
values = 10 * prng.random((5, 1)).astype(self.dtype)
labels = np.zeros(5, dtype="intp")
expected_out = np.array([[values.std(ddof=1) ** 2]])
expected_counts = counts + 5
self.algo(out, counts, values, labels)
assert np.allclose(out, expected_out, self.rtol)
tm.assert_numpy_array_equal(counts, expected_counts)
def test_group_var_generic_2d_all_finite(self):
prng = np.random.default_rng(2)
out = (np.nan * np.ones((5, 2))).astype(self.dtype)
counts = np.zeros(5, dtype="int64")
values = 10 * prng.random((10, 2)).astype(self.dtype)
labels = np.tile(np.arange(5), (2,)).astype("intp")
expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2
expected_counts = counts + 2
self.algo(out, counts, values, labels)
assert np.allclose(out, expected_out, self.rtol)
tm.assert_numpy_array_equal(counts, expected_counts)
def test_group_var_generic_2d_some_nan(self):
prng = np.random.default_rng(2)
out = (np.nan * np.ones((5, 2))).astype(self.dtype)
counts = np.zeros(5, dtype="int64")
values = 10 * prng.random((10, 2)).astype(self.dtype)
values[:, 1] = np.nan
labels = np.tile(np.arange(5), (2,)).astype("intp")
expected_out = np.vstack(
[
values[:, 0].reshape(5, 2, order="F").std(ddof=1, axis=1) ** 2,
np.nan * np.ones(5),
]
).T.astype(self.dtype)
expected_counts = counts + 2
self.algo(out, counts, values, labels)
tm.assert_almost_equal(out, expected_out, rtol=0.5e-06)
tm.assert_numpy_array_equal(counts, expected_counts)
def test_group_var_constant(self):
# Regression test from GH 10448.
out = np.array([[np.nan]], dtype=self.dtype)
counts = np.array([0], dtype="int64")
values = 0.832845131556193 * np.ones((3, 1), dtype=self.dtype)
labels = np.zeros(3, dtype="intp")
self.algo(out, counts, values, labels)
assert counts[0] == 3
assert out[0, 0] >= 0
tm.assert_almost_equal(out[0, 0], 0.0)
class TestGroupVarFloat64(GroupVarTestMixin):
__test__ = True
algo = staticmethod(group_var)
dtype = np.float64
rtol = 1e-5
def test_group_var_large_inputs(self):
prng = np.random.default_rng(2)
out = np.array([[np.nan]], dtype=self.dtype)
counts = np.array([0], dtype="int64")
values = (prng.random(10**6) + 10**12).astype(self.dtype)
values.shape = (10**6, 1)
labels = np.zeros(10**6, dtype="intp")
self.algo(out, counts, values, labels)
assert counts[0] == 10**6
tm.assert_almost_equal(out[0, 0], 1.0 / 12, rtol=0.5e-3)
class TestGroupVarFloat32(GroupVarTestMixin):
__test__ = True
algo = staticmethod(group_var)
dtype = np.float32
rtol = 1e-2
@pytest.mark.parametrize("dtype", ["float32", "float64"])
def test_group_ohlc(dtype):
obj = np.array(np.random.default_rng(2).standard_normal(20), dtype=dtype)
bins = np.array([6, 12, 20])
out = np.zeros((3, 4), dtype)
counts = np.zeros(len(out), dtype=np.int64)
labels = ensure_platform_int(np.repeat(np.arange(3), np.diff(np.r_[0, bins])))
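# np.diff(np.r_[0, bins]) gives the group sizes [6, 6, 8]; each label is
# repeated that many times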
func = libgroupby.group_ohlc
func(out, counts, obj[:, None], labels)
def _ohlc(group):
if isna(group).all():
return np.repeat(np.nan, 4)
return [group[0], group.max(), group.min(), group[-1]]
expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), _ohlc(obj[12:])])
tm.assert_almost_equal(out, expected)
tm.assert_numpy_array_equal(counts, np.array([6, 6, 8], dtype=np.int64))
obj[:6] = np.nan
func(out, counts, obj[:, None], labels)
expected[0] = np.nan
tm.assert_almost_equal(out, expected)
def _check_cython_group_transform_cumulative(pd_op, np_op, dtype):
"""
Check a group transform that executes a cumulative function.
Parameters
----------
pd_op : callable
The pandas cumulative function.
np_op : callable
The analogous one in NumPy.
dtype : type
The specified dtype of the data.
"""
is_datetimelike = False
data = np.array([[1], [2], [3], [4]], dtype=dtype)
answer = np.zeros_like(data)
labels = np.array([0, 0, 0, 0], dtype=np.intp)
ngroups = 1
pd_op(answer, data, labels, ngroups, is_datetimelike)
tm.assert_numpy_array_equal(np_op(data), answer[:, 0], check_dtype=False)
@pytest.mark.parametrize("np_dtype", ["int64", "uint64", "float32", "float64"])
def test_cython_group_transform_cumsum(np_dtype):
# see gh-4095
dtype = np.dtype(np_dtype).type
pd_op, np_op = group_cumsum, np.cumsum
_check_cython_group_transform_cumulative(pd_op, np_op, dtype)
def test_cython_group_transform_cumprod():
# see gh-4095
dtype = np.float64
pd_op, np_op = group_cumprod, np.cumprod
_check_cython_group_transform_cumulative(pd_op, np_op, dtype)
def test_cython_group_transform_algos():
# see gh-4095
is_datetimelike = False
# with nans
labels = np.array([0, 0, 0, 0, 0], dtype=np.intp)
ngroups = 1
data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64")
actual = np.zeros_like(data)
actual.fill(np.nan)
group_cumprod(actual, data, labels, ngroups, is_datetimelike)
expected = np.array([1, 2, 6, np.nan, 24], dtype="float64")
tm.assert_numpy_array_equal(actual[:, 0], expected)
actual = np.zeros_like(data)
actual.fill(np.nan)
group_cumsum(actual, data, labels, ngroups, is_datetimelike)
expected = np.array([1, 3, 6, np.nan, 10], dtype="float64")
tm.assert_numpy_array_equal(actual[:, 0], expected)
# timedelta
is_datetimelike = True
data = np.array([np.timedelta64(1, "ns")] * 5, dtype="m8[ns]")[:, None]
actual = np.zeros_like(data, dtype="int64")
group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike)
expected = np.array(
[
np.timedelta64(1, "ns"),
np.timedelta64(2, "ns"),
np.timedelta64(3, "ns"),
np.timedelta64(4, "ns"),
np.timedelta64(5, "ns"),
]
)
tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected)
def test_cython_group_mean_datetimelike():
actual = np.zeros(shape=(1, 1), dtype="float64")
counts = np.array([0], dtype="int64")
data = (
np.array(
[np.timedelta64(2, "ns"), np.timedelta64(4, "ns"), np.timedelta64("NaT")],
dtype="m8[ns]",
)[:, None]
.view("int64")
.astype("float64")
)
labels = np.zeros(len(data), dtype=np.intp)
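# NaT is stored as iNaT and skipped when is_datetimelike=True, so the mean
# of 2ns and 4ns is 3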
group_mean(actual, counts, data, labels, is_datetimelike=True)
tm.assert_numpy_array_equal(actual[:, 0], np.array([3], dtype="float64"))
def test_cython_group_mean_wrong_min_count():
actual = np.zeros(shape=(1, 1), dtype="float64")
counts = np.zeros(1, dtype="int64")
data = np.zeros(1, dtype="float64")[:, None]
labels = np.zeros(1, dtype=np.intp)
with pytest.raises(AssertionError, match="min_count"):
group_mean(actual, counts, data, labels, is_datetimelike=True, min_count=0)
def test_cython_group_mean_not_datetimelike_but_has_NaT_values():
actual = np.zeros(shape=(1, 1), dtype="float64")
counts = np.array([0], dtype="int64")
data = (
np.array(
[np.timedelta64("NaT"), np.timedelta64("NaT")],
dtype="m8[ns]",
)[:, None]
.view("int64")
.astype("float64")
)
labels = np.zeros(len(data), dtype=np.intp)
group_mean(actual, counts, data, labels, is_datetimelike=False)
tm.assert_numpy_array_equal(
actual[:, 0], np.array(np.divide(np.add(data[0], data[1]), 2), dtype="float64")
)
def test_cython_group_mean_Inf_at_beginning_and_end():
# GH 50367
actual = np.array([[np.nan, np.nan], [np.nan, np.nan]], dtype="float64")
counts = np.array([0, 0], dtype="int64")
data = np.array(
[[np.inf, 1.0], [1.0, 2.0], [2.0, 3.0], [3.0, 4.0], [4.0, 5.0], [5, np.inf]],
dtype="float64",
)
labels = np.array([0, 1, 0, 1, 0, 1], dtype=np.intp)
group_mean(actual, counts, data, labels, is_datetimelike=False)
expected = np.array([[np.inf, 3], [3, np.inf]], dtype="float64")
tm.assert_numpy_array_equal(
actual,
expected,
)
@pytest.mark.parametrize(
"values, out",
[
([[np.inf], [np.inf], [np.inf]], [[np.inf], [np.inf]]),
([[np.inf], [np.inf], [-np.inf]], [[np.inf], [np.nan]]),
([[np.inf], [-np.inf], [np.inf]], [[np.inf], [np.nan]]),
([[np.inf], [-np.inf], [-np.inf]], [[np.inf], [-np.inf]]),
],
)
def test_cython_group_sum_Inf_at_beginning_and_end(values, out):
# GH #53606
actual = np.array([[np.nan], [np.nan]], dtype="float64")
counts = np.array([0, 0], dtype="int64")
data = np.array(values, dtype="float64")
labels = np.array([0, 1, 1], dtype=np.intp)
group_sum(actual, counts, data, labels, None, is_datetimelike=False)
expected = np.array(out, dtype="float64")
tm.assert_numpy_array_equal(
actual,
expected,
)
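# Illustrative sketch mirroring the parametrization above at Series level:
# a group whose values include both +inf and -inf sums to NaN, while a
# group containing only +inf sums to +inf.
def _example_groupby_sum_mixed_inf():
    import pandas as pd  # local import: this file's header is not shown here

    ser = pd.Series([np.inf, np.inf, -np.inf], index=[0, 1, 1])
    result = ser.groupby(level=0).sum()
    assert result[0] == np.inf
    assert np.isnan(result[1])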

View File

@ -0,0 +1,163 @@
import numpy as np
import pytest
import pandas as pd
from pandas import (
DataFrame,
Index,
date_range,
)
import pandas._testing as tm
@pytest.mark.parametrize("func", ["ffill", "bfill"])
def test_groupby_column_index_name_lost_fill_funcs(func):
# GH 29764: groupby sometimes loses the column index name
df = DataFrame(
[[1, 1.0, -1.0], [1, np.nan, np.nan], [1, 2.0, -2.0]],
columns=Index(["type", "a", "b"], name="idx"),
)
df_grouped = df.groupby(["type"])[["a", "b"]]
result = getattr(df_grouped, func)().columns
expected = Index(["a", "b"], name="idx")
tm.assert_index_equal(result, expected)
@pytest.mark.parametrize("func", ["ffill", "bfill"])
def test_groupby_fill_duplicate_column_names(func):
# GH: 25610 ValueError with duplicate column names
df1 = DataFrame({"field1": [1, 3, 4], "field2": [1, 3, 4]})
df2 = DataFrame({"field1": [1, np.nan, 4]})
df_grouped = pd.concat([df1, df2], axis=1).groupby(by=["field2"])
expected = DataFrame(
[[1, 1.0], [3, np.nan], [4, 4.0]], columns=["field1", "field1"]
)
result = getattr(df_grouped, func)()
tm.assert_frame_equal(result, expected)
def test_ffill_missing_arguments():
# GH 14955
df = DataFrame({"a": [1, 2], "b": [1, 1]})
msg = "DataFrameGroupBy.fillna is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
with pytest.raises(ValueError, match="Must specify a fill"):
df.groupby("b").fillna()
@pytest.mark.parametrize(
"method, expected", [("ffill", [None, "a", "a"]), ("bfill", ["a", "a", None])]
)
def test_fillna_with_string_dtype(method, expected):
# GH 40250
df = DataFrame({"a": pd.array([None, "a", None], dtype="string"), "b": [0, 0, 0]})
grp = df.groupby("b")
msg = "DataFrameGroupBy.fillna is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = grp.fillna(method=method)
expected = DataFrame({"a": pd.array(expected, dtype="string")})
tm.assert_frame_equal(result, expected)
def test_fill_consistency():
# GH9221
# pass-through keyword arguments to the generated wrapper
# are set only if the passed kw is None
df = DataFrame(
index=pd.MultiIndex.from_product(
[["value1", "value2"], date_range("2014-01-01", "2014-01-06")]
),
columns=Index(["1", "2"], name="id"),
)
df["1"] = [
np.nan,
1,
np.nan,
np.nan,
11,
np.nan,
np.nan,
2,
np.nan,
np.nan,
22,
np.nan,
]
df["2"] = [
np.nan,
3,
np.nan,
np.nan,
33,
np.nan,
np.nan,
4,
np.nan,
np.nan,
44,
np.nan,
]
msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
expected = df.groupby(level=0, axis=0).fillna(method="ffill")
msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.T.groupby(level=0, axis=1).fillna(method="ffill").T
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("method", ["ffill", "bfill"])
@pytest.mark.parametrize("dropna", [True, False])
@pytest.mark.parametrize("has_nan_group", [True, False])
def test_ffill_handles_nan_groups(dropna, method, has_nan_group):
# GH 34725
df_without_nan_rows = DataFrame([(1, 0.1), (2, 0.2)])
ridx = [-1, 0, -1, -1, 1, -1]
df = df_without_nan_rows.reindex(ridx).reset_index(drop=True)
group_b = np.nan if has_nan_group else "b"
df["group_col"] = pd.Series(["a"] * 3 + [group_b] * 3)
grouped = df.groupby(by="group_col", dropna=dropna)
result = getattr(grouped, method)(limit=None)
expected_rows = {
("ffill", True, True): [-1, 0, 0, -1, -1, -1],
("ffill", True, False): [-1, 0, 0, -1, 1, 1],
("ffill", False, True): [-1, 0, 0, -1, 1, 1],
("ffill", False, False): [-1, 0, 0, -1, 1, 1],
("bfill", True, True): [0, 0, -1, -1, -1, -1],
("bfill", True, False): [0, 0, -1, 1, 1, -1],
("bfill", False, True): [0, 0, -1, 1, 1, -1],
("bfill", False, False): [0, 0, -1, 1, 1, -1],
}
ridx = expected_rows.get((method, dropna, has_nan_group))
expected = df_without_nan_rows.reindex(ridx).reset_index(drop=True)
# columns are a 'take' on df.columns, which are object dtype
expected.columns = expected.columns.astype(object)
tm.assert_frame_equal(result, expected)
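# Illustrative sketch distilling the table above: with dropna=True, rows whose
# group key is NaN are excluded from the operation and come back all-NaN; with
# dropna=False the NaN key forms its own group and is filled normally.
def _example_ffill_nan_group_passthrough():
    df = DataFrame({"k": ["a", "a", np.nan, np.nan], "v": [1.0, np.nan, 1.0, np.nan]})
    filled = df.groupby("k", dropna=True).ffill()
    assert np.isnan(filled["v"].iloc[3])
    filled = df.groupby("k", dropna=False).ffill()
    assert filled["v"].iloc[3] == 1.0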
@pytest.mark.parametrize("min_count, value", [(2, np.nan), (-1, 1.0)])
@pytest.mark.parametrize("func", ["first", "last", "max", "min"])
def test_min_count(func, min_count, value):
# GH#37821
df = DataFrame({"a": [1] * 3, "b": [1, np.nan, np.nan], "c": [np.nan] * 3})
result = getattr(df.groupby("a"), func)(min_count=min_count)
expected = DataFrame({"b": [value], "c": [np.nan]}, index=Index([1], name="a"))
tm.assert_frame_equal(result, expected)
def test_indices_with_missing():
# GH 9304
df = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]})
g = df.groupby(["a", "b"])
result = g.indices
expected = {(1.0, 2): np.array([0]), (1.0, 3): np.array([1])}
assert result == expected
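# Illustrative sketch: .indices drops NaN keys under the default dropna=True;
# pass dropna=False to keep a NaN group.
def _example_indices_dropna_false():
    df = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4]})
    assert len(df.groupby("a").indices) == 1
    assert len(df.groupby("a", dropna=False).indices) == 2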

View File

@ -0,0 +1,80 @@
import pytest
from pandas import (
DataFrame,
Series,
option_context,
)
import pandas._testing as tm
pytestmark = pytest.mark.single_cpu
pytest.importorskip("numba")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.filterwarnings("ignore")
class TestEngine:
def test_cython_vs_numba_frame(
self, sort, nogil, parallel, nopython, numba_supported_reductions
):
func, kwargs = numba_supported_reductions
df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
gb = df.groupby("a", sort=sort)
result = getattr(gb, func)(
engine="numba", engine_kwargs=engine_kwargs, **kwargs
)
expected = getattr(gb, func)(**kwargs)
tm.assert_frame_equal(result, expected)
def test_cython_vs_numba_getitem(
self, sort, nogil, parallel, nopython, numba_supported_reductions
):
func, kwargs = numba_supported_reductions
df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
gb = df.groupby("a", sort=sort)["c"]
result = getattr(gb, func)(
engine="numba", engine_kwargs=engine_kwargs, **kwargs
)
expected = getattr(gb, func)(**kwargs)
tm.assert_series_equal(result, expected)
def test_cython_vs_numba_series(
self, sort, nogil, parallel, nopython, numba_supported_reductions
):
func, kwargs = numba_supported_reductions
ser = Series(range(3), index=[1, 2, 1], name="foo")
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
gb = ser.groupby(level=0, sort=sort)
result = getattr(gb, func)(
engine="numba", engine_kwargs=engine_kwargs, **kwargs
)
expected = getattr(gb, func)(**kwargs)
tm.assert_series_equal(result, expected)
def test_as_index_false_unsupported(self, numba_supported_reductions):
func, kwargs = numba_supported_reductions
df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
gb = df.groupby("a", as_index=False)
with pytest.raises(NotImplementedError, match="as_index=False"):
getattr(gb, func)(engine="numba", **kwargs)
def test_axis_1_unsupported(self, numba_supported_reductions):
func, kwargs = numba_supported_reductions
df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
gb = df.groupby("a", axis=1)
with pytest.raises(NotImplementedError, match="axis=1"):
getattr(gb, func)(engine="numba", **kwargs)
def test_no_engine_doesnt_raise(self):
# GH55520
df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
gb = df.groupby("a")
# Make sure functions called without an engine argument don't raise
# when the global use_numba option is set
with option_context("compute.use_numba", True):
res = gb.agg({"b": "first"})
expected = gb.agg({"b": "first"})
tm.assert_frame_equal(res, expected)
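    def _example_numba_engine_usage(self):
        # Illustrative sketch (an added, assumption-labeled helper, not an
        # original test): supported reductions accept engine="numba" plus
        # engine_kwargs and must match the default Cython engine.
        df = DataFrame({"a": [1, 1, 2], "b": [1.0, 2.0, 3.0]})
        gb = df.groupby("a")
        result = gb.mean(engine="numba", engine_kwargs={"nopython": True})
        tm.assert_frame_equal(result, gb.mean())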

View File

@ -0,0 +1,521 @@
import re
import numpy as np
import pytest
from pandas._libs import lib
import pandas as pd
from pandas import (
DataFrame,
Index,
Series,
Timestamp,
date_range,
)
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args
class TestNumericOnly:
# make sure that we are passing thru kwargs to our agg functions
@pytest.fixture
def df(self):
# GH3668
# GH5724
df = DataFrame(
{
"group": [1, 1, 2],
"int": [1, 2, 3],
"float": [4.0, 5.0, 6.0],
"string": list("abc"),
"category_string": Series(list("abc")).astype("category"),
"category_int": [7, 8, 9],
"datetime": date_range("20130101", periods=3),
"datetimetz": date_range("20130101", periods=3, tz="US/Eastern"),
"timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
},
columns=[
"group",
"int",
"float",
"string",
"category_string",
"category_int",
"datetime",
"datetimetz",
"timedelta",
],
)
return df
@pytest.mark.parametrize("method", ["mean", "median"])
def test_averages(self, df, method):
# mean / median
expected_columns_numeric = Index(["int", "float", "category_int"])
gb = df.groupby("group")
expected = DataFrame(
{
"category_int": [7.5, 9],
"float": [4.5, 6.0],
"timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")],
"int": [1.5, 3],
"datetime": [
Timestamp("2013-01-01 12:00:00"),
Timestamp("2013-01-03 00:00:00"),
],
"datetimetz": [
Timestamp("2013-01-01 12:00:00", tz="US/Eastern"),
Timestamp("2013-01-03 00:00:00", tz="US/Eastern"),
],
},
index=Index([1, 2], name="group"),
columns=[
"int",
"float",
"category_int",
],
)
result = getattr(gb, method)(numeric_only=True)
tm.assert_frame_equal(result.reindex_like(expected), expected)
expected_columns = expected.columns
self._check(df, method, expected_columns, expected_columns_numeric)
@pytest.mark.parametrize("method", ["min", "max"])
def test_extrema(self, df, method):
# TODO: min, max *should* handle
# categorical (ordered) dtype
expected_columns = Index(
[
"int",
"float",
"string",
"category_int",
"datetime",
"datetimetz",
"timedelta",
]
)
expected_columns_numeric = expected_columns
self._check(df, method, expected_columns, expected_columns_numeric)
@pytest.mark.parametrize("method", ["first", "last"])
def test_first_last(self, df, method):
expected_columns = Index(
[
"int",
"float",
"string",
"category_string",
"category_int",
"datetime",
"datetimetz",
"timedelta",
]
)
expected_columns_numeric = expected_columns
self._check(df, method, expected_columns, expected_columns_numeric)
@pytest.mark.parametrize("method", ["sum", "cumsum"])
def test_sum_cumsum(self, df, method):
expected_columns_numeric = Index(["int", "float", "category_int"])
expected_columns = Index(
["int", "float", "string", "category_int", "timedelta"]
)
if method == "cumsum":
# cumsum loses string
expected_columns = Index(["int", "float", "category_int", "timedelta"])
self._check(df, method, expected_columns, expected_columns_numeric)
@pytest.mark.parametrize("method", ["prod", "cumprod"])
def test_prod_cumprod(self, df, method):
expected_columns = Index(["int", "float", "category_int"])
expected_columns_numeric = expected_columns
self._check(df, method, expected_columns, expected_columns_numeric)
@pytest.mark.parametrize("method", ["cummin", "cummax"])
def test_cummin_cummax(self, df, method):
# like min, max, but don't include strings
expected_columns = Index(
["int", "float", "category_int", "datetime", "datetimetz", "timedelta"]
)
# GH#15561: numeric_only=False set by default like min/max
expected_columns_numeric = expected_columns
self._check(df, method, expected_columns, expected_columns_numeric)
def _check(self, df, method, expected_columns, expected_columns_numeric):
gb = df.groupby("group")
# object dtypes for transformations are not implemented in Cython and
# have no Python fallback
exception = NotImplementedError if method.startswith("cum") else TypeError
if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"):
# The methods default to numeric_only=False and raise TypeError
msg = "|".join(
[
"Categorical is not ordered",
f"Cannot perform {method} with non-ordered Categorical",
re.escape(f"agg function failed [how->{method},dtype->object]"),
# cumsum/cummin/cummax/cumprod
"function is not implemented for this dtype",
]
)
with pytest.raises(exception, match=msg):
getattr(gb, method)()
elif method in ("sum", "mean", "median", "prod"):
msg = "|".join(
[
"category type does not support sum operations",
re.escape(f"agg function failed [how->{method},dtype->object]"),
re.escape(f"agg function failed [how->{method},dtype->string]"),
]
)
with pytest.raises(exception, match=msg):
getattr(gb, method)()
else:
result = getattr(gb, method)()
tm.assert_index_equal(result.columns, expected_columns_numeric)
if method not in ("first", "last"):
msg = "|".join(
[
"Categorical is not ordered",
"category type does not support",
"function is not implemented for this dtype",
f"Cannot perform {method} with non-ordered Categorical",
re.escape(f"agg function failed [how->{method},dtype->object]"),
re.escape(f"agg function failed [how->{method},dtype->string]"),
]
)
with pytest.raises(exception, match=msg):
getattr(gb, method)(numeric_only=False)
else:
result = getattr(gb, method)(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)
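# Illustrative sketch of the contract exercised by TestNumericOnly._check:
# numeric_only=True restricts the result to numeric columns, while
# numeric_only=False either operates on everything or raises for dtypes the
# aggregation cannot handle.
def _example_numeric_only_drops_columns():
    df = DataFrame({"g": [1, 1, 2], "x": [1.0, 2.0, 3.0], "s": list("abc")})
    result = df.groupby("g").sum(numeric_only=True)
    assert list(result.columns) == ["x"]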
@pytest.mark.parametrize("numeric_only", [True, False, None])
def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_string):
if groupby_func in ("idxmax", "idxmin"):
pytest.skip("idxmax and idxmin tested in test_idxmin_idxmax_axis1")
if groupby_func in ("corrwith", "skew"):
msg = "GH#47723 groupby.corrwith and skew do not correctly implement axis=1"
request.applymarker(pytest.mark.xfail(reason=msg))
df = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)), columns=["A", "B", "C", "D"]
)
df["E"] = "x"
groups = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4]
gb = df.groupby(groups)
method = getattr(gb, groupby_func)
args = get_groupby_method_args(groupby_func, df)
kwargs = {"axis": 1}
if numeric_only is not None:
# when numeric_only is None we don't pass any argument
kwargs["numeric_only"] = numeric_only
# Functions without numeric_only and axis args
no_args = ("cumprod", "cumsum", "diff", "fillna", "pct_change", "rank", "shift")
# Functions with axis args
has_axis = (
"cumprod",
"cumsum",
"diff",
"pct_change",
"rank",
"shift",
"cummax",
"cummin",
"idxmin",
"idxmax",
"fillna",
)
warn_msg = f"DataFrameGroupBy.{groupby_func} with axis=1 is deprecated"
if numeric_only is not None and groupby_func in no_args:
msg = "got an unexpected keyword argument 'numeric_only'"
if groupby_func in ["cumprod", "cumsum"]:
with pytest.raises(TypeError, match=msg):
with tm.assert_produces_warning(FutureWarning, match=warn_msg):
method(*args, **kwargs)
else:
with pytest.raises(TypeError, match=msg):
method(*args, **kwargs)
elif groupby_func not in has_axis:
msg = "got an unexpected keyword argument 'axis'"
with pytest.raises(TypeError, match=msg):
method(*args, **kwargs)
# fillna and shift are successful even on object dtypes
elif (numeric_only is None or not numeric_only) and groupby_func not in (
"fillna",
"shift",
):
msgs = (
# cummax, cummin, rank
"not supported between instances of",
# cumprod
"can't multiply sequence by non-int of type 'float'",
# cumsum, diff, pct_change
"unsupported operand type",
"has no kernel",
)
if using_infer_string:
import pyarrow as pa
errs = (TypeError, pa.lib.ArrowNotImplementedError)
else:
errs = TypeError
with pytest.raises(errs, match=f"({'|'.join(msgs)})"):
with tm.assert_produces_warning(FutureWarning, match=warn_msg):
method(*args, **kwargs)
else:
with tm.assert_produces_warning(FutureWarning, match=warn_msg):
result = method(*args, **kwargs)
df_expected = df.drop(columns="E").T if numeric_only else df.T
expected = getattr(df_expected, groupby_func)(*args).T
if groupby_func == "shift" and not numeric_only:
# shift with axis=1 leaves the leftmost column as numeric
# but transposing for expected gives us object dtype
expected = expected.astype(float)
tm.assert_equal(result, expected)
@pytest.mark.parametrize(
"kernel, has_arg",
[
("all", False),
("any", False),
("bfill", False),
("corr", True),
("corrwith", True),
("cov", True),
("cummax", True),
("cummin", True),
("cumprod", True),
("cumsum", True),
("diff", False),
("ffill", False),
("fillna", False),
("first", True),
("idxmax", True),
("idxmin", True),
("last", True),
("max", True),
("mean", True),
("median", True),
("min", True),
("nth", False),
("nunique", False),
("pct_change", False),
("prod", True),
("quantile", True),
("sem", True),
("skew", True),
("std", True),
("sum", True),
("var", True),
],
)
@pytest.mark.parametrize("numeric_only", [True, False, lib.no_default])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_numeric_only(kernel, has_arg, numeric_only, keys):
# GH#46072
# has_arg: Whether the op has a numeric_only arg
df = DataFrame({"a1": [1, 1], "a2": [2, 2], "a3": [5, 6], "b": 2 * [object]})
args = get_groupby_method_args(kernel, df)
kwargs = {} if numeric_only is lib.no_default else {"numeric_only": numeric_only}
gb = df.groupby(keys)
method = getattr(gb, kernel)
if has_arg and numeric_only is True:
# Cases where b does not appear in the result
result = method(*args, **kwargs)
assert "b" not in result.columns
elif (
# kernels that work on any dtype and have numeric_only arg
kernel in ("first", "last")
or (
# kernels that work on any dtype and don't have numeric_only arg
kernel in ("any", "all", "bfill", "ffill", "fillna", "nth", "nunique")
and numeric_only is lib.no_default
)
):
warn = FutureWarning if kernel == "fillna" else None
msg = "DataFrameGroupBy.fillna is deprecated"
with tm.assert_produces_warning(warn, match=msg):
result = method(*args, **kwargs)
assert "b" in result.columns
elif has_arg:
assert numeric_only is not True
# kernels that are successful on any dtype were above; this will fail
# object dtypes for transformations are not implemented in Cython and
# have no Python fallback
exception = NotImplementedError if kernel.startswith("cum") else TypeError
msg = "|".join(
[
"not allowed for this dtype",
"cannot be performed against 'object' dtypes",
# On PY39 message is "a number"; on PY310 and after is "a real number"
"must be a string or a.* number",
"unsupported operand type",
"function is not implemented for this dtype",
re.escape(f"agg function failed [how->{kernel},dtype->object]"),
]
)
if kernel == "idxmin":
msg = "'<' not supported between instances of 'type' and 'type'"
elif kernel == "idxmax":
msg = "'>' not supported between instances of 'type' and 'type'"
with pytest.raises(exception, match=msg):
method(*args, **kwargs)
elif not has_arg and numeric_only is not lib.no_default:
with pytest.raises(
TypeError, match="got an unexpected keyword argument 'numeric_only'"
):
method(*args, **kwargs)
else:
assert kernel in ("diff", "pct_change")
assert numeric_only is lib.no_default
# Doesn't have numeric_only argument and fails on nuisance columns
with pytest.raises(TypeError, match=r"unsupported operand type"):
method(*args, **kwargs)
@pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning")
@pytest.mark.parametrize("dtype", [bool, int, float, object])
def test_deprecate_numeric_only_series(dtype, groupby_func, request):
# GH#46560
grouper = [0, 0, 1]
ser = Series([1, 0, 0], dtype=dtype)
gb = ser.groupby(grouper)
if groupby_func == "corrwith":
# corrwith is not implemented on SeriesGroupBy
assert not hasattr(gb, groupby_func)
return
method = getattr(gb, groupby_func)
expected_ser = Series([1, 0, 0])
expected_gb = expected_ser.groupby(grouper)
expected_method = getattr(expected_gb, groupby_func)
args = get_groupby_method_args(groupby_func, ser)
fails_on_numeric_object = (
"corr",
"cov",
"cummax",
"cummin",
"cumprod",
"cumsum",
"quantile",
)
# ops that give an object result on object input
obj_result = (
"first",
"last",
"nth",
"bfill",
"ffill",
"shift",
"sum",
"diff",
"pct_change",
"var",
"mean",
"median",
"min",
"max",
"prod",
"skew",
)
# Test default behavior; kernels that fail may be enabled in the future but kernels
# that succeed should not be allowed to fail (without deprecation, at least)
if groupby_func in fails_on_numeric_object and dtype is object:
if groupby_func == "quantile":
msg = "cannot be performed against 'object' dtypes"
else:
msg = "is not supported for object dtype"
warn = FutureWarning if groupby_func == "fillna" else None
warn_msg = "DataFrameGroupBy.fillna is deprecated"
with tm.assert_produces_warning(warn, match=warn_msg):
with pytest.raises(TypeError, match=msg):
method(*args)
elif dtype is object:
warn = FutureWarning if groupby_func == "fillna" else None
warn_msg = "SeriesGroupBy.fillna is deprecated"
with tm.assert_produces_warning(warn, match=warn_msg):
result = method(*args)
with tm.assert_produces_warning(warn, match=warn_msg):
expected = expected_method(*args)
if groupby_func in obj_result:
expected = expected.astype(object)
tm.assert_series_equal(result, expected)
has_numeric_only = (
"first",
"last",
"max",
"mean",
"median",
"min",
"prod",
"quantile",
"sem",
"skew",
"std",
"sum",
"var",
"cummax",
"cummin",
"cumprod",
"cumsum",
)
if groupby_func not in has_numeric_only:
msg = "got an unexpected keyword argument 'numeric_only'"
with pytest.raises(TypeError, match=msg):
method(*args, numeric_only=True)
elif dtype is object:
msg = "|".join(
[
"SeriesGroupBy.sem called with numeric_only=True and dtype object",
"Series.skew does not allow numeric_only=True with non-numeric",
"cum(sum|prod|min|max) is not supported for object dtype",
r"Cannot use numeric_only=True with SeriesGroupBy\..* and non-numeric",
]
)
with pytest.raises(TypeError, match=msg):
method(*args, numeric_only=True)
elif dtype == bool and groupby_func == "quantile":
msg = "Allowing bool dtype in SeriesGroupBy.quantile"
with tm.assert_produces_warning(FutureWarning, match=msg):
# GH#51424
result = method(*args, numeric_only=True)
expected = method(*args, numeric_only=False)
tm.assert_series_equal(result, expected)
else:
result = method(*args, numeric_only=True)
expected = method(*args, numeric_only=False)
tm.assert_series_equal(result, expected)

View File

@ -0,0 +1,80 @@
import numpy as np
import pandas as pd
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm
def test_pipe():
# Test the pipe method of DataFrameGroupBy.
# Issue #17871
random_state = np.random.default_rng(2)
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": random_state.standard_normal(8),
"C": random_state.standard_normal(8),
}
)
def f(dfgb):
return dfgb.B.max() - dfgb.C.min().min()
def square(srs):
return srs**2
# Note that the transformations are
# GroupBy -> Series
# Series -> Series
# This then chains the GroupBy.pipe and the
# NDFrame.pipe methods
result = df.groupby("A").pipe(f).pipe(square)
index = Index(["bar", "foo"], dtype="object", name="A")
expected = pd.Series([3.749306591013693, 6.717707873081384], name="B", index=index)
tm.assert_series_equal(expected, result)
def test_pipe_args():
# Test passing args to the pipe method of DataFrameGroupBy.
# Issue #17871
df = DataFrame(
{
"group": ["A", "A", "B", "B", "C"],
"x": [1.0, 2.0, 3.0, 2.0, 5.0],
"y": [10.0, 100.0, 1000.0, -100.0, -1000.0],
}
)
def f(dfgb, arg1):
filtered = dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False)
return filtered.groupby("group")
def g(dfgb, arg2):
return dfgb.sum() / dfgb.sum().sum() + arg2
def h(df, arg3):
return df.x + df.y - arg3
result = df.groupby("group").pipe(f, 0).pipe(g, 10).pipe(h, 100)
# Check the chained result against precomputed values
index = Index(["A", "B"], name="group")
expected = pd.Series([-79.5160891089, -78.4839108911], index=index)
tm.assert_series_equal(result, expected)
# test SeriesGroupby.pipe
ser = pd.Series([1, 1, 2, 2, 3, 3])
result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count())
expected = pd.Series([4, 8, 12], index=Index([1, 2, 3], dtype=np.int64))
tm.assert_series_equal(result, expected)
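# Illustrative sketch: .pipe(f, *args) is just f(gb, *args), which is what
# lets the chains above read left to right instead of nesting calls.
def _example_pipe_equivalence():
    df = DataFrame({"g": ["a", "a", "b"], "x": [1.0, 2.0, 3.0]})
    gb = df.groupby("g")
    tm.assert_frame_equal(gb.pipe(lambda grp: grp.sum()), gb.sum())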

View File

@ -0,0 +1,716 @@
# Only tests that raise an error and have no better location should go here.
# Tests for specific groupby methods should go in their respective
# test file.
import datetime
import re
import numpy as np
import pytest
from pandas import (
Categorical,
DataFrame,
Grouper,
Series,
)
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args
@pytest.fixture(
params=[
"a",
["a"],
["a", "b"],
Grouper(key="a"),
lambda x: x % 2,
[0, 0, 0, 1, 2, 2, 2, 3, 3],
np.array([0, 0, 0, 1, 2, 2, 2, 3, 3]),
dict(zip(range(9), [0, 0, 0, 1, 2, 2, 2, 3, 3])),
Series([1, 1, 1, 1, 1, 2, 2, 2, 2]),
[Series([1, 1, 1, 1, 1, 2, 2, 2, 2]), Series([3, 3, 4, 4, 4, 4, 4, 3, 3])],
]
)
def by(request):
return request.param
@pytest.fixture(params=[True, False])
def groupby_series(request):
return request.param
@pytest.fixture
def df_with_string_col():
df = DataFrame(
{
"a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
"b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
"c": range(9),
"d": list("xyzwtyuio"),
}
)
return df
@pytest.fixture
def df_with_datetime_col():
df = DataFrame(
{
"a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
"b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
"c": range(9),
"d": datetime.datetime(2005, 1, 1, 10, 30, 23, 540000),
}
)
return df
@pytest.fixture
def df_with_timedelta_col():
df = DataFrame(
{
"a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
"b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
"c": range(9),
"d": datetime.timedelta(days=1),
}
)
return df
@pytest.fixture
def df_with_cat_col():
df = DataFrame(
{
"a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
"b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
"c": range(9),
"d": Categorical(
["a", "a", "a", "a", "b", "b", "b", "b", "c"],
categories=["a", "b", "c", "d"],
ordered=True,
),
}
)
return df
def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""):
warn_klass = None if warn_msg == "" else FutureWarning
with tm.assert_produces_warning(warn_klass, match=warn_msg):
if klass is None:
if how == "method":
getattr(gb, groupby_func)(*args)
elif how == "agg":
gb.agg(groupby_func, *args)
else:
gb.transform(groupby_func, *args)
else:
with pytest.raises(klass, match=msg):
if how == "method":
getattr(gb, groupby_func)(*args)
elif how == "agg":
gb.agg(groupby_func, *args)
else:
gb.transform(groupby_func, *args)
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
def test_groupby_raises_string(
how, by, groupby_series, groupby_func, df_with_string_col
):
df = df_with_string_col
args = get_groupby_method_args(groupby_func, df)
gb = df.groupby(by=by)
if groupby_series:
gb = gb["d"]
if groupby_func == "corrwith":
assert not hasattr(gb, "corrwith")
return
klass, msg = {
"all": (None, ""),
"any": (None, ""),
"bfill": (None, ""),
"corrwith": (TypeError, "Could not convert"),
"count": (None, ""),
"cumcount": (None, ""),
"cummax": (
(NotImplementedError, TypeError),
"(function|cummax) is not (implemented|supported) for (this|object) dtype",
),
"cummin": (
(NotImplementedError, TypeError),
"(function|cummin) is not (implemented|supported) for (this|object) dtype",
),
"cumprod": (
(NotImplementedError, TypeError),
"(function|cumprod) is not (implemented|supported) for (this|object) dtype",
),
"cumsum": (
(NotImplementedError, TypeError),
"(function|cumsum) is not (implemented|supported) for (this|object) dtype",
),
"diff": (TypeError, "unsupported operand type"),
"ffill": (None, ""),
"fillna": (None, ""),
"first": (None, ""),
"idxmax": (None, ""),
"idxmin": (None, ""),
"last": (None, ""),
"max": (None, ""),
"mean": (
TypeError,
re.escape("agg function failed [how->mean,dtype->object]"),
),
"median": (
TypeError,
re.escape("agg function failed [how->median,dtype->object]"),
),
"min": (None, ""),
"ngroup": (None, ""),
"nunique": (None, ""),
"pct_change": (TypeError, "unsupported operand type"),
"prod": (
TypeError,
re.escape("agg function failed [how->prod,dtype->object]"),
),
"quantile": (TypeError, "cannot be performed against 'object' dtypes!"),
"rank": (None, ""),
"sem": (ValueError, "could not convert string to float"),
"shift": (None, ""),
"size": (None, ""),
"skew": (ValueError, "could not convert string to float"),
"std": (ValueError, "could not convert string to float"),
"sum": (None, ""),
"var": (
TypeError,
re.escape("agg function failed [how->var,dtype->"),
),
}[groupby_func]
if groupby_func == "fillna":
kind = "Series" if groupby_series else "DataFrame"
warn_msg = f"{kind}GroupBy.fillna is deprecated"
else:
warn_msg = ""
_call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg)
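# Illustrative sketch of one row in the mapping above: mean on an
# object-dtype column raises instead of silently dropping the column.
def _example_string_mean_raises():
    df = DataFrame({"a": [1, 1, 2], "d": list("xyz")})
    with pytest.raises(TypeError, match="agg function failed"):
        df.groupby("a")["d"].mean()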
@pytest.mark.parametrize("how", ["agg", "transform"])
def test_groupby_raises_string_udf(how, by, groupby_series, df_with_string_col):
df = df_with_string_col
gb = df.groupby(by=by)
if groupby_series:
gb = gb["d"]
def func(x):
raise TypeError("Test error message")
with pytest.raises(TypeError, match="Test error message"):
getattr(gb, how)(func)
@pytest.mark.parametrize("how", ["agg", "transform"])
@pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
def test_groupby_raises_string_np(
how, by, groupby_series, groupby_func_np, df_with_string_col
):
# GH#50749
df = df_with_string_col
gb = df.groupby(by=by)
if groupby_series:
gb = gb["d"]
klass, msg = {
np.sum: (None, ""),
np.mean: (
TypeError,
re.escape("agg function failed [how->mean,dtype->object]"),
),
}[groupby_func_np]
if groupby_series:
warn_msg = "using SeriesGroupBy.[sum|mean]"
else:
warn_msg = "using DataFrameGroupBy.[sum|mean]"
_call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg)
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
def test_groupby_raises_datetime(
how, by, groupby_series, groupby_func, df_with_datetime_col
):
df = df_with_datetime_col
args = get_groupby_method_args(groupby_func, df)
gb = df.groupby(by=by)
if groupby_series:
gb = gb["d"]
if groupby_func == "corrwith":
assert not hasattr(gb, "corrwith")
return
klass, msg = {
"all": (None, ""),
"any": (None, ""),
"bfill": (None, ""),
"corrwith": (TypeError, "cannot perform __mul__ with this index type"),
"count": (None, ""),
"cumcount": (None, ""),
"cummax": (None, ""),
"cummin": (None, ""),
"cumprod": (TypeError, "datetime64 type does not support cumprod operations"),
"cumsum": (TypeError, "datetime64 type does not support cumsum operations"),
"diff": (None, ""),
"ffill": (None, ""),
"fillna": (None, ""),
"first": (None, ""),
"idxmax": (None, ""),
"idxmin": (None, ""),
"last": (None, ""),
"max": (None, ""),
"mean": (None, ""),
"median": (None, ""),
"min": (None, ""),
"ngroup": (None, ""),
"nunique": (None, ""),
"pct_change": (TypeError, "cannot perform __truediv__ with this index type"),
"prod": (TypeError, "datetime64 type does not support prod"),
"quantile": (None, ""),
"rank": (None, ""),
"sem": (None, ""),
"shift": (None, ""),
"size": (None, ""),
"skew": (
TypeError,
"|".join(
[
r"dtype datetime64\[ns\] does not support reduction",
"datetime64 type does not support skew operations",
]
),
),
"std": (None, ""),
"sum": (TypeError, "datetime64 type does not support sum operations"),
"var": (TypeError, "datetime64 type does not support var operations"),
}[groupby_func]
if groupby_func in ["any", "all"]:
warn_msg = f"'{groupby_func}' with datetime64 dtypes is deprecated"
elif groupby_func == "fillna":
kind = "Series" if groupby_series else "DataFrame"
warn_msg = f"{kind}GroupBy.fillna is deprecated"
else:
warn_msg = ""
_call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=warn_msg)
@pytest.mark.parametrize("how", ["agg", "transform"])
def test_groupby_raises_datetime_udf(how, by, groupby_series, df_with_datetime_col):
df = df_with_datetime_col
gb = df.groupby(by=by)
if groupby_series:
gb = gb["d"]
def func(x):
raise TypeError("Test error message")
with pytest.raises(TypeError, match="Test error message"):
getattr(gb, how)(func)
@pytest.mark.parametrize("how", ["agg", "transform"])
@pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
def test_groupby_raises_datetime_np(
how, by, groupby_series, groupby_func_np, df_with_datetime_col
):
# GH#50749
df = df_with_datetime_col
gb = df.groupby(by=by)
if groupby_series:
gb = gb["d"]
klass, msg = {
np.sum: (TypeError, "datetime64 type does not support sum operations"),
np.mean: (None, ""),
}[groupby_func_np]
if groupby_series:
warn_msg = "using SeriesGroupBy.[sum|mean]"
else:
warn_msg = "using DataFrameGroupBy.[sum|mean]"
_call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg)
@pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "var"])
def test_groupby_raises_timedelta(func, df_with_timedelta_col):
df = df_with_timedelta_col
gb = df.groupby(by="a")
_call_and_check(
TypeError,
"timedelta64 type does not support .* operations",
"method",
gb,
func,
[],
)
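# Illustrative sketch: timedelta columns support sum (and mean), but prod has
# no meaning for them and raises, as parametrized above.
def _example_timedelta_prod_raises():
    df = DataFrame({"a": [1, 1], "d": [datetime.timedelta(days=1)] * 2})
    gb = df.groupby("a")
    gb.sum()  # summing timedeltas is fine
    with pytest.raises(TypeError, match="timedelta64 type does not support prod"):
        gb.prod()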
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
def test_groupby_raises_category(
how, by, groupby_series, groupby_func, using_copy_on_write, df_with_cat_col
):
# GH#50749
df = df_with_cat_col
args = get_groupby_method_args(groupby_func, df)
gb = df.groupby(by=by)
if groupby_series:
gb = gb["d"]
if groupby_func == "corrwith":
assert not hasattr(gb, "corrwith")
return
klass, msg = {
"all": (None, ""),
"any": (None, ""),
"bfill": (None, ""),
"corrwith": (
TypeError,
r"unsupported operand type\(s\) for \*: 'Categorical' and 'int'",
),
"count": (None, ""),
"cumcount": (None, ""),
"cummax": (
(NotImplementedError, TypeError),
"(category type does not support cummax operations|"
"category dtype not supported|"
"cummax is not supported for category dtype)",
),
"cummin": (
(NotImplementedError, TypeError),
"(category type does not support cummin operations|"
"category dtype not supported|"
"cummin is not supported for category dtype)",
),
"cumprod": (
(NotImplementedError, TypeError),
"(category type does not support cumprod operations|"
"category dtype not supported|"
"cumprod is not supported for category dtype)",
),
"cumsum": (
(NotImplementedError, TypeError),
"(category type does not support cumsum operations|"
"category dtype not supported|"
"cumsum is not supported for category dtype)",
),
"diff": (
TypeError,
r"unsupported operand type\(s\) for -: 'Categorical' and 'Categorical'",
),
"ffill": (None, ""),
"fillna": (
TypeError,
r"Cannot setitem on a Categorical with a new category \(0\), "
"set the categories first",
)
if not using_copy_on_write
else (None, ""), # no-op with CoW
"first": (None, ""),
"idxmax": (None, ""),
"idxmin": (None, ""),
"last": (None, ""),
"max": (None, ""),
"mean": (
TypeError,
"|".join(
[
"'Categorical' .* does not support reduction 'mean'",
"category dtype does not support aggregation 'mean'",
]
),
),
"median": (
TypeError,
"|".join(
[
"'Categorical' .* does not support reduction 'median'",
"category dtype does not support aggregation 'median'",
]
),
),
"min": (None, ""),
"ngroup": (None, ""),
"nunique": (None, ""),
"pct_change": (
TypeError,
r"unsupported operand type\(s\) for /: 'Categorical' and 'Categorical'",
),
"prod": (TypeError, "category type does not support prod operations"),
"quantile": (TypeError, "No matching signature found"),
"rank": (None, ""),
"sem": (
TypeError,
"|".join(
[
"'Categorical' .* does not support reduction 'sem'",
"category dtype does not support aggregation 'sem'",
]
),
),
"shift": (None, ""),
"size": (None, ""),
"skew": (
TypeError,
"|".join(
[
"dtype category does not support reduction 'skew'",
"category type does not support skew operations",
]
),
),
"std": (
TypeError,
"|".join(
[
"'Categorical' .* does not support reduction 'std'",
"category dtype does not support aggregation 'std'",
]
),
),
"sum": (TypeError, "category type does not support sum operations"),
"var": (
TypeError,
"|".join(
[
"'Categorical' .* does not support reduction 'var'",
"category dtype does not support aggregation 'var'",
]
),
),
}[groupby_func]
if groupby_func == "fillna":
kind = "Series" if groupby_series else "DataFrame"
warn_msg = f"{kind}GroupBy.fillna is deprecated"
else:
warn_msg = ""
_call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg)
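# Illustrative sketch of two rows in the mapping above: ordered categoricals
# support min/max but reject sum.
def _example_categorical_min_ok_sum_raises():
    df = DataFrame({"a": [1, 1], "d": Categorical(["x", "y"], ordered=True)})
    gb = df.groupby("a")
    gb.min()  # an ordered categorical has a minimum
    with pytest.raises(TypeError, match="category type does not support sum"):
        gb.sum()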
@pytest.mark.parametrize("how", ["agg", "transform"])
def test_groupby_raises_category_udf(how, by, groupby_series, df_with_cat_col):
# GH#50749
df = df_with_cat_col
gb = df.groupby(by=by)
if groupby_series:
gb = gb["d"]
def func(x):
raise TypeError("Test error message")
with pytest.raises(TypeError, match="Test error message"):
getattr(gb, how)(func)
@pytest.mark.parametrize("how", ["agg", "transform"])
@pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
def test_groupby_raises_category_np(
how, by, groupby_series, groupby_func_np, df_with_cat_col
):
# GH#50749
df = df_with_cat_col
gb = df.groupby(by=by)
if groupby_series:
gb = gb["d"]
klass, msg = {
np.sum: (TypeError, "category type does not support sum operations"),
np.mean: (
TypeError,
"category dtype does not support aggregation 'mean'",
),
}[groupby_func_np]
if groupby_series:
warn_msg = "using SeriesGroupBy.[sum|mean]"
else:
warn_msg = "using DataFrameGroupBy.[sum|mean]"
_call_and_check(klass, msg, how, gb, groupby_func_np, (), warn_msg=warn_msg)
@pytest.mark.parametrize("how", ["method", "agg", "transform"])
def test_groupby_raises_category_on_category(
how,
by,
groupby_series,
groupby_func,
observed,
using_copy_on_write,
df_with_cat_col,
):
# GH#50749
df = df_with_cat_col
df["a"] = Categorical(
["a", "a", "a", "a", "b", "b", "b", "b", "c"],
categories=["a", "b", "c", "d"],
ordered=True,
)
args = get_groupby_method_args(groupby_func, df)
gb = df.groupby(by=by, observed=observed)
if groupby_series:
gb = gb["d"]
if groupby_func == "corrwith":
assert not hasattr(gb, "corrwith")
return
empty_groups = not observed and any(group.empty for group in gb.groups.values())
if (
not observed
and how != "transform"
and isinstance(by, list)
and isinstance(by[0], str)
and by == ["a", "b"]
):
assert not empty_groups
# TODO: empty_groups should be true due to unobserved categorical combinations
empty_groups = True
if how == "transform":
# empty groups will be ignored
empty_groups = False
klass, msg = {
"all": (None, ""),
"any": (None, ""),
"bfill": (None, ""),
"corrwith": (
TypeError,
r"unsupported operand type\(s\) for \*: 'Categorical' and 'int'",
),
"count": (None, ""),
"cumcount": (None, ""),
"cummax": (
(NotImplementedError, TypeError),
"(cummax is not supported for category dtype|"
"category dtype not supported|"
"category type does not support cummax operations)",
),
"cummin": (
(NotImplementedError, TypeError),
"(cummin is not supported for category dtype|"
"category dtype not supported|"
"category type does not support cummin operations)",
),
"cumprod": (
(NotImplementedError, TypeError),
"(cumprod is not supported for category dtype|"
"category dtype not supported|"
"category type does not support cumprod operations)",
),
"cumsum": (
(NotImplementedError, TypeError),
"(cumsum is not supported for category dtype|"
"category dtype not supported|"
"category type does not support cumsum operations)",
),
"diff": (TypeError, "unsupported operand type"),
"ffill": (None, ""),
"fillna": (
TypeError,
r"Cannot setitem on a Categorical with a new category \(0\), "
"set the categories first",
)
if not using_copy_on_write
else (None, ""), # no-op with CoW
"first": (None, ""),
"idxmax": (ValueError, "empty group due to unobserved categories")
if empty_groups
else (None, ""),
"idxmin": (ValueError, "empty group due to unobserved categories")
if empty_groups
else (None, ""),
"last": (None, ""),
"max": (None, ""),
"mean": (TypeError, "category dtype does not support aggregation 'mean'"),
"median": (TypeError, "category dtype does not support aggregation 'median'"),
"min": (None, ""),
"ngroup": (None, ""),
"nunique": (None, ""),
"pct_change": (TypeError, "unsupported operand type"),
"prod": (TypeError, "category type does not support prod operations"),
"quantile": (TypeError, ""),
"rank": (None, ""),
"sem": (
TypeError,
"|".join(
[
"'Categorical' .* does not support reduction 'sem'",
"category dtype does not support aggregation 'sem'",
]
),
),
"shift": (None, ""),
"size": (None, ""),
"skew": (
TypeError,
"|".join(
[
"category type does not support skew operations",
"dtype category does not support reduction 'skew'",
]
),
),
"std": (
TypeError,
"|".join(
[
"'Categorical' .* does not support reduction 'std'",
"category dtype does not support aggregation 'std'",
]
),
),
"sum": (TypeError, "category type does not support sum operations"),
"var": (
TypeError,
"|".join(
[
"'Categorical' .* does not support reduction 'var'",
"category dtype does not support aggregation 'var'",
]
),
),
}[groupby_func]
if groupby_func == "fillna":
kind = "Series" if groupby_series else "DataFrame"
warn_msg = f"{kind}GroupBy.fillna is deprecated"
else:
warn_msg = ""
_call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg)
def test_subsetting_columns_axis_1_raises():
# GH 35443
df = DataFrame({"a": [1], "b": [2], "c": [3]})
msg = "DataFrame.groupby with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
gb = df.groupby("a", axis=1)
with pytest.raises(ValueError, match="Cannot subset columns when using axis=1"):
gb["b"]

File diff suppressed because it is too large

View File

@ -0,0 +1,963 @@
"""
test with the TimeGrouper / grouping with datetimes
"""
from datetime import (
datetime,
timedelta,
)
import numpy as np
import pytest
import pytz
import pandas as pd
from pandas import (
DataFrame,
DatetimeIndex,
Index,
MultiIndex,
Series,
Timestamp,
date_range,
offsets,
)
import pandas._testing as tm
from pandas.core.groupby.grouper import Grouper
from pandas.core.groupby.ops import BinGrouper
@pytest.fixture
def frame_for_truncated_bingrouper():
"""
DataFrame used by groupby_with_truncated_bingrouper, made into
a separate fixture for easier reuse in
test_groupby_apply_timegrouper_with_nat_apply_squeeze
"""
df = DataFrame(
{
"Quantity": [18, 3, 5, 1, 9, 3],
"Date": [
Timestamp(2013, 9, 1, 13, 0),
Timestamp(2013, 9, 1, 13, 5),
Timestamp(2013, 10, 1, 20, 0),
Timestamp(2013, 10, 3, 10, 0),
pd.NaT,
Timestamp(2013, 9, 2, 14, 0),
],
}
)
return df
@pytest.fixture
def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
"""
GroupBy object such that gb._grouper is a BinGrouper and
len(gb._grouper.result_index) < len(gb._grouper.group_keys_seq)
Aggregations on this groupby should have
dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")
as either the index or an index level.
"""
df = frame_for_truncated_bingrouper
tdg = Grouper(key="Date", freq="5D")
gb = df.groupby(tdg)
# check we're testing the case we're interested in
assert len(gb._grouper.result_index) != len(gb._grouper.group_keys_seq)
return gb
class TestGroupBy:
def test_groupby_with_timegrouper(self):
# GH 4161
# TimeGrouper requires a sorted index
# also verifies that the resultant index has the correct name
df_original = DataFrame(
{
"Buyer": "Carl Carl Carl Carl Joe Carl".split(),
"Quantity": [18, 3, 5, 1, 9, 3],
"Date": [
datetime(2013, 9, 1, 13, 0),
datetime(2013, 9, 1, 13, 5),
datetime(2013, 10, 1, 20, 0),
datetime(2013, 10, 3, 10, 0),
datetime(2013, 12, 2, 12, 0),
datetime(2013, 9, 2, 14, 0),
],
}
)
# GH 6908 change target column's order
df_reordered = df_original.sort_values(by="Quantity")
for df in [df_original, df_reordered]:
df = df.set_index(["Date"])
exp_dti = date_range(
"20130901",
"20131205",
freq="5D",
name="Date",
inclusive="left",
unit=df.index.unit,
)
expected = DataFrame(
{"Buyer": 0, "Quantity": 0},
index=exp_dti,
)
# Cast to object to avoid implicit cast when setting entry to "CarlCarlCarl"
expected = expected.astype({"Buyer": object})
expected.iloc[0, 0] = "CarlCarlCarl"
expected.iloc[6, 0] = "CarlCarl"
expected.iloc[18, 0] = "Joe"
expected.iloc[[0, 6, 18], 1] = np.array([24, 6, 9], dtype="int64")
result1 = df.resample("5D").sum()
tm.assert_frame_equal(result1, expected)
df_sorted = df.sort_index()
result2 = df_sorted.groupby(Grouper(freq="5D")).sum()
tm.assert_frame_equal(result2, expected)
result3 = df.groupby(Grouper(freq="5D")).sum()
tm.assert_frame_equal(result3, expected)
@pytest.mark.parametrize("should_sort", [True, False])
def test_groupby_with_timegrouper_methods(self, should_sort):
# GH 3881
# make sure API of timegrouper conforms
df = DataFrame(
{
"Branch": "A A A A A B".split(),
"Buyer": "Carl Mark Carl Joe Joe Carl".split(),
"Quantity": [1, 3, 5, 8, 9, 3],
"Date": [
datetime(2013, 1, 1, 13, 0),
datetime(2013, 1, 1, 13, 5),
datetime(2013, 10, 1, 20, 0),
datetime(2013, 10, 2, 10, 0),
datetime(2013, 12, 2, 12, 0),
datetime(2013, 12, 2, 14, 0),
],
}
)
if should_sort:
df = df.sort_values(by="Quantity", ascending=False)
df = df.set_index("Date", drop=False)
g = df.groupby(Grouper(freq="6ME"))
assert g.group_keys
assert isinstance(g._grouper, BinGrouper)
groups = g.groups
assert isinstance(groups, dict)
assert len(groups) == 3
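    def _example_grouper_freq_bins(self):
        # Illustrative sketch (an added, assumption-labeled helper): a
        # Grouper(freq=...) bins a datetime index into intervals, so the
        # groupby machinery uses a BinGrouper instead of the hash-based
        # grouper used for ordinary keys.
        idx = pd.to_datetime(["2013-01-01", "2013-07-01"])
        df = DataFrame({"v": [1, 2]}, index=idx)
        g = df.groupby(Grouper(freq="6ME"))
        assert isinstance(g._grouper, BinGrouper)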
def test_timegrouper_with_reg_groups(self):
# GH 3794
# allow combination of timegrouper/reg groups
df_original = DataFrame(
{
"Branch": "A A A A A A A B".split(),
"Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
"Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
"Date": [
datetime(2013, 1, 1, 13, 0),
datetime(2013, 1, 1, 13, 5),
datetime(2013, 10, 1, 20, 0),
datetime(2013, 10, 2, 10, 0),
datetime(2013, 10, 1, 20, 0),
datetime(2013, 10, 2, 10, 0),
datetime(2013, 12, 2, 12, 0),
datetime(2013, 12, 2, 14, 0),
],
}
).set_index("Date")
df_sorted = df_original.sort_values(by="Quantity", ascending=False)
for df in [df_original, df_sorted]:
expected = DataFrame(
{
"Buyer": "Carl Joe Mark".split(),
"Quantity": [10, 18, 3],
"Date": [
datetime(2013, 12, 31, 0, 0),
datetime(2013, 12, 31, 0, 0),
datetime(2013, 12, 31, 0, 0),
],
}
).set_index(["Date", "Buyer"])
msg = "The default value of numeric_only"
result = df.groupby([Grouper(freq="YE"), "Buyer"]).sum(numeric_only=True)
tm.assert_frame_equal(result, expected)
expected = DataFrame(
{
"Buyer": "Carl Mark Carl Joe".split(),
"Quantity": [1, 3, 9, 18],
"Date": [
datetime(2013, 1, 1, 0, 0),
datetime(2013, 1, 1, 0, 0),
datetime(2013, 7, 1, 0, 0),
datetime(2013, 7, 1, 0, 0),
],
}
).set_index(["Date", "Buyer"])
result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum(numeric_only=True)
tm.assert_frame_equal(result, expected)
df_original = DataFrame(
{
"Branch": "A A A A A A A B".split(),
"Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
"Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
"Date": [
datetime(2013, 10, 1, 13, 0),
datetime(2013, 10, 1, 13, 5),
datetime(2013, 10, 1, 20, 0),
datetime(2013, 10, 2, 10, 0),
datetime(2013, 10, 1, 20, 0),
datetime(2013, 10, 2, 10, 0),
datetime(2013, 10, 2, 12, 0),
datetime(2013, 10, 2, 14, 0),
],
}
).set_index("Date")
df_sorted = df_original.sort_values(by="Quantity", ascending=False)
for df in [df_original, df_sorted]:
expected = DataFrame(
{
"Buyer": "Carl Joe Mark Carl Joe".split(),
"Quantity": [6, 8, 3, 4, 10],
"Date": [
datetime(2013, 10, 1, 0, 0),
datetime(2013, 10, 1, 0, 0),
datetime(2013, 10, 1, 0, 0),
datetime(2013, 10, 2, 0, 0),
datetime(2013, 10, 2, 0, 0),
],
}
).set_index(["Date", "Buyer"])
result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum(numeric_only=True)
tm.assert_frame_equal(result, expected)
result = df.groupby([Grouper(freq="1ME"), "Buyer"]).sum(numeric_only=True)
expected = DataFrame(
{
"Buyer": "Carl Joe Mark".split(),
"Quantity": [10, 18, 3],
"Date": [
datetime(2013, 10, 31, 0, 0),
datetime(2013, 10, 31, 0, 0),
datetime(2013, 10, 31, 0, 0),
],
}
).set_index(["Date", "Buyer"])
tm.assert_frame_equal(result, expected)
# passing the name
df = df.reset_index()
result = df.groupby([Grouper(freq="1ME", key="Date"), "Buyer"]).sum(
numeric_only=True
)
tm.assert_frame_equal(result, expected)
with pytest.raises(KeyError, match="'The grouper name foo is not found'"):
df.groupby([Grouper(freq="1ME", key="foo"), "Buyer"]).sum()
# passing the level
df = df.set_index("Date")
result = df.groupby([Grouper(freq="1ME", level="Date"), "Buyer"]).sum(
numeric_only=True
)
tm.assert_frame_equal(result, expected)
result = df.groupby([Grouper(freq="1ME", level=0), "Buyer"]).sum(
numeric_only=True
)
tm.assert_frame_equal(result, expected)
with pytest.raises(ValueError, match="The level foo is not valid"):
df.groupby([Grouper(freq="1ME", level="foo"), "Buyer"]).sum()
# multi names
df = df.copy()
df["Date"] = df.index + offsets.MonthEnd(2)
result = df.groupby([Grouper(freq="1ME", key="Date"), "Buyer"]).sum(
numeric_only=True
)
expected = DataFrame(
{
"Buyer": "Carl Joe Mark".split(),
"Quantity": [10, 18, 3],
"Date": [
datetime(2013, 11, 30, 0, 0),
datetime(2013, 11, 30, 0, 0),
datetime(2013, 11, 30, 0, 0),
],
}
).set_index(["Date", "Buyer"])
tm.assert_frame_equal(result, expected)
# error as we have both a level and a name!
msg = "The Grouper cannot specify both a key and a level!"
with pytest.raises(ValueError, match=msg):
df.groupby(
[Grouper(freq="1ME", key="Date", level="Date"), "Buyer"]
).sum()
# single groupers
expected = DataFrame(
[[31]],
columns=["Quantity"],
index=DatetimeIndex(
[datetime(2013, 10, 31, 0, 0)], freq=offsets.MonthEnd(), name="Date"
),
)
result = df.groupby(Grouper(freq="1ME")).sum(numeric_only=True)
tm.assert_frame_equal(result, expected)
result = df.groupby([Grouper(freq="1ME")]).sum(numeric_only=True)
tm.assert_frame_equal(result, expected)
expected.index = expected.index.shift(1)
assert expected.index.freq == offsets.MonthEnd()
result = df.groupby(Grouper(freq="1ME", key="Date")).sum(numeric_only=True)
tm.assert_frame_equal(result, expected)
result = df.groupby([Grouper(freq="1ME", key="Date")]).sum(
numeric_only=True
)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("freq", ["D", "ME", "YE", "QE-APR"])
def test_timegrouper_with_reg_groups_freq(self, freq):
# GH 6764 multiple grouping with/without sort
df = DataFrame(
{
"date": pd.to_datetime(
[
"20121002",
"20121007",
"20130130",
"20130202",
"20130305",
"20121002",
"20121207",
"20130130",
"20130202",
"20130305",
"20130202",
"20130305",
]
),
"user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
"whole_cost": [
1790,
364,
280,
259,
201,
623,
90,
312,
359,
301,
359,
801,
],
"cost1": [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12],
}
).set_index("date")
expected = (
df.groupby("user_id")["whole_cost"]
.resample(freq)
.sum(min_count=1)  # min_count=1 so empty resample bins become NaN
.dropna()
.reorder_levels(["date", "user_id"])
.sort_index()
.astype("int64")
)
expected.name = "whole_cost"
result1 = (
df.sort_index().groupby([Grouper(freq=freq), "user_id"])["whole_cost"].sum()
)
tm.assert_series_equal(result1, expected)
result2 = df.groupby([Grouper(freq=freq), "user_id"])["whole_cost"].sum()
tm.assert_series_equal(result2, expected)
def test_timegrouper_get_group(self):
# GH 6914
df_original = DataFrame(
{
"Buyer": "Carl Joe Joe Carl Joe Carl".split(),
"Quantity": [18, 3, 5, 1, 9, 3],
"Date": [
datetime(2013, 9, 1, 13, 0),
datetime(2013, 9, 1, 13, 5),
datetime(2013, 10, 1, 20, 0),
datetime(2013, 10, 3, 10, 0),
datetime(2013, 12, 2, 12, 0),
datetime(2013, 9, 2, 14, 0),
],
}
)
df_reordered = df_original.sort_values(by="Quantity")
# single grouping
expected_list = [
df_original.iloc[[0, 1, 5]],
df_original.iloc[[2, 3]],
df_original.iloc[[4]],
]
dt_list = ["2013-09-30", "2013-10-31", "2013-12-31"]
for df in [df_original, df_reordered]:
grouped = df.groupby(Grouper(freq="ME", key="Date"))
for t, expected in zip(dt_list, expected_list):
dt = Timestamp(t)
result = grouped.get_group(dt)
tm.assert_frame_equal(result, expected)
# multiple grouping
expected_list = [
df_original.iloc[[1]],
df_original.iloc[[3]],
df_original.iloc[[4]],
]
g_list = [("Joe", "2013-09-30"), ("Carl", "2013-10-31"), ("Joe", "2013-12-31")]
for df in [df_original, df_reordered]:
grouped = df.groupby(["Buyer", Grouper(freq="ME", key="Date")])
for (b, t), expected in zip(g_list, expected_list):
dt = Timestamp(t)
result = grouped.get_group((b, dt))
tm.assert_frame_equal(result, expected)
# with index
df_original = df_original.set_index("Date")
df_reordered = df_original.sort_values(by="Quantity")
expected_list = [
df_original.iloc[[0, 1, 5]],
df_original.iloc[[2, 3]],
df_original.iloc[[4]],
]
for df in [df_original, df_reordered]:
grouped = df.groupby(Grouper(freq="ME"))
for t, expected in zip(dt_list, expected_list):
dt = Timestamp(t)
result = grouped.get_group(dt)
tm.assert_frame_equal(result, expected)
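    def _example_timegrouper_get_group_key(self):
        # Illustrative sketch (an added, assumption-labeled helper): with a
        # frequency grouper, get_group is keyed by the bin label (here the
        # month end), not by any row's own timestamp.
        df = DataFrame(
            {
                "Date": pd.to_datetime(["2013-09-01", "2013-09-15"]),
                "Quantity": [1, 2],
            }
        )
        gb = df.groupby(Grouper(freq="ME", key="Date"))
        assert len(gb.get_group(Timestamp("2013-09-30"))) == 2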
def test_timegrouper_apply_return_type_series(self):
# Using `apply` with the `TimeGrouper` should give the
# same return type as an `apply` with a `Grouper`.
# Issue #11742
df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
df_dt = df.copy()
df_dt["date"] = pd.to_datetime(df_dt["date"])
def sumfunc_series(x):
return Series([x["value"].sum()], ("sum",))
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
expected = df.groupby(Grouper(key="date")).apply(sumfunc_series)
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series)
tm.assert_frame_equal(
result.reset_index(drop=True), expected.reset_index(drop=True)
)
def test_timegrouper_apply_return_type_value(self):
# Using `apply` with the `TimeGrouper` should give the
# same return type as an `apply` with a `Grouper`.
# Issue #11742
df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
df_dt = df.copy()
df_dt["date"] = pd.to_datetime(df_dt["date"])
def sumfunc_value(x):
return x.value.sum()
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
expected = df.groupby(Grouper(key="date")).apply(sumfunc_value)
with tm.assert_produces_warning(DeprecationWarning, match=msg):
result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value)
tm.assert_series_equal(
result.reset_index(drop=True), expected.reset_index(drop=True)
)
def test_groupby_groups_datetimeindex(self):
# GH#1430
periods = 1000
ind = date_range(start="2012/1/1", freq="5min", periods=periods)
df = DataFrame(
{"high": np.arange(periods), "low": np.arange(periods)}, index=ind
)
grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
# it works!
groups = grouped.groups
assert isinstance(next(iter(groups.keys())), datetime)
def test_groupby_groups_datetimeindex2(self):
# GH#11442
index = date_range("2015/01/01", periods=5, name="date")
df = DataFrame({"A": [5, 6, 7, 8, 9], "B": [1, 2, 3, 4, 5]}, index=index)
result = df.groupby(level="date").groups
dates = ["2015-01-05", "2015-01-04", "2015-01-03", "2015-01-02", "2015-01-01"]
expected = {
Timestamp(date): DatetimeIndex([date], name="date") for date in dates
}
tm.assert_dict_equal(result, expected)
grouped = df.groupby(level="date")
for date in dates:
result = grouped.get_group(date)
data = [[df.loc[date, "A"], df.loc[date, "B"]]]
expected_index = DatetimeIndex(
[date], name="date", freq="D", dtype=index.dtype
)
expected = DataFrame(data, columns=list("AB"), index=expected_index)
tm.assert_frame_equal(result, expected)
def test_groupby_groups_datetimeindex_tz(self):
# GH 3950
dates = [
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
]
df = DataFrame(
{
"label": ["a", "a", "a", "b", "b", "b"],
"datetime": dates,
"value1": np.arange(6, dtype="int64"),
"value2": [1, 2] * 3,
}
)
df["datetime"] = df["datetime"].apply(lambda d: Timestamp(d, tz="US/Pacific"))
exp_idx1 = DatetimeIndex(
[
"2011-07-19 07:00:00",
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
"2011-07-19 09:00:00",
],
tz="US/Pacific",
name="datetime",
)
exp_idx2 = Index(["a", "b"] * 3, name="label")
exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
expected = DataFrame(
{"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
index=exp_idx,
columns=["value1", "value2"],
)
result = df.groupby(["datetime", "label"]).sum()
tm.assert_frame_equal(result, expected)
# by level
didx = DatetimeIndex(dates, tz="Asia/Tokyo")
df = DataFrame(
{"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
index=didx,
)
exp_idx = DatetimeIndex(
["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
tz="Asia/Tokyo",
)
expected = DataFrame(
{"value1": [3, 5, 7], "value2": [2, 4, 6]},
index=exp_idx,
columns=["value1", "value2"],
)
result = df.groupby(level=0).sum()
tm.assert_frame_equal(result, expected)
def test_frame_datetime64_handling_groupby(self):
# it works!
df = DataFrame(
[(3, np.datetime64("2012-07-03")), (3, np.datetime64("2012-07-04"))],
columns=["a", "date"],
)
result = df.groupby("a").first()
assert result["date"][3] == Timestamp("2012-07-03")
def test_groupby_multi_timezone(self):
        # combining multiple / different timezones yields object dtype, with
        # each value keeping its own localization
df = DataFrame(
{
"value": range(5),
"date": [
"2000-01-28 16:47:00",
"2000-01-29 16:48:00",
"2000-01-30 16:49:00",
"2000-01-31 16:50:00",
"2000-01-01 16:50:00",
],
"tz": [
"America/Chicago",
"America/Chicago",
"America/Los_Angeles",
"America/Chicago",
"America/New_York",
],
}
)
result = df.groupby("tz", group_keys=False).date.apply(
lambda x: pd.to_datetime(x).dt.tz_localize(x.name)
)
expected = Series(
[
Timestamp("2000-01-28 16:47:00-0600", tz="America/Chicago"),
Timestamp("2000-01-29 16:48:00-0600", tz="America/Chicago"),
Timestamp("2000-01-30 16:49:00-0800", tz="America/Los_Angeles"),
Timestamp("2000-01-31 16:50:00-0600", tz="America/Chicago"),
Timestamp("2000-01-01 16:50:00-0500", tz="America/New_York"),
],
name="date",
dtype=object,
)
tm.assert_series_equal(result, expected)
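        # get_group with a single tz label returns the naive date strings for
        # that zone, which then localize cleanly to tz-aware values.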
tz = "America/Chicago"
res_values = df.groupby("tz").date.get_group(tz)
result = pd.to_datetime(res_values).dt.tz_localize(tz)
exp_values = Series(
["2000-01-28 16:47:00", "2000-01-29 16:48:00", "2000-01-31 16:50:00"],
index=[0, 1, 3],
name="date",
)
expected = pd.to_datetime(exp_values).dt.tz_localize(tz)
tm.assert_series_equal(result, expected)
def test_groupby_groups_periods(self):
dates = [
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
]
df = DataFrame(
{
"label": ["a", "a", "a", "b", "b", "b"],
"period": [pd.Period(d, freq="h") for d in dates],
"value1": np.arange(6, dtype="int64"),
"value2": [1, 2] * 3,
}
)
exp_idx1 = pd.PeriodIndex(
[
"2011-07-19 07:00:00",
"2011-07-19 07:00:00",
"2011-07-19 08:00:00",
"2011-07-19 08:00:00",
"2011-07-19 09:00:00",
"2011-07-19 09:00:00",
],
freq="h",
name="period",
)
exp_idx2 = Index(["a", "b"] * 3, name="label")
exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
expected = DataFrame(
{"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
index=exp_idx,
columns=["value1", "value2"],
)
result = df.groupby(["period", "label"]).sum()
tm.assert_frame_equal(result, expected)
# by level
didx = pd.PeriodIndex(dates, freq="h")
df = DataFrame(
{"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
index=didx,
)
exp_idx = pd.PeriodIndex(
["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
freq="h",
)
expected = DataFrame(
{"value1": [3, 5, 7], "value2": [2, 4, 6]},
index=exp_idx,
columns=["value1", "value2"],
)
result = df.groupby(level=0).sum()
tm.assert_frame_equal(result, expected)
def test_groupby_first_datetime64(self):
df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)])
df[1] = df[1].astype("M8[ns]")
assert issubclass(df[1].dtype.type, np.datetime64)
result = df.groupby(level=0).first()
got_dt = result[1].dtype
assert issubclass(got_dt.type, np.datetime64)
result = df[1].groupby(level=0).first()
got_dt = result.dtype
assert issubclass(got_dt.type, np.datetime64)
def test_groupby_max_datetime64(self):
# GH 5869
# datetimelike dtype conversion from int
df = DataFrame({"A": Timestamp("20130101"), "B": np.arange(5)})
# TODO: can we retain second reso in .apply here?
expected = df.groupby("A")["A"].apply(lambda x: x.max()).astype("M8[s]")
result = df.groupby("A")["A"].max()
tm.assert_series_equal(result, expected)
def test_groupby_datetime64_32_bit(self):
# GH 6410 / numpy 4328
# 32-bit under 1.9-dev indexing issue
df = DataFrame({"A": range(2), "B": [Timestamp("2000-01-1")] * 2})
result = df.groupby("A")["B"].transform("min")
expected = Series([Timestamp("2000-01-1")] * 2, name="B")
tm.assert_series_equal(result, expected)
def test_groupby_with_timezone_selection(self):
# GH 11616
# Test that column selection returns output in correct timezone.
df = DataFrame(
{
"factor": np.random.default_rng(2).integers(0, 3, size=60),
"time": date_range("01/01/2000 00:00", periods=60, freq="s", tz="UTC"),
}
)
df1 = df.groupby("factor").max()["time"]
df2 = df.groupby("factor")["time"].max()
tm.assert_series_equal(df1, df2)
def test_timezone_info(self):
# see gh-11682: Timezone info lost when broadcasting
# scalar datetime to DataFrame
df = DataFrame({"a": [1], "b": [datetime.now(pytz.utc)]})
assert df["b"][0].tzinfo == pytz.utc
df = DataFrame({"a": [1, 2, 3]})
df["b"] = datetime.now(pytz.utc)
assert df["b"][0].tzinfo == pytz.utc
def test_datetime_count(self):
df = DataFrame(
{"a": [1, 2, 3] * 2, "dates": date_range("now", periods=6, freq="min")}
)
result = df.groupby("a").dates.count()
expected = Series([2, 2, 2], index=Index([1, 2, 3], name="a"), name="dates")
tm.assert_series_equal(result, expected)
def test_first_last_max_min_on_time_data(self):
# GH 10295
# Verify that NaT is not in the result of max, min, first and last on
        # a DataFrame with datetime or timedelta values.
df_test = DataFrame(
{
"dt": [
np.nan,
"2015-07-24 10:10",
"2015-07-25 11:11",
"2015-07-23 12:12",
np.nan,
],
"td": [
np.nan,
timedelta(days=1),
timedelta(days=2),
timedelta(days=3),
np.nan,
],
}
)
df_test.dt = pd.to_datetime(df_test.dt)
df_test["group"] = "A"
df_ref = df_test[df_test.dt.notna()]
grouped_test = df_test.groupby("group")
grouped_ref = df_ref.groupby("group")
tm.assert_frame_equal(grouped_ref.max(), grouped_test.max())
tm.assert_frame_equal(grouped_ref.min(), grouped_test.min())
tm.assert_frame_equal(grouped_ref.first(), grouped_test.first())
tm.assert_frame_equal(grouped_ref.last(), grouped_test.last())
def test_nunique_with_timegrouper_and_nat(self):
# GH 17575
test = DataFrame(
{
"time": [
Timestamp("2016-06-28 09:35:35"),
pd.NaT,
Timestamp("2016-06-28 16:46:28"),
],
"data": ["1", "2", "3"],
}
)
grouper = Grouper(key="time", freq="h")
result = test.groupby(grouper)["data"].nunique()
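        # Rows whose grouping key is NaT are dropped, so the result must match
        # nunique computed over only the non-null timestamps.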
expected = test[test.time.notnull()].groupby(grouper)["data"].nunique()
expected.index = expected.index._with_freq(None)
tm.assert_series_equal(result, expected)
def test_scalar_call_versus_list_call(self):
# Issue: 17530
data_frame = {
"location": ["shanghai", "beijing", "shanghai"],
"time": Series(
["2017-08-09 13:32:23", "2017-08-11 23:23:15", "2017-08-11 22:23:15"],
dtype="datetime64[ns]",
),
"value": [1, 2, 3],
}
data_frame = DataFrame(data_frame).set_index("time")
grouper = Grouper(freq="D")
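        # A Grouper passed as a scalar and the same Grouper wrapped in a
        # one-element list must produce identical groupings.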
grouped = data_frame.groupby(grouper)
result = grouped.count()
grouped = data_frame.groupby([grouper])
expected = grouped.count()
tm.assert_frame_equal(result, expected)
def test_grouper_period_index(self):
# GH 32108
periods = 2
index = pd.period_range(
start="2018-01", periods=periods, freq="M", name="Month"
)
period_series = Series(range(periods), index=index)
result = period_series.groupby(period_series.index.month).sum()
expected = Series(
range(periods), index=Index(range(1, periods + 1), name=index.name)
)
tm.assert_series_equal(result, expected)
def test_groupby_apply_timegrouper_with_nat_dict_returns(
self, groupby_with_truncated_bingrouper
):
        # GH#43500 case where gb._grouper.result_index and gb._grouper.group_keys_seq
        # have different lengths, which goes through the `isinstance(values[0], dict)`
        # path
gb = groupby_with_truncated_bingrouper
res = gb["Quantity"].apply(lambda x: {"foo": len(x)})
df = gb.obj
unit = df["Date"]._values.unit
dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date", unit=unit)
mi = MultiIndex.from_arrays([dti, ["foo"] * len(dti)])
expected = Series([3, 0, 0, 0, 0, 0, 2], index=mi, name="Quantity")
tm.assert_series_equal(res, expected)
def test_groupby_apply_timegrouper_with_nat_scalar_returns(
self, groupby_with_truncated_bingrouper
):
        # GH#43500 Previously raised ValueError because it used an index of
        # incorrect length in wrap_applied_result
gb = groupby_with_truncated_bingrouper
res = gb["Quantity"].apply(lambda x: x.iloc[0] if len(x) else np.nan)
df = gb.obj
unit = df["Date"]._values.unit
dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date", unit=unit)
expected = Series(
[18, np.nan, np.nan, np.nan, np.nan, np.nan, 5],
index=dti._with_freq(None),
name="Quantity",
)
tm.assert_series_equal(res, expected)
def test_groupby_apply_timegrouper_with_nat_apply_squeeze(
self, frame_for_truncated_bingrouper
):
df = frame_for_truncated_bingrouper
# We need to create a GroupBy object with only one non-NaT group,
# so use a huge freq so that all non-NaT dates will be grouped together
tdg = Grouper(key="Date", freq="100YE")
gb = df.groupby(tdg)
# check that we will go through the singular_series path
# in _wrap_applied_output_series
assert gb.ngroups == 1
assert gb._selected_obj._get_axis(gb.axis).nlevels == 1
# function that returns a Series
msg = "DataFrameGroupBy.apply operated on the grouping columns"
with tm.assert_produces_warning(DeprecationWarning, match=msg):
res = gb.apply(lambda x: x["Quantity"] * 2)
dti = Index([Timestamp("2013-12-31")], dtype=df["Date"].dtype, name="Date")
expected = DataFrame(
[[36, 6, 6, 10, 2]],
index=dti,
columns=Index([0, 1, 5, 2, 3], name="Quantity"),
)
tm.assert_frame_equal(res, expected)
@pytest.mark.single_cpu
def test_groupby_agg_numba_timegrouper_with_nat(
self, groupby_with_truncated_bingrouper
):
pytest.importorskip("numba")
# See discussion in GH#43487
gb = groupby_with_truncated_bingrouper
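        # A numba-engine UDF aggregation over the truncated bin grouper should
        # match the builtin "mean" for both Series and DataFrame selections.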
result = gb["Quantity"].aggregate(
lambda values, index: np.nanmean(values), engine="numba"
)
expected = gb["Quantity"].aggregate("mean")
tm.assert_series_equal(result, expected)
result_df = gb[["Quantity"]].aggregate(
lambda values, index: np.nanmean(values), engine="numba"
)
expected_df = gb[["Quantity"]].aggregate("mean")
tm.assert_frame_equal(result_df, expected_df)

View File

@ -0,0 +1,284 @@
import numpy as np
import pytest
from pandas.errors import NumbaUtilError
from pandas import (
DataFrame,
Series,
option_context,
)
import pandas._testing as tm
pytestmark = pytest.mark.single_cpu
def test_correct_function_signature():
pytest.importorskip("numba")
def incorrect_function(x):
return x + 1
data = DataFrame(
{"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
columns=["key", "data"],
)
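    # Numba-engine UDFs must accept (values, index) as their first two
    # arguments; a single-argument function is rejected up front.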
with pytest.raises(NumbaUtilError, match="The first 2"):
data.groupby("key").transform(incorrect_function, engine="numba")
with pytest.raises(NumbaUtilError, match="The first 2"):
data.groupby("key")["data"].transform(incorrect_function, engine="numba")
def test_check_nopython_kwargs():
pytest.importorskip("numba")
def incorrect_function(values, index):
return values + 1
data = DataFrame(
{"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
columns=["key", "data"],
)
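    # Keyword arguments to the UDF are rejected with engine="numba", since
    # numba does not support kwargs in nopython mode.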
with pytest.raises(NumbaUtilError, match="numba does not support"):
data.groupby("key").transform(incorrect_function, engine="numba", a=1)
with pytest.raises(NumbaUtilError, match="numba does not support"):
data.groupby("key")["data"].transform(incorrect_function, engine="numba", a=1)
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
@pytest.mark.parametrize("as_index", [True, False])
def test_numba_vs_cython(jit, pandas_obj, nogil, parallel, nopython, as_index):
pytest.importorskip("numba")
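    # The numba engine must reproduce the cython result for jitted and plain
    # UDFs, Series and DataFrame paths, and both as_index settings.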
def func(values, index):
return values + 1
if jit:
# Test accepted jitted functions
import numba
func = numba.jit(func)
data = DataFrame(
{0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
)
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
grouped = data.groupby(0, as_index=as_index)
if pandas_obj == "Series":
grouped = grouped[1]
result = grouped.transform(func, engine="numba", engine_kwargs=engine_kwargs)
expected = grouped.transform(lambda x: x + 1, engine="cython")
tm.assert_equal(result, expected)
@pytest.mark.filterwarnings("ignore")
# Filter warnings when parallel=True and the function can't be parallelized by Numba
@pytest.mark.parametrize("jit", [True, False])
@pytest.mark.parametrize("pandas_obj", ["Series", "DataFrame"])
def test_cache(jit, pandas_obj, nogil, parallel, nopython):
# Test that the functions are cached correctly if we switch functions
pytest.importorskip("numba")
def func_1(values, index):
return values + 1
def func_2(values, index):
return values * 5
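    # Switching between UDFs must trigger a fresh compile rather than reuse
    # the cached kernel from the previous function; re-running func_1
    # afterwards should hit the cache.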
if jit:
import numba
func_1 = numba.jit(func_1)
func_2 = numba.jit(func_2)
data = DataFrame(
{0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
)
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
grouped = data.groupby(0)
if pandas_obj == "Series":
grouped = grouped[1]
result = grouped.transform(func_1, engine="numba", engine_kwargs=engine_kwargs)
expected = grouped.transform(lambda x: x + 1, engine="cython")
tm.assert_equal(result, expected)
result = grouped.transform(func_2, engine="numba", engine_kwargs=engine_kwargs)
expected = grouped.transform(lambda x: x * 5, engine="cython")
tm.assert_equal(result, expected)
# Retest func_1 which should use the cache
result = grouped.transform(func_1, engine="numba", engine_kwargs=engine_kwargs)
expected = grouped.transform(lambda x: x + 1, engine="cython")
tm.assert_equal(result, expected)
def test_use_global_config():
pytest.importorskip("numba")
def func_1(values, index):
return values + 1
data = DataFrame(
{0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
)
grouped = data.groupby(0)
expected = grouped.transform(func_1, engine="numba")
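    # With compute.use_numba enabled, engine=None should route through the
    # numba engine and match the explicit engine="numba" result.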
with option_context("compute.use_numba", True):
result = grouped.transform(func_1, engine=None)
tm.assert_frame_equal(expected, result)
# TODO: Test more than just reductions (e.g. actually test transformations once we have
def test_string_cython_vs_numba(numba_supported_reductions):
pytest.importorskip("numba")
agg_func, kwargs = numba_supported_reductions
data = DataFrame(
{0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
)
grouped = data.groupby(0)
result = grouped.transform(agg_func, engine="numba", **kwargs)
expected = grouped.transform(agg_func, engine="cython", **kwargs)
tm.assert_frame_equal(result, expected)
result = grouped[1].transform(agg_func, engine="numba", **kwargs)
expected = grouped[1].transform(agg_func, engine="cython", **kwargs)
tm.assert_series_equal(result, expected)
def test_args_not_cached():
# GH 41647
pytest.importorskip("numba")
def sum_last(values, index, n):
return values[-n:].sum()
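    # Positional args are part of the cache key: after calling with n=1,
    # calling again with n=2 must not reuse the first compiled result.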
df = DataFrame({"id": [0, 0, 1, 1], "x": [1, 1, 1, 1]})
grouped_x = df.groupby("id")["x"]
result = grouped_x.transform(sum_last, 1, engine="numba")
expected = Series([1.0] * 4, name="x")
tm.assert_series_equal(result, expected)
result = grouped_x.transform(sum_last, 2, engine="numba")
expected = Series([2.0] * 4, name="x")
tm.assert_series_equal(result, expected)
def test_index_data_correctly_passed():
# GH 43133
pytest.importorskip("numba")
def f(values, index):
return index - 1
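    # The UDF receives the group's index values, so each output row should be
    # that row's index label minus one.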
df = DataFrame({"group": ["A", "A", "B"], "v": [4, 5, 6]}, index=[-1, -2, -3])
result = df.groupby("group").transform(f, engine="numba")
    expected = DataFrame([-2.0, -3.0, -4.0], columns=["v"], index=[-1, -2, -3])
tm.assert_frame_equal(result, expected)
def test_engine_kwargs_not_cached():
    # If the user passes a different set of engine_kwargs, don't return the
    # same jitted function
pytest.importorskip("numba")
nogil = True
parallel = False
nopython = True
def func_kwargs(values, index):
return nogil + parallel + nopython
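    # The UDF closes over the flag values (True + False + True == 2); if a
    # changed engine_kwargs did not force a recompile, the stale 2.0 result
    # would come back after nogil flips to False.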
engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
df = DataFrame({"value": [0, 0, 0]})
result = df.groupby(level=0).transform(
func_kwargs, engine="numba", engine_kwargs=engine_kwargs
)
expected = DataFrame({"value": [2.0, 2.0, 2.0]})
tm.assert_frame_equal(result, expected)
nogil = False
engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
result = df.groupby(level=0).transform(
func_kwargs, engine="numba", engine_kwargs=engine_kwargs
)
expected = DataFrame({"value": [1.0, 1.0, 1.0]})
tm.assert_frame_equal(result, expected)
@pytest.mark.filterwarnings("ignore")
def test_multiindex_one_key(nogil, parallel, nopython):
pytest.importorskip("numba")
def numba_func(values, index):
return 1
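    # Grouping by a single level of a MultiIndex is supported by the numba
    # engine; the constant UDF output comes back as float in column C.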
df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
result = df.groupby("A").transform(
numba_func, engine="numba", engine_kwargs=engine_kwargs
)
expected = DataFrame([{"A": 1, "B": 2, "C": 1.0}]).set_index(["A", "B"])
tm.assert_frame_equal(result, expected)
def test_multiindex_multi_key_not_supported(nogil, parallel, nopython):
pytest.importorskip("numba")
def numba_func(values, index):
return 1
df = DataFrame([{"A": 1, "B": 2, "C": 3}]).set_index(["A", "B"])
engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
with pytest.raises(NotImplementedError, match="more than 1 grouping labels"):
df.groupby(["A", "B"]).transform(
numba_func, engine="numba", engine_kwargs=engine_kwargs
)
def test_multilabel_numba_vs_cython(numba_supported_reductions):
pytest.importorskip("numba")
reduction, kwargs = numba_supported_reductions
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "three", "two", "two", "one", "three"],
"C": np.random.default_rng(2).standard_normal(8),
"D": np.random.default_rng(2).standard_normal(8),
}
)
gb = df.groupby(["A", "B"])
res_agg = gb.transform(reduction, engine="numba", **kwargs)
expected_agg = gb.transform(reduction, engine="cython", **kwargs)
tm.assert_frame_equal(res_agg, expected_agg)
def test_multilabel_udf_numba_vs_cython():
pytest.importorskip("numba")
df = DataFrame(
{
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
"B": ["one", "one", "two", "three", "two", "two", "one", "three"],
"C": np.random.default_rng(2).standard_normal(8),
"D": np.random.default_rng(2).standard_normal(8),
}
)
gb = df.groupby(["A", "B"])
result = gb.transform(
lambda values, index: (values - values.min()) / (values.max() - values.min()),
engine="numba",
)
expected = gb.transform(
lambda x: (x - x.min()) / (x.max() - x.min()), engine="cython"
)
tm.assert_frame_equal(result, expected)