venv
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,24 @@
|
||||
import numpy as np
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_corrwith_with_1_axis():
|
||||
# GH 47723
|
||||
df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]})
|
||||
gb = df.groupby("a")
|
||||
|
||||
msg = "DataFrameGroupBy.corrwith with axis=1 is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = gb.corrwith(df, axis=1)
|
||||
index = Index(
|
||||
data=[(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)],
|
||||
name=("a", None),
|
||||
)
|
||||
expected = Series([np.nan] * 6, index=index)
|
||||
tm.assert_series_equal(result, expected)
|
@ -0,0 +1,297 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
Timestamp,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_apply_describe_bug(multiindex_dataframe_random_data):
|
||||
grouped = multiindex_dataframe_random_data.groupby(level="first")
|
||||
grouped.describe() # it works!
|
||||
|
||||
|
||||
def test_series_describe_multikey():
    # describe() on a Series grouped by two callables (year, month) has to
    # agree, column by column, with the dedicated reductions.
    stamps = date_range("2020-01-01", periods=10)
    ts = Series(np.arange(10, dtype=np.float64), index=stamps)
    by_year_month = ts.groupby([lambda x: x.year, lambda x: x.month])
    described = by_year_month.describe()
    for stat in ("mean", "std", "min"):
        reduced = getattr(by_year_month, stat)()
        tm.assert_series_equal(described[stat], reduced, check_names=False)
|
||||
|
||||
|
||||
def test_series_describe_single():
|
||||
ts = Series(
|
||||
np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
|
||||
)
|
||||
grouped = ts.groupby(lambda x: x.month)
|
||||
result = grouped.apply(lambda x: x.describe())
|
||||
expected = grouped.describe().stack(future_stack=True)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]])
|
||||
def test_series_describe_as_index(as_index, keys):
|
||||
# GH#49256
|
||||
df = DataFrame(
|
||||
{
|
||||
"key1": ["one", "two", "two", "three", "two"],
|
||||
"key2": ["one", "two", "two", "three", "two"],
|
||||
"foo2": [1, 2, 4, 4, 6],
|
||||
}
|
||||
)
|
||||
gb = df.groupby(keys, as_index=as_index)["foo2"]
|
||||
result = gb.describe()
|
||||
expected = DataFrame(
|
||||
{
|
||||
"key1": ["one", "three", "two"],
|
||||
"count": [1.0, 1.0, 3.0],
|
||||
"mean": [1.0, 4.0, 4.0],
|
||||
"std": [np.nan, np.nan, 2.0],
|
||||
"min": [1.0, 4.0, 2.0],
|
||||
"25%": [1.0, 4.0, 3.0],
|
||||
"50%": [1.0, 4.0, 4.0],
|
||||
"75%": [1.0, 4.0, 5.0],
|
||||
"max": [1.0, 4.0, 6.0],
|
||||
}
|
||||
)
|
||||
if len(keys) == 2:
|
||||
expected.insert(1, "key2", expected["key1"])
|
||||
if as_index:
|
||||
expected = expected.set_index(keys)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_frame_describe_multikey(tsframe):
|
||||
grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
|
||||
result = grouped.describe()
|
||||
desc_groups = []
|
||||
for col in tsframe:
|
||||
group = grouped[col].describe()
|
||||
# GH 17464 - Remove duplicate MultiIndex levels
|
||||
group_col = MultiIndex(
|
||||
levels=[[col], group.columns],
|
||||
codes=[[0] * len(group.columns), range(len(group.columns))],
|
||||
)
|
||||
group = DataFrame(group.values, columns=group_col, index=group.index)
|
||||
desc_groups.append(group)
|
||||
expected = pd.concat(desc_groups, axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
msg = "DataFrame.groupby with axis=1 is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1)
|
||||
result = groupedT.describe()
|
||||
expected = tsframe.describe().T
|
||||
# reverting the change from https://github.com/pandas-dev/pandas/pull/35441/
|
||||
expected.index = MultiIndex(
|
||||
levels=[[0, 1], expected.index],
|
||||
codes=[[0, 0, 1, 1], range(len(expected.index))],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_frame_describe_tupleindex():
    # GH 14848 - regression from 0.19.0 to 0.19.1
    # Grouping on a column of tuples and describing is expected to raise,
    # whatever the key column is called.
    tuple_keyed = DataFrame(
        {
            "x": [1, 2, 3, 4, 5] * 3,
            "y": [10, 20, 30, 40, 50] * 3,
            "z": [100, 200, 300, 400, 500] * 3,
        }
    )
    tuple_keyed["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
    renamed = tuple_keyed.rename(columns={"k": "key"})

    msg = "Names should be list-like for a MultiIndex"
    for frame, key in ((tuple_keyed, "k"), (renamed, "key")):
        with pytest.raises(ValueError, match=msg):
            frame.groupby(key).describe()
|
||||
|
||||
|
||||
def test_frame_describe_unstacked_format():
    # GH 4792
    # SeriesGroupBy.describe returns one row per group with the statistics
    # laid out as columns.
    stamps = [
        Timestamp("2011-01-06 10:59:05", tz=None),
        Timestamp("2011-01-06 12:43:33", tz=None),
        Timestamp("2011-01-06 12:54:09", tz=None),
    ]
    prices = dict(zip(stamps, [24990, 25499, 25499]))
    volumes = dict(zip(stamps, [1500000000, 5000000000, 100000000]))
    df = DataFrame({"PRICE": prices, "VOLUME": volumes})

    price_levels = [24990, 25499]
    rows = [
        df[df.PRICE == level].VOLUME.describe().values.tolist()
        for level in price_levels
    ]
    expected = DataFrame(
        rows,
        index=Index(price_levels, name="PRICE"),
        columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
    )

    result = df.groupby("PRICE").VOLUME.describe()
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings(
    "ignore:"
    "indexing past lexsort depth may impact performance:"
    "pandas.errors.PerformanceWarning"
)
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_describe_with_duplicate_output_column_names(as_index, keys):
    # GH 35314
    # The input deliberately selects column "b" twice, so describe() has to
    # cope with duplicated output column labels.
    df = DataFrame(
        {
            "a1": [99, 99, 99, 88, 88, 88],
            "a2": [99, 99, 99, 88, 88, 88],
            "b": [1, 2, 3, 4, 5, 6],
            "c": [10, 20, 30, 40, 50, 60],
        },
        columns=["a1", "a2", "b", "b"],
        copy=False,
    )
    if keys == ["a1"]:
        df = df.drop(columns="a2")

    # (statistic, value for group 88, value for group 99)
    per_stat = [
        ("count", 3.0, 3.0),
        ("mean", 5.0, 2.0),
        ("std", 1.0, 1.0),
        ("min", 4.0, 1.0),
        ("25%", 4.5, 1.5),
        ("50%", 5.0, 2.0),
        ("75%", 5.5, 2.5),
        ("max", 6.0, 3.0),
    ]
    # One full set of statistics for each of the two "b" columns.
    records = [("b", *row) for row in per_stat] * 2
    expected = DataFrame.from_records(records).set_index([0, 1]).T
    expected.columns.names = [None, None]
    expected.index = (
        MultiIndex(
            levels=[[88, 99], [88, 99]],
            codes=[[0, 1], [0, 1]],
            names=["a1", "a2"],
        )
        if len(keys) == 2
        else Index([88, 99], name="a1")
    )
    if not as_index:
        expected = expected.reset_index()

    result = df.groupby(keys, as_index=as_index).describe()

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_describe_duplicate_columns():
    # GH#50806
    # describe() on a frame whose column label 0 occurs twice, grouped by
    # the Series under label 1.
    df = DataFrame([[0, 1, 2, 3]])
    df.columns = [0, 1, 2, 0]
    result = df.groupby(df[1]).describe(percentiles=[])

    stat_names = ["count", "mean", "std", "min", "50%", "max"]
    pieces = []
    for val in (0.0, 2.0, 3.0):
        row = [[1.0, val, np.nan, val, val, val]]
        pieces.append(DataFrame(row, index=[1], columns=stat_names))
    expected = pd.concat(pieces, axis=1)
    expected.columns = MultiIndex(
        levels=[[0, 2], stat_names],
        codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))],
    )
    expected.index.names = [1]

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class TestGroupByNonCythonPaths:
    # GH#5610 non-cython calls should not include the grouper
    # Tests for code not expected to go through cython paths.

    @pytest.fixture
    def df(self):
        # Three rows, two groups on "A"; "B" is partly missing, "C" is text.
        rows = [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]]
        return DataFrame(rows, columns=["A", "B", "C"])

    @pytest.fixture
    def gb(self, df):
        # Grouping with the key as the result index.
        return df.groupby("A")

    @pytest.fixture
    def gni(self, df):
        # Same grouping with as_index=False.
        return df.groupby("A", as_index=False)

    def test_describe(self, df, gb, gni):
        # describe
        stat_names = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
        expected = DataFrame(
            [
                [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
                [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
            ],
            index=Index([1, 3], name="A"),
            columns=MultiIndex(
                levels=[["B"], stat_names],
                codes=[[0] * 8, list(range(8))],
            ),
        )
        tm.assert_frame_equal(gb.describe(), expected)

        # With as_index=False the group key comes back as a regular column.
        expected_flat = expected.reset_index()
        tm.assert_frame_equal(gni.describe(), expected_flat)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [int, float, object])
@pytest.mark.parametrize(
    "kwargs",
    [
        {"percentiles": [0.10, 0.20, 0.30], "include": "all", "exclude": None},
        {"percentiles": [0.10, 0.20, 0.30], "include": None, "exclude": ["int"]},
        {"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None},
    ],
)
def test_groupby_empty_dataset(dtype, kwargs):
    # GH#41575
    # describe() on an empty grouping is compared against the non-empty
    # result truncated to zero rows.
    df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype)
    df["B"] = df["B"].astype(int)
    df["C"] = df["C"].astype(float)

    # Frame-level describe.
    populated = df.groupby("A").describe(**kwargs)
    expected = populated.reset_index(drop=True).iloc[:0]
    result = df.iloc[:0].groupby("A").describe(**kwargs)
    tm.assert_frame_equal(result, expected)

    # Single-column describe.
    populated = df.groupby("A").B.describe(**kwargs)
    expected = populated.reset_index(drop=True).iloc[:0]
    expected.index = Index([])
    result = df.iloc[:0].groupby("A").B.describe(**kwargs)
    tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,255 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
NaT,
|
||||
Series,
|
||||
Timedelta,
|
||||
Timestamp,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_group_shift_with_null_key():
    # This test is designed to replicate the segfault in issue #13813.
    n_rows = 1200

    # Build a moderately large frame in which column `B` is missing on every
    # third row, then group by [`A`, `B`]. Per the original note this should
    # force `-1` in the `labels` array of `g._grouper.group_info` exactly
    # where the group-by key is partially missing.
    rows = [(i % 12, (i % 3) or np.nan, i) for i in range(n_rows)]
    df = DataFrame(rows, dtype=float, columns=["A", "B", "Z"], index=None)

    # Rows with a missing key, and the last 12 rows, have nothing to pull in.
    cutoff = n_rows - 12
    shifted_z = [
        (i + 12 if i % 3 and i < cutoff else np.nan) for i in range(n_rows)
    ]
    expected = DataFrame(shifted_z, dtype=float, columns=["Z"], index=None)

    result = df.groupby(["A", "B"]).shift(-1)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_group_shift_with_fill_value():
    # GH #24128
    # Slots vacated by a grouped shift take the supplied fill_value.
    n_rows = 24
    rows = [(i % 12, i % 3, i) for i in range(n_rows)]
    df = DataFrame(rows, dtype=float, columns=["A", "B", "Z"], index=None)

    cutoff = n_rows - 12
    shifted_z = [(i + 12 if i < cutoff else 0) for i in range(n_rows)]
    expected = DataFrame(shifted_z, dtype=float, columns=["Z"], index=None)

    result = df.groupby(["A", "B"]).shift(-1, fill_value=0)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_group_shift_lose_timezone():
    # GH 30134
    # A zero-period grouped shift must hand back the tz-aware timestamp
    # unchanged, which is what the comparison against ``now_dt`` checks.
    # ``Timestamp.now("UTC")`` is used instead of the deprecated
    # ``Timestamp.utcnow()``; both give a UTC-aware "now", but the latter
    # emits a FutureWarning on newer pandas.
    now_dt = Timestamp.now("UTC").as_unit("ns")
    df = DataFrame({"a": [1, 1], "date": now_dt})
    result = df.groupby("a").shift(0).iloc[0]
    expected = Series({"date": now_dt}, name=result.name)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_group_diff_real_series(any_real_numpy_dtype):
|
||||
df = DataFrame(
|
||||
{"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]},
|
||||
dtype=any_real_numpy_dtype,
|
||||
)
|
||||
result = df.groupby("a")["b"].diff()
|
||||
exp_dtype = "float"
|
||||
if any_real_numpy_dtype in ["int8", "int16", "float32"]:
|
||||
exp_dtype = "float32"
|
||||
expected = Series([np.nan, np.nan, np.nan, 1.0, 3.0], dtype=exp_dtype, name="b")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_group_diff_real_frame(any_real_numpy_dtype):
|
||||
df = DataFrame(
|
||||
{
|
||||
"a": [1, 2, 3, 3, 2],
|
||||
"b": [1, 2, 3, 4, 5],
|
||||
"c": [1, 2, 3, 4, 6],
|
||||
},
|
||||
dtype=any_real_numpy_dtype,
|
||||
)
|
||||
result = df.groupby("a").diff()
|
||||
exp_dtype = "float"
|
||||
if any_real_numpy_dtype in ["int8", "int16", "float32"]:
|
||||
exp_dtype = "float32"
|
||||
expected = DataFrame(
|
||||
{
|
||||
"b": [np.nan, np.nan, np.nan, 1.0, 3.0],
|
||||
"c": [np.nan, np.nan, np.nan, 1.0, 4.0],
|
||||
},
|
||||
dtype=exp_dtype,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data",
|
||||
[
|
||||
[
|
||||
Timestamp("2013-01-01"),
|
||||
Timestamp("2013-01-02"),
|
||||
Timestamp("2013-01-03"),
|
||||
],
|
||||
[Timedelta("5 days"), Timedelta("6 days"), Timedelta("7 days")],
|
||||
],
|
||||
)
|
||||
def test_group_diff_datetimelike(data, unit):
|
||||
df = DataFrame({"a": [1, 2, 2], "b": data})
|
||||
df["b"] = df["b"].dt.as_unit(unit)
|
||||
result = df.groupby("a")["b"].diff()
|
||||
expected = Series([NaT, NaT, Timedelta("1 days")], name="b").dt.as_unit(unit)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_group_diff_bool():
    # Grouped diff on a boolean column: NaN for the first row of each group,
    # False where consecutive values inside a group are equal.
    flags = [True, True, False, False, True]
    df = DataFrame({"a": [1, 2, 3, 3, 2], "b": flags})
    expected = Series([np.nan, np.nan, np.nan, False, False], name="b")
    tm.assert_series_equal(df.groupby("a")["b"].diff(), expected)
|
||||
|
||||
|
||||
def test_group_diff_object_raises(object_dtype):
|
||||
df = DataFrame(
|
||||
{"a": ["foo", "bar", "bar"], "b": ["baz", "foo", "foo"]}, dtype=object_dtype
|
||||
)
|
||||
with pytest.raises(TypeError, match=r"unsupported operand type\(s\) for -"):
|
||||
df.groupby("a")["b"].diff()
|
||||
|
||||
|
||||
def test_empty_shift_with_fill():
    # GH 41264, single-index check
    # On an empty frame, shifting with and without fill_value must agree.
    empty = DataFrame(columns=["a", "b", "c"])
    plain = empty.groupby(["a"]).shift(1)
    filled = empty.groupby(["a"]).shift(1, fill_value=0)
    tm.assert_frame_equal(plain, filled)
    tm.assert_index_equal(plain.index, filled.index)
|
||||
|
||||
|
||||
def test_multindex_empty_shift_with_fill():
    # GH 41264, multi-index check
    # Same as the single-key variant, but grouping on two columns.
    empty = DataFrame(columns=["a", "b", "c"])
    keys = ["a", "b"]
    plain = empty.groupby(keys).shift(1)
    filled = empty.groupby(keys).shift(1, fill_value=0)
    tm.assert_frame_equal(plain, filled)
    tm.assert_index_equal(plain.index, filled.index)
|
||||
|
||||
|
||||
def test_shift_periods_freq():
    # GH 54093
    # With a freq, the shift moves the index (two days back here) and leaves
    # the values untouched.
    data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
    original_index = date_range(start="20100101", periods=6)
    shifted_index = date_range(start="2009-12-30", periods=6)

    df = DataFrame(data, index=original_index)
    expected = DataFrame(data, index=shifted_index)

    result = df.groupby(df.index).shift(periods=-2, freq="D")
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_shift_deprecate_freq_and_fill_value():
|
||||
# GH 53832
|
||||
data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
|
||||
df = DataFrame(data, index=date_range(start="20100101", periods=6))
|
||||
msg = (
|
||||
"Passing a 'freq' together with a 'fill_value' silently ignores the fill_value"
|
||||
)
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
df.groupby(df.index).shift(periods=-2, freq="D", fill_value="1")
|
||||
|
||||
|
||||
def test_shift_disallow_suffix_if_periods_is_int():
|
||||
# GH#44424
|
||||
data = {"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]}
|
||||
df = DataFrame(data)
|
||||
msg = "Cannot specify `suffix` if `periods` is an int."
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
df.groupby("b").shift(1, suffix="fails")
|
||||
|
||||
|
||||
def test_group_shift_with_multiple_periods():
|
||||
# GH#44424
|
||||
df = DataFrame({"a": [1, 2, 3, 3, 2], "b": [True, True, False, False, True]})
|
||||
|
||||
shifted_df = df.groupby("b")[["a"]].shift([0, 1])
|
||||
expected_df = DataFrame(
|
||||
{"a_0": [1, 2, 3, 3, 2], "a_1": [np.nan, 1.0, np.nan, 3.0, 2.0]}
|
||||
)
|
||||
tm.assert_frame_equal(shifted_df, expected_df)
|
||||
|
||||
# series
|
||||
shifted_series = df.groupby("b")["a"].shift([0, 1])
|
||||
tm.assert_frame_equal(shifted_series, expected_df)
|
||||
|
||||
|
||||
def test_group_shift_with_multiple_periods_and_freq():
|
||||
# GH#44424
|
||||
df = DataFrame(
|
||||
{"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
|
||||
index=date_range("1/1/2000", periods=5, freq="h"),
|
||||
)
|
||||
shifted_df = df.groupby("b")[["a"]].shift(
|
||||
[0, 1],
|
||||
freq="h",
|
||||
)
|
||||
expected_df = DataFrame(
|
||||
{
|
||||
"a_0": [1.0, 2.0, 3.0, 4.0, 5.0, np.nan],
|
||||
"a_1": [
|
||||
np.nan,
|
||||
1.0,
|
||||
2.0,
|
||||
3.0,
|
||||
4.0,
|
||||
5.0,
|
||||
],
|
||||
},
|
||||
index=date_range("1/1/2000", periods=6, freq="h"),
|
||||
)
|
||||
tm.assert_frame_equal(shifted_df, expected_df)
|
||||
|
||||
|
||||
def test_group_shift_with_multiple_periods_and_fill_value():
|
||||
# GH#44424
|
||||
df = DataFrame(
|
||||
{"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
|
||||
)
|
||||
shifted_df = df.groupby("b")[["a"]].shift([0, 1], fill_value=-1)
|
||||
expected_df = DataFrame(
|
||||
{"a_0": [1, 2, 3, 4, 5], "a_1": [-1, 1, -1, 3, 2]},
|
||||
)
|
||||
tm.assert_frame_equal(shifted_df, expected_df)
|
||||
|
||||
|
||||
def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated():
|
||||
# GH#44424
|
||||
df = DataFrame(
|
||||
{"a": [1, 2, 3, 4, 5], "b": [True, True, False, False, True]},
|
||||
index=date_range("1/1/2000", periods=5, freq="h"),
|
||||
)
|
||||
msg = (
|
||||
"Passing a 'freq' together with a 'fill_value' silently ignores the "
|
||||
"fill_value"
|
||||
)
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="h")
|
@ -0,0 +1,78 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "in_vals, out_vals",
    [
        # Basics: strictly increasing (T), strictly decreasing (F),
        # abs val increasing (F), non-strictly increasing (T)
        ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]),
        # Test with inf vals
        (
            [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
            [True, False, True, False],
        ),
        # Test with nan vals; should always be False
        (
            [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
            [False, False, False, False],
        ),
    ],
)
def test_is_monotonic_increasing(in_vals, out_vals):
    # GH 17015
    # One boolean per group "a".."d" telling whether column C is
    # monotonically increasing inside that group.
    df = DataFrame(
        {
            "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
            "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
            "C": in_vals,
        }
    )
    result = df.groupby("B").C.is_monotonic_increasing

    literal_expected = Series(
        index=Index(list("abcd"), name="B"), data=out_vals, name="C"
    )
    tm.assert_series_equal(result, literal_expected)

    # Also check result equal to manually taking x.is_monotonic_increasing.
    applied_expected = df.groupby(["B"]).C.apply(
        lambda x: x.is_monotonic_increasing
    )
    tm.assert_series_equal(result, applied_expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "in_vals, out_vals",
    [
        # Basics: strictly decreasing (T), strictly increasing (F),
        # abs val decreasing (F), non-strictly decreasing (T)
        ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]),
        # Test with inf vals
        (
            [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
            [True, True, False, True],
        ),
        # Test with nan vals; should always be False
        (
            [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
            [False, False, False, False],
        ),
    ],
)
def test_is_monotonic_decreasing(in_vals, out_vals):
    # GH 17015
    # One boolean per group "a".."d" telling whether column C is
    # monotonically decreasing inside that group.
    source_dict = {
        "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
        "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
        "C": in_vals,
    }

    df = DataFrame(source_dict)
    result = df.groupby("B").C.is_monotonic_decreasing
    index = Index(list("abcd"), name="B")
    expected = Series(index=index, data=out_vals, name="C")
    tm.assert_series_equal(result, expected)

    # Also check result equal to manually taking x.is_monotonic_decreasing,
    # mirroring the cross-check done in test_is_monotonic_increasing.
    expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_decreasing)
    tm.assert_series_equal(result, expected)
|
@ -0,0 +1,115 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
MultiIndex,
|
||||
Series,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_nlargest():
    # Per-group nlargest: results are indexed by (group label, original
    # position) and ordered from largest to smallest within each group.
    labels = Series(list("a" * 5 + "b" * 5))

    values = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    expected = Series(
        [7, 5, 3, 10, 9, 6],
        index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]),
    )
    tm.assert_series_equal(values.groupby(labels).nlargest(3), expected)

    # With ties, keep="last" prefers the later occurrences.
    tied = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    expected = Series(
        [3, 2, 1, 3, 3, 2],
        index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]),
    )
    tm.assert_series_equal(tied.groupby(labels).nlargest(3, keep="last"), expected)
|
||||
|
||||
|
||||
def test_nlargest_mi_grouper():
    # see gh-21411
    # Grouping a MultiIndexed Series by one of its levels and taking the
    # single largest value per group; the grouping level shows up twice in
    # the result index.
    rng = np.random.default_rng(2)

    dts = date_range("20180101", periods=10)
    idx = MultiIndex.from_product(
        [dts, ["one", "two"]], names=["first", "second"]
    )
    s = Series(rng.standard_normal(20), index=idx)

    result = s.groupby("first").nlargest(1)

    # Which "second" label holds the per-day maximum for this seed.
    winners = ["one", "one", "one", "two", "one", "one", "one", "one", "one", "one"]
    exp_idx = MultiIndex.from_tuples(
        [(day, day, label) for day, label in zip(dts, winners)],
        names=["first", "first", "second"],
    )
    exp_values = [
        0.18905338179353307,
        -0.41306354339189344,
        1.799707382720902,
        0.7738065867276614,
        0.28121066979764925,
        0.9775674511260357,
        -0.3288239040579627,
        0.45495807124085547,
        0.5452887139646817,
        0.12682784711186987,
    ]
    expected = Series(exp_values, index=exp_idx)

    tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3)
|
||||
|
||||
|
||||
def test_nsmallest():
    # Per-group nsmallest: results are indexed by (group label, original
    # position) and ordered from smallest to largest within each group.
    labels = Series(list("a" * 5 + "b" * 5))

    values = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    expected = Series(
        [1, 2, 3, 0, 4, 6],
        index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]),
    )
    tm.assert_series_equal(values.groupby(labels).nsmallest(3), expected)

    # With ties, keep="last" prefers the later occurrences.
    tied = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    expected = Series(
        [0, 1, 1, 0, 1, 2],
        index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]),
    )
    tm.assert_series_equal(tied.groupby(labels).nsmallest(3, keep="last"), expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data, groups",
    [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])],
)
@pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES])
@pytest.mark.parametrize("method", ["nlargest", "nsmallest"])
def test_nlargest_and_smallest_noop(data, groups, dtype, method):
    # GH 15272, GH 16345, GH 29129
    # Test nlargest/smallest when it results in a noop,
    # i.e. input is sorted and group size <= n
    if dtype is not None:
        data = np.array(data, dtype=dtype)
    if method == "nlargest":
        # nlargest needs descending input for the call to be a noop.
        data = list(reversed(data))
    ser = Series(data, name="a")

    selector = getattr(ser.groupby(groups), method)
    result = selector(n=2)

    if isinstance(groups, list):
        expidx = np.array(groups, dtype=int)
    else:
        expidx = groups
    expected_index = MultiIndex.from_arrays([expidx, ser.index])
    expected = Series(data, index=expected_index, name="a")

    tm.assert_series_equal(result, expected)
|
@ -0,0 +1,921 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
Timestamp,
|
||||
isna,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_first_last_nth(df):
|
||||
# tests for first / last / nth
|
||||
grouped = df.groupby("A")
|
||||
first = grouped.first()
|
||||
expected = df.loc[[1, 0], ["B", "C", "D"]]
|
||||
expected.index = Index(["bar", "foo"], name="A")
|
||||
expected = expected.sort_index()
|
||||
tm.assert_frame_equal(first, expected)
|
||||
|
||||
nth = grouped.nth(0)
|
||||
expected = df.loc[[0, 1]]
|
||||
tm.assert_frame_equal(nth, expected)
|
||||
|
||||
last = grouped.last()
|
||||
expected = df.loc[[5, 7], ["B", "C", "D"]]
|
||||
expected.index = Index(["bar", "foo"], name="A")
|
||||
tm.assert_frame_equal(last, expected)
|
||||
|
||||
nth = grouped.nth(-1)
|
||||
expected = df.iloc[[5, 7]]
|
||||
tm.assert_frame_equal(nth, expected)
|
||||
|
||||
nth = grouped.nth(1)
|
||||
expected = df.iloc[[2, 3]]
|
||||
tm.assert_frame_equal(nth, expected)
|
||||
|
||||
# it works!
|
||||
grouped["B"].first()
|
||||
grouped["B"].last()
|
||||
grouped["B"].nth(0)
|
||||
|
||||
df = df.copy()
|
||||
df.loc[df["A"] == "foo", "B"] = np.nan
|
||||
grouped = df.groupby("A")
|
||||
assert isna(grouped["B"].first()["foo"])
|
||||
assert isna(grouped["B"].last()["foo"])
|
||||
assert isna(grouped["B"].nth(0).iloc[0])
|
||||
|
||||
# v0.14.0 whatsnew
|
||||
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
|
||||
g = df.groupby("A")
|
||||
result = g.first()
|
||||
expected = df.iloc[[1, 2]].set_index("A")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = df.iloc[[1, 2]]
|
||||
result = g.nth(0, dropna="any")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["first", "last"])
|
||||
def test_first_last_with_na_object(method, nulls_fixture):
|
||||
# https://github.com/pandas-dev/pandas/issues/32123
|
||||
groups = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby("a")
|
||||
result = getattr(groups, method)()
|
||||
|
||||
if method == "first":
|
||||
values = [1, 3]
|
||||
else:
|
||||
values = [2, 3]
|
||||
|
||||
values = np.array(values, dtype=result["b"].dtype)
|
||||
idx = Index([1, 2], name="a")
|
||||
expected = DataFrame({"b": values}, index=idx)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("index", [0, -1])
|
||||
def test_nth_with_na_object(index, nulls_fixture):
|
||||
# https://github.com/pandas-dev/pandas/issues/32123
|
||||
df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]})
|
||||
groups = df.groupby("a")
|
||||
result = groups.nth(index)
|
||||
expected = df.iloc[[0, 2]] if index == 0 else df.iloc[[1, 3]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["first", "last"])
def test_first_last_with_None(method):
    # https://github.com/pandas-dev/pandas/issues/32800
    # None should be preserved as object dtype
    df = DataFrame.from_dict({"id": ["a"], "value": [None]})
    reducer = getattr(df.groupby("id", as_index=False), method)

    # A single one-row group: the reduction must hand the frame back as is.
    tm.assert_frame_equal(reducer(), df)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["first", "last"])
@pytest.mark.parametrize(
    "df, expected",
    [
        (
            DataFrame({"id": "a", "value": [None, "foo", np.nan]}),
            DataFrame({"value": ["foo"]}, index=Index(["a"], name="id")),
        ),
        (
            DataFrame({"id": "a", "value": [np.nan]}, dtype=object),
            DataFrame({"value": [None]}, index=Index(["a"], name="id")),
        ),
    ],
)
def test_first_last_with_None_expanded(method, df, expected):
    # GH 32800, 38286
    # Both cases contain a single group "a"; the parametrized frames spell
    # out the expected reduction of its nulls.
    by_id = df.groupby("id")
    reducer = getattr(by_id, method)
    tm.assert_frame_equal(reducer(), expected)
|
||||
|
||||
|
||||
def test_first_last_nth_dtypes():
    # first / last / nth on a frame mixing object, float64, float32, bool
    # and int columns.
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
            "C": np.random.default_rng(2).standard_normal(8),
            "D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"),
        }
    )
    df["E"] = True
    df["F"] = 1

    value_cols = ["B", "C", "D", "E", "F"]
    grouped = df.groupby("A")

    def _rows_keyed_by_group(row_labels):
        # Pick the given rows and relabel them bar/foo, sorted by label.
        picked = df.loc[row_labels, value_cols]
        picked.index = Index(["bar", "foo"], name="A")
        return picked.sort_index()

    # tests for first / last / nth
    tm.assert_frame_equal(grouped.first(), _rows_keyed_by_group([1, 0]))
    tm.assert_frame_equal(grouped.last(), _rows_keyed_by_group([5, 7]))
    tm.assert_frame_equal(grouped.nth(1), df.iloc[[2, 3]])
|
||||
|
||||
|
||||
def test_first_last_nth_dtypes2():
    # GH 2763, first/last shifting dtypes
    # Index label 9 is duplicated, so the level-0 grouping has one group of
    # two rows; first() must keep the integer dtype.
    labels = [*range(10), 9]
    ser = Series(data=range(11), index=labels, name="IntCol")
    assert ser.dtype == "int64"
    firsts = ser.groupby(level=0).first()
    assert firsts.dtype == "int64"
|
||||
|
||||
|
||||
def test_first_last_nth_nan_dtype():
    # GH 33591
    # An all-None object column must pass through first / last / nth
    # unchanged.
    df = DataFrame({"data": ["A"], "nans": Series([None], dtype=object)})
    nans_by_group = df.groupby("data").nans

    # first / last are keyed by the group label ...
    keyed = df.set_index("data").nans
    for reduced in (nans_by_group.first(), nans_by_group.last()):
        tm.assert_series_equal(reduced, keyed)

    # ... while nth keeps the original row labels.
    for position in (-1, 0):
        tm.assert_series_equal(nans_by_group.nth(position), df.nans)
|
||||
|
||||
|
||||
def test_first_strings_timestamps():
    """Check ``first`` on a frame whose column labels mix Timestamps and strings."""
    # GH 11244
    test = DataFrame(
        {
            Timestamp("2012-01-01 00:00:00"): ["a", "b"],
            Timestamp("2012-01-02 00:00:00"): ["c", "d"],
            "name": ["e", "e"],
            "aaaa": ["f", "g"],
        }
    )
    result = test.groupby("name").first()
    expected = DataFrame(
        [["a", "c", "f"]],
        columns=Index([Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]),
        index=Index(["e"], name="name"),
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_nth():
    """Check ``nth`` for positive, negative and out-of-range positions.

    Covers DataFrameGroupBy, SeriesGroupBy, a column selection, and the
    ``dropna="any"`` variant.
    """
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    gb = df.groupby("A")

    tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 2]])
    tm.assert_frame_equal(gb.nth(1), df.iloc[[1]])
    tm.assert_frame_equal(gb.nth(2), df.loc[[]])
    tm.assert_frame_equal(gb.nth(-1), df.iloc[[1, 2]])
    tm.assert_frame_equal(gb.nth(-2), df.iloc[[0]])
    tm.assert_frame_equal(gb.nth(-3), df.loc[[]])
    tm.assert_series_equal(gb.B.nth(0), df.B.iloc[[0, 2]])
    tm.assert_series_equal(gb.B.nth(1), df.B.iloc[[1]])
    tm.assert_frame_equal(gb[["B"]].nth(0), df[["B"]].iloc[[0, 2]])

    tm.assert_frame_equal(gb.nth(0, dropna="any"), df.iloc[[1, 2]])
    tm.assert_frame_equal(gb.nth(-1, dropna="any"), df.iloc[[1, 2]])

    tm.assert_frame_equal(gb.nth(7, dropna="any"), df.iloc[:0])
    tm.assert_frame_equal(gb.nth(2, dropna="any"), df.iloc[:0])
|
||||
|
||||
|
||||
def test_nth2():
    """Check ``nth`` with ``as_index=False`` at and past the largest group size."""
    # out of bounds, regression from 0.13.1
    # GH 6621
    df = DataFrame(
        {
            "color": {0: "green", 1: "green", 2: "red", 3: "red", 4: "red"},
            "food": {0: "ham", 1: "eggs", 2: "eggs", 3: "ham", 4: "pork"},
            "two": {
                0: 1.5456590000000001,
                1: -0.070345000000000005,
                2: -2.4004539999999999,
                3: 0.46206000000000003,
                4: 0.52350799999999997,
            },
            "one": {
                0: 0.56573799999999996,
                1: -0.9742360000000001,
                2: 1.033801,
                3: -0.78543499999999999,
                4: 0.70422799999999997,
            },
        }
    ).set_index(["color", "food"])

    result = df.groupby(level=0, as_index=False).nth(2)
    expected = df.iloc[[-1]]
    tm.assert_frame_equal(result, expected)

    result = df.groupby(level=0, as_index=False).nth(3)
    expected = df.loc[[]]
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_nth3():
    """Check ``first`` against ``apply(iloc[0])`` and that Series ``nth`` rejects ``dropna=True``."""
    # GH 7559
    # from the vbench
    df = DataFrame(np.random.default_rng(2).integers(1, 10, (100, 2)), dtype="int64")
    ser = df[1]
    gb = df[0]
    expected = ser.groupby(gb).first()
    expected2 = ser.groupby(gb).apply(lambda x: x.iloc[0])
    tm.assert_series_equal(expected2, expected, check_names=False)
    assert expected.name == 1
    assert expected2.name == 1

    # validate first
    v = ser[gb == 1].iloc[0]
    assert expected.iloc[0] == v
    assert expected2.iloc[0] == v

    with pytest.raises(ValueError, match="For a DataFrame"):
        ser.groupby(gb, sort=False).nth(0, dropna=True)
|
||||
|
||||
|
||||
def test_nth4():
    """Check SeriesGroupBy ``nth(0, dropna="all")`` on a column with a leading NaN."""
    # doc example
    df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
    gb = df.groupby("A")
    result = gb.B.nth(0, dropna="all")
    expected = df.B.iloc[[1, 2]]
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_nth5():
    """Check ``nth`` with a scalar and with lists of positions."""
    # test multiple nth values
    df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"])
    gb = df.groupby("A")

    tm.assert_frame_equal(gb.nth(0), df.iloc[[0, 3]])
    tm.assert_frame_equal(gb.nth([0]), df.iloc[[0, 3]])
    tm.assert_frame_equal(gb.nth([0, 1]), df.iloc[[0, 1, 3, 4]])
    tm.assert_frame_equal(gb.nth([0, -1]), df.iloc[[0, 2, 3, 4]])
    tm.assert_frame_equal(gb.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]])
    tm.assert_frame_equal(gb.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]])
    tm.assert_frame_equal(gb.nth([2]), df.iloc[[2]])
    tm.assert_frame_equal(gb.nth([3, 4]), df.loc[[]])
|
||||
|
||||
|
||||
def test_nth_bdays(unit):
    """Check ``nth`` with several positions on a business-day index grouped by year and month.

    ``unit`` is supplied by a fixture and is passed to ``date_range`` and
    ``as_unit``.
    """
    business_dates = pd.date_range(
        start="4/1/2014", end="6/30/2014", freq="B", unit=unit
    )
    df = DataFrame(1, index=business_dates, columns=["a", "b"])
    # get the first, fourth and last two business days for each month
    key = [df.index.year, df.index.month]
    result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
    expected_dates = pd.to_datetime(
        [
            "2014/4/1",
            "2014/4/4",
            "2014/4/29",
            "2014/4/30",
            "2014/5/1",
            "2014/5/6",
            "2014/5/29",
            "2014/5/30",
            "2014/6/2",
            "2014/6/5",
            "2014/6/27",
            "2014/6/30",
        ]
    ).as_unit(unit)
    expected = DataFrame(1, columns=["a", "b"], index=expected_dates)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_nth_multi_grouper(three_group):
    """Check ``nth(0)`` when grouping the ``three_group`` fixture by two columns."""
    # PR 9090, related to issue 8979
    # test nth on multiple groupers
    grouped = three_group.groupby(["A", "B"])
    result = grouped.nth(0)
    expected = three_group.iloc[[0, 3, 4, 7]]
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data, expected_first, expected_last",
    [
        (
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
            {
                "id": ["A"],
                "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                "foo": [1],
            },
        ),
        (
            {
                "id": ["A", "B", "A"],
                "time": [
                    Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                    Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
                ],
                "foo": [1, 2, 3],
            },
            {
                "id": ["A", "B"],
                "time": [
                    Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                ],
                "foo": [1, 2],
            },
            {
                "id": ["A", "B"],
                "time": [
                    Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
                    Timestamp("2012-02-01 14:00:00", tz="US/Central"),
                ],
                "foo": [3, 2],
            },
        ),
    ],
)
def test_first_last_tz(data, expected_first, expected_last):
    """Check ``first``/``last`` with ``as_index=False`` on tz-aware timestamps.

    Each case supplies the input ``data`` plus the frames expected from
    ``first`` and ``last``; both the whole-frame and the ``"time"``-column
    selections are compared.
    """
    # GH15884
    # Test that the timezone is retained when calling first
    # or last on groupby with as_index=False

    df = DataFrame(data)

    result = df.groupby("id", as_index=False).first()
    expected = DataFrame(expected_first)
    cols = ["id", "time", "foo"]
    tm.assert_frame_equal(result[cols], expected[cols])

    result = df.groupby("id", as_index=False)["time"].first()
    tm.assert_frame_equal(result, expected[["id", "time"]])

    result = df.groupby("id", as_index=False).last()
    expected = DataFrame(expected_last)
    cols = ["id", "time", "foo"]
    tm.assert_frame_equal(result[cols], expected[cols])

    result = df.groupby("id", as_index=False)["time"].last()
    tm.assert_frame_equal(result, expected[["id", "time"]])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method, ts, alpha",
    [
        ["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"],
        ["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"],
    ],
)
def test_first_last_tz_multi_column(method, ts, alpha, unit):
    """Check ``first``/``last`` on a frame with categorical and tz-aware columns.

    ``method`` names the groupby method to call; ``ts`` and ``alpha`` are the
    values expected for group 1. ``unit`` is supplied by a fixture.
    """
    # GH 21603
    category_string = Series(list("abc")).astype("category")
    dti = pd.date_range("20130101", periods=3, tz="US/Eastern", unit=unit)
    df = DataFrame(
        {
            "group": [1, 1, 2],
            "category_string": category_string,
            "datetimetz": dti,
        }
    )
    result = getattr(df.groupby("group"), method)()
    expected = DataFrame(
        {
            "category_string": pd.Categorical(
                [alpha, "c"], dtype=category_string.dtype
            ),
            "datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")],
        },
        index=Index([1, 2], name="group"),
    )
    # align the expected column's resolution with the parametrized unit
    expected["datetimetz"] = expected["datetimetz"].dt.as_unit(unit)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values",
    [
        pd.array([True, False], dtype="boolean"),
        pd.array([1, 2], dtype="Int64"),
        pd.to_datetime(["2020-01-01", "2020-02-01"]),
        pd.to_timedelta([1, 2], unit="D"),
    ],
)
@pytest.mark.parametrize("function", ["first", "last", "min", "max"])
def test_first_last_extension_array_keeps_dtype(values, function):
    """Check that a reduction over single-row groups returns the input values unchanged.

    The comparison is made both for the SeriesGroupBy method named by
    ``function`` and for ``agg({"b": function})`` on the frame.
    """
    # https://github.com/pandas-dev/pandas/issues/33071
    # https://github.com/pandas-dev/pandas/issues/32194
    df = DataFrame({"a": [1, 2], "b": values})
    grouped = df.groupby("a")
    idx = Index([1, 2], name="a")
    expected_series = Series(values, name="b", index=idx)
    expected_frame = DataFrame({"b": values}, index=idx)

    result_series = getattr(grouped["b"], function)()
    tm.assert_series_equal(result_series, expected_series)

    result_frame = grouped.agg({"b": function})
    tm.assert_frame_equal(result_frame, expected_frame)
|
||||
|
||||
|
||||
def test_nth_multi_index_as_expected():
    """Check ``nth(0)`` when grouping an 11-row frame by columns ``A`` and ``B``."""
    # PR 9090, related to issue 8979
    # test nth on MultiIndex
    three_group = DataFrame(
        {
            "A": [
                "foo",
                "foo",
                "foo",
                "foo",
                "bar",
                "bar",
                "bar",
                "bar",
                "foo",
                "foo",
                "foo",
            ],
            "B": [
                "one",
                "one",
                "one",
                "two",
                "one",
                "one",
                "one",
                "two",
                "two",
                "two",
                "one",
            ],
            "C": [
                "dull",
                "dull",
                "shiny",
                "dull",
                "dull",
                "shiny",
                "shiny",
                "dull",
                "shiny",
                "shiny",
                "shiny",
            ],
        }
    )
    grouped = three_group.groupby(["A", "B"])
    result = grouped.nth(0)
    expected = three_group.iloc[[0, 3, 4, 7]]
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "op, n, expected_rows",
    [
        ("head", -1, [0]),
        ("head", 0, []),
        ("head", 1, [0, 2]),
        ("head", 7, [0, 1, 2]),
        ("tail", -1, [1]),
        ("tail", 0, []),
        ("tail", 1, [1, 2]),
        ("tail", 7, [0, 1, 2]),
    ],
)
@pytest.mark.parametrize("columns", [None, [], ["A"], ["B"], ["A", "B"]])
@pytest.mark.parametrize("as_index", [True, False])
def test_groupby_head_tail(op, n, expected_rows, columns, as_index):
    """Check ``head``/``tail`` for various ``n``, column selections and ``as_index``.

    ``expected_rows`` lists the positional rows of the input frame expected
    in the result.
    """
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
    g = df.groupby("A", as_index=as_index)
    expected = df.iloc[expected_rows]
    if columns is not None:
        # apply the same column selection to the groupby and the expectation
        g = g[columns]
        expected = expected[columns]
    result = getattr(g, op)(n)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "op, n, expected_cols",
    [
        ("head", -1, [0]),
        ("head", 0, []),
        ("head", 1, [0, 2]),
        ("head", 7, [0, 1, 2]),
        ("tail", -1, [1]),
        ("tail", 0, []),
        ("tail", 1, [1, 2]),
        ("tail", 7, [0, 1, 2]),
    ],
)
def test_groupby_head_tail_axis_1(op, n, expected_cols):
    """Check ``head``/``tail`` on a column-wise (``axis=1``) groupby.

    Creating the groupby is expected to emit a ``FutureWarning``;
    ``expected_cols`` lists the positional columns expected in the result.
    """
    # GH 9772
    df = DataFrame(
        [[1, 2, 3], [1, 4, 5], [2, 6, 7], [3, 8, 9]], columns=["A", "B", "C"]
    )
    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        g = df.groupby([0, 0, 1], axis=1)
    expected = df.iloc[:, expected_cols]
    result = getattr(g, op)(n)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_group_selection_cache():
    """Check that ``nth``, ``head`` and ``tail`` give the same results in any call order."""
    # GH 12839 nth, head, and tail should return same result consistently
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
    expected = df.iloc[[0, 2]]

    # head then nth
    g = df.groupby("A")
    result1 = g.head(n=2)
    result2 = g.nth(0)
    tm.assert_frame_equal(result1, df)
    tm.assert_frame_equal(result2, expected)

    # tail then nth
    g = df.groupby("A")
    result1 = g.tail(n=2)
    result2 = g.nth(0)
    tm.assert_frame_equal(result1, df)
    tm.assert_frame_equal(result2, expected)

    # nth then head
    g = df.groupby("A")
    result1 = g.nth(0)
    result2 = g.head(n=2)
    tm.assert_frame_equal(result1, expected)
    tm.assert_frame_equal(result2, df)

    # nth then tail
    g = df.groupby("A")
    result1 = g.nth(0)
    result2 = g.tail(n=2)
    tm.assert_frame_equal(result1, expected)
    tm.assert_frame_equal(result2, df)
|
||||
|
||||
|
||||
def test_nth_empty():
    """Check that an out-of-range ``nth`` on a value-less frame returns an empty frame."""
    # GH 16064
    df = DataFrame(index=[0], columns=["a", "b", "c"])
    result = df.groupby("a").nth(10)
    expected = df.iloc[:0]
    tm.assert_frame_equal(result, expected)

    result = df.groupby(["a", "b"]).nth(10)
    expected = df.iloc[:0]
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_nth_column_order():
    """Check ``nth`` on a frame with columns ordered ``A``, ``C``, ``B``, with and without ``dropna``."""
    # GH 20760
    # Check that nth preserves column order
    df = DataFrame(
        [[1, "b", 100], [1, "a", 50], [1, "a", np.nan], [2, "c", 200], [2, "d", 150]],
        columns=["A", "C", "B"],
    )
    result = df.groupby("A").nth(0)
    expected = df.iloc[[0, 3]]
    tm.assert_frame_equal(result, expected)

    result = df.groupby("A").nth(-1, dropna="any")
    expected = df.iloc[[1, 4]]
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dropna", [None, "any", "all"])
def test_nth_nan_in_grouper(dropna):
    """Check DataFrameGroupBy ``nth(0)`` when the grouping column contains NaN keys."""
    # GH 26011
    df = DataFrame(
        {
            "a": [np.nan, "a", np.nan, "b", np.nan],
            "b": [0, 2, 4, 6, 8],
            "c": [1, 3, 5, 7, 9],
        }
    )
    result = df.groupby("a").nth(0, dropna=dropna)
    expected = df.iloc[[1, 3]]

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dropna", [None, "any", "all"])
def test_nth_nan_in_grouper_series(dropna):
    """Check SeriesGroupBy ``nth(0)`` when the grouping column contains NaN keys."""
    # GH 26454
    df = DataFrame(
        {
            "a": [np.nan, "a", np.nan, "b", np.nan],
            "b": [0, 2, 4, 6, 8],
        }
    )
    result = df.groupby("a")["b"].nth(0, dropna=dropna)
    expected = df["b"].iloc[[1, 3]]

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_first_categorical_and_datetime_data_nat():
    """Check ``first`` on a frame with an all-NaT datetime column and a categorical column."""
    # GH 20520
    df = DataFrame(
        {
            "group": ["first", "first", "second", "third", "third"],
            "time": 5 * [np.datetime64("NaT")],
            "categories": Series(["a", "b", "c", "a", "b"], dtype="category"),
        }
    )
    result = df.groupby("group").first()
    expected = DataFrame(
        {
            "time": 3 * [np.datetime64("NaT")],
            "categories": Series(["a", "c", "a"]).astype(
                pd.CategoricalDtype(["a", "b", "c"])
            ),
        }
    )
    expected.index = Index(["first", "second", "third"], name="group")
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_first_multi_key_groupby_categorical():
    """Check ``first`` with two grouping keys on a frame holding a categorical column."""
    # GH 22512
    df = DataFrame(
        {
            "A": [1, 1, 1, 2, 2],
            "B": [100, 100, 200, 100, 100],
            "C": ["apple", "orange", "mango", "mango", "orange"],
            "D": ["jupiter", "mercury", "mars", "venus", "venus"],
        }
    )
    df = df.astype({"D": "category"})
    result = df.groupby(by=["A", "B"]).first()
    expected = DataFrame(
        {
            "C": ["apple", "mango", "mango"],
            "D": Series(["jupiter", "mars", "venus"]).astype(
                pd.CategoricalDtype(["jupiter", "mars", "mercury", "venus"])
            ),
        }
    )
    expected.index = MultiIndex.from_tuples(
        [(1, 100), (1, 200), (2, 100)], names=["A", "B"]
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["first", "last", "nth"])
def test_groupby_last_first_nth_with_none(method, nulls_fixture):
    """Check ``first``/``last``/``nth`` on a single group with one non-null value.

    ``nulls_fixture`` supplies the null value placed around the one ``"y"``
    entry.
    """
    # GH29645
    expected = Series(["y"])
    data = Series(
        [nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture],
        index=[0, 0, 0, 0, 0],
    ).groupby(level=0)

    if method == "nth":
        # nth needs an explicit position; 3 is where "y" sits
        result = getattr(data, method)(3)
    else:
        result = getattr(data, method)()

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "arg, expected_rows",
    [
        [slice(None, 3, 2), [0, 1, 4, 5]],
        [slice(None, -2), [0, 2, 5]],
        [[slice(None, 2), slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
        [[0, 1, slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
    ],
)
def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows):
    """Check that ``nth[arg]`` and ``nth(arg)`` agree for slices and lists of slices."""
    # Test slices GH #42947

    result = slice_test_grouped.nth[arg]
    equivalent = slice_test_grouped.nth(arg)
    expected = slice_test_df.iloc[expected_rows]

    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(equivalent, expected)
|
||||
|
||||
|
||||
def test_nth_indexed(slice_test_df, slice_test_grouped):
    """Check that ``nth[0, 1, -2:]`` matches the equivalent ``nth([...])`` call."""
    # Test index notation GH #44688

    result = slice_test_grouped.nth[0, 1, -2:]
    equivalent = slice_test_grouped.nth([0, 1, slice(-2, None)])
    expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]

    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(equivalent, expected)
|
||||
|
||||
|
||||
def test_invalid_argument(slice_test_grouped):
    """Check that ``nth`` raises ``TypeError`` for a float argument."""
    # Test for error on invalid argument

    with pytest.raises(TypeError, match="Invalid index"):
        slice_test_grouped.nth(3.14)
|
||||
|
||||
|
||||
def test_negative_step(slice_test_grouped):
    """Check that ``nth`` raises ``ValueError`` for a slice with a negative step."""
    # Test for error on negative slice step

    with pytest.raises(ValueError, match="Invalid step"):
        slice_test_grouped.nth(slice(None, None, -1))
|
||||
|
||||
|
||||
def test_np_ints(slice_test_df, slice_test_grouped):
    """Check that ``nth`` accepts a NumPy integer array of positions."""
    # Test np ints work

    result = slice_test_grouped.nth(np.array([0, 1]))
    expected = slice_test_df.iloc[[0, 1, 2, 3, 4]]
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_nth_with_column_axis():
|
||||
# GH43926
|
||||
df = DataFrame(
|
||||
[
|
||||
[4, 5, 6],
|
||||
[8, 8, 7],
|
||||
],
|
||||
index=["z", "y"],
|
||||
columns=["C", "B", "A"],
|
||||
)
|
||||
msg = "DataFrame.groupby with axis=1 is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
gb = df.groupby(df.iloc[1], axis=1)
|
||||
result = gb.nth(0)
|
||||
expected = df.iloc[:, [0, 2]]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_nth_interval():
    """Check ``nth(0)`` when grouping by both levels of a MultiIndex of categorical intervals.

    The input codes include ``-1`` in the second level.
    """
    # GH#24205
    idx_result = MultiIndex(
        [
            pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]),
            pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]),
        ],
        [[0, 0, 0, 1, 1], [0, 1, 1, 0, -1]],
    )
    df_result = DataFrame({"col": range(len(idx_result))}, index=idx_result)
    result = df_result.groupby(level=[0, 1], observed=False).nth(0)
    val_expected = [0, 1, 3]
    idx_expected = MultiIndex(
        [
            pd.CategoricalIndex([pd.Interval(0, 1), pd.Interval(1, 2)]),
            pd.CategoricalIndex([pd.Interval(0, 10), pd.Interval(10, 20)]),
        ],
        [[0, 0, 1], [0, 1, 0]],
    )
    expected = DataFrame(val_expected, index=idx_expected, columns=["col"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "start, stop, expected_values, expected_columns",
    [
        (None, None, [0, 1, 2, 3, 4], list("ABCDE")),
        (None, 1, [0, 3], list("AD")),
        (None, 9, [0, 1, 2, 3, 4], list("ABCDE")),
        (None, -1, [0, 1, 3], list("ABD")),
        (1, None, [1, 2, 4], list("BCE")),
        (1, -1, [1], list("B")),
        (-1, None, [2, 4], list("CE")),
        (-1, 2, [4], list("E")),
    ],
)
@pytest.mark.parametrize("method", ["call", "index"])
def test_nth_slices_with_column_axis(
    start, stop, expected_values, expected_columns, method
):
    """Check slice-based ``nth`` on a column-wise (``axis=1``) groupby.

    ``method`` selects between ``nth(slice(start, stop))`` and
    ``nth[start:stop]``. Creating the groupby is expected to emit a
    ``FutureWarning``.
    """
    df = DataFrame([range(5)], columns=[list("ABCDE")])
    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        gb = df.groupby([5, 5, 5, 6, 6], axis=1)
    # dispatch on the parametrized calling convention
    result = {
        "call": lambda start, stop: gb.nth(slice(start, stop)),
        "index": lambda start, stop: gb.nth[start:stop],
    }[method](start, stop)
    expected = DataFrame([expected_values], columns=[expected_columns])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings(
    "ignore:invalid value encountered in remainder:RuntimeWarning"
)
def test_head_tail_dropna_true():
    """Check ``head``/``tail``/``nth`` with the default ``dropna`` when a key column has NaN.

    Only the row whose two keys are both non-null is expected in the result.
    """
    # GH#45089
    df = DataFrame(
        [["a", "z"], ["b", np.nan], ["c", np.nan], ["c", np.nan]], columns=["X", "Y"]
    )
    expected = DataFrame([["a", "z"]], columns=["X", "Y"])

    result = df.groupby(["X", "Y"]).head(n=1)
    tm.assert_frame_equal(result, expected)

    result = df.groupby(["X", "Y"]).tail(n=1)
    tm.assert_frame_equal(result, expected)

    result = df.groupby(["X", "Y"]).nth(n=0)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_head_tail_dropna_false():
    """Check ``head``/``tail``/``nth`` with ``dropna=False`` when a key column has NaN.

    Every input row is expected in the result.
    """
    # GH#45089
    df = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"])
    expected = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"])

    result = df.groupby(["X", "Y"], dropna=False).head(n=1)
    tm.assert_frame_equal(result, expected)

    result = df.groupby(["X", "Y"], dropna=False).tail(n=1)
    tm.assert_frame_equal(result, expected)

    result = df.groupby(["X", "Y"], dropna=False).nth(n=0)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("selection", ("b", ["b"], ["b", "c"]))
@pytest.mark.parametrize("dropna", ["any", "all", None])
def test_nth_after_selection(selection, dropna):
    """Check ``nth(0, dropna=...)`` after selecting one or more columns.

    The expected row for group 1 depends on ``dropna`` and on whether the
    selection includes column ``"c"``.
    """
    # GH#11038, GH#53518
    df = DataFrame(
        {
            "a": [1, 1, 2],
            "b": [np.nan, 3, 4],
            "c": [5, 6, 7],
        }
    )
    gb = df.groupby("a")[selection]
    result = gb.nth(0, dropna=dropna)
    if dropna == "any" or (dropna == "all" and selection != ["b", "c"]):
        locs = [1, 2]
    else:
        locs = [0, 2]
    expected = df.loc[locs, selection]
    tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data",
    [
        (
            Timestamp("2011-01-15 12:50:28.502376"),
            Timestamp("2011-01-20 12:50:28.593448"),
        ),
        (24650000000000001, 24650000000000002),
    ],
)
def test_groupby_nth_int_like_precision(data):
    """Check that ``nth(0)`` returns the first value exactly for Timestamps and large ints."""
    # GH#6620, GH#9311
    df = DataFrame({"a": [1, 1], "b": data})

    grouped = df.groupby("a")
    result = grouped.nth(0)
    expected = DataFrame({"a": 1, "b": [data[0]]})

    tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,496 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"]
)
@pytest.mark.parametrize(
    "a_vals,b_vals",
    [
        # Ints
        ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]),
        ([1, 2, 3, 4], [4, 3, 2, 1]),
        ([1, 2, 3, 4, 5], [4, 3, 2, 1]),
        # Floats
        ([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 4.0, 3.0, 2.0, 1.0]),
        # Missing data
        ([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]),
        ([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]),
        # Timestamps
        (
            pd.date_range("1/1/18", freq="D", periods=5),
            pd.date_range("1/1/18", freq="D", periods=5)[::-1],
        ),
        (
            pd.date_range("1/1/18", freq="D", periods=5).as_unit("s"),
            pd.date_range("1/1/18", freq="D", periods=5)[::-1].as_unit("s"),
        ),
        # All NA
        ([np.nan] * 5, [np.nan] * 5),
    ],
)
@pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1])
def test_quantile(interpolation, a_vals, b_vals, q, request):
    """Check groupby ``quantile`` against ``Series.quantile`` computed per group.

    ``a_vals`` and ``b_vals`` are the values of groups ``"a"`` and ``"b"``;
    one ``nearest``/``q=0.5`` combination is marked xfail.
    """
    if (
        interpolation == "nearest"
        and q == 0.5
        and isinstance(b_vals, list)
        and b_vals == [4, 3, 2, 1]
    ):
        request.applymarker(
            pytest.mark.xfail(
                reason="Unclear numpy expectation for nearest "
                "result with equidistant data"
            )
        )
    all_vals = pd.concat([pd.Series(a_vals), pd.Series(b_vals)])

    a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation)
    b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation)

    df = DataFrame({"key": ["a"] * len(a_vals) + ["b"] * len(b_vals), "val": all_vals})

    expected = DataFrame(
        [a_expected, b_expected], columns=["val"], index=Index(["a", "b"], name="key")
    )
    if all_vals.dtype.kind == "M" and expected.dtypes.values[0].kind == "M":
        # TODO(non-nano): this should be unnecessary once array_to_datetime
        #  correctly infers non-nano from Timestamp.unit
        expected = expected.astype(all_vals.dtype)
    result = df.groupby("key").quantile(q, interpolation=interpolation)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_quantile_array():
    """Check groupby ``quantile`` with a list of quantiles on one- and two-column frames."""
    # https://github.com/pandas-dev/pandas/issues/27526
    df = DataFrame({"A": [0, 1, 2, 3, 4]})
    key = np.array([0, 0, 1, 1, 1], dtype=np.int64)
    result = df.groupby(key).quantile([0.25])

    index = pd.MultiIndex.from_product([[0, 1], [0.25]])
    expected = DataFrame({"A": [0.25, 2.50]}, index=index)
    tm.assert_frame_equal(result, expected)

    df = DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]})
    index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]])

    key = np.array([0, 0, 1, 1], dtype=np.int64)
    result = df.groupby(key).quantile([0.25, 0.75])
    expected = DataFrame(
        {"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_quantile_array2():
    """Check groupby ``quantile`` with two quantiles on seeded random integer data.

    The expected values are hard-coded for the seed used below.
    """
    # https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959
    arr = np.random.default_rng(2).integers(0, 5, size=(10, 3), dtype=np.int64)
    df = DataFrame(arr, columns=list("ABC"))
    result = df.groupby("A").quantile([0.3, 0.7])
    expected = DataFrame(
        {
            "B": [2.0, 2.0, 2.3, 2.7, 0.3, 0.7, 3.2, 4.0, 0.3, 0.7],
            "C": [1.0, 1.0, 1.9, 3.0999999999999996, 0.3, 0.7, 2.6, 3.0, 1.2, 2.8],
        },
        index=pd.MultiIndex.from_product(
            [[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None]
        ),
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_quantile_array_no_sort():
    """Check groupby ``quantile`` with ``sort=False`` and quantiles in ascending and descending order."""
    df = DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]})
    key = np.array([1, 0, 1], dtype=np.int64)
    result = df.groupby(key, sort=False).quantile([0.25, 0.5, 0.75])
    expected = DataFrame(
        {"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]},
        index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]),
    )
    tm.assert_frame_equal(result, expected)

    result = df.groupby(key, sort=False).quantile([0.75, 0.25])
    expected = DataFrame(
        {"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]},
        index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]),
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_quantile_array_multiple_levels():
    """Check groupby ``quantile`` with a list of quantiles and two grouping columns."""
    df = DataFrame(
        {"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]}
    )
    result = df.groupby(["c", "d"]).quantile([0.25, 0.75])
    index = pd.MultiIndex.from_tuples(
        [("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)],
        names=["c", "d", None],
    )
    expected = DataFrame(
        {"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)])
@pytest.mark.parametrize("groupby", [[0], [0, 1]])
@pytest.mark.parametrize("q", [[0.5, 0.6]])
def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q):
    """Check groupby ``quantile`` with a list ``q`` on a frame with integer column labels.

    Every row of the input holds ``row_number % 4`` in all columns, so each
    group contains a single distinct value.
    """
    # GH30289
    nrow, ncol = frame_size
    df = DataFrame(np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol))

    # build the expected MultiIndex: one level per grouping key, plus q
    idx_levels = [np.arange(min(nrow, 4))] * len(groupby) + [q]
    idx_codes = [[x for x in range(min(nrow, 4)) for _ in q]] * len(groupby) + [
        list(range(len(q))) * min(nrow, 4)
    ]
    expected_index = pd.MultiIndex(
        levels=idx_levels, codes=idx_codes, names=groupby + [None]
    )
    expected_values = [
        [float(x)] * (ncol - len(groupby)) for x in range(min(nrow, 4)) for _ in q
    ]
    expected_columns = [x for x in range(ncol) if x not in groupby]
    expected = DataFrame(
        expected_values, index=expected_index, columns=expected_columns
    )
    result = df.groupby(groupby).quantile(q)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_quantile_raises():
    """Check that groupby ``quantile`` raises ``TypeError`` on a string-valued column."""
    df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])

    with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"):
        df.groupby("key").quantile()
|
||||
|
||||
|
||||
def test_quantile_out_of_bounds_q_raises():
    """Check that groupby ``quantile`` raises ``ValueError`` for ``q`` of 50 and -1."""
    # https://github.com/pandas-dev/pandas/issues/27470
    df = DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)})
    g = df.groupby([0, 0, 0, 1, 1, 1])
    with pytest.raises(ValueError, match="Got '50.0' instead"):
        g.quantile(50)

    with pytest.raises(ValueError, match="Got '-1.0' instead"):
        g.quantile(-1)
|
||||
|
||||
|
||||
def test_quantile_missing_group_values_no_segfaults():
    """Call groupby ``quantile`` repeatedly on a frame whose key column contains NaN."""
    # GH 28662
    data = np.array([1.0, np.nan, 1.0])
    df = DataFrame({"key": data, "val": range(3)})

    # Random segfaults; would have been guaranteed in loop
    grp = df.groupby("key")
    for _ in range(100):
        grp.quantile()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "key, val, expected_key, expected_val",
    [
        ([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 2.0]),
        ([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]),
        (["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]),
        ([0], [42], [0], [42.0]),
        ([], [], np.array([], dtype="float64"), np.array([], dtype="float64")),
    ],
)
def test_quantile_missing_group_values_correct_results(
    key, val, expected_key, expected_val
):
    """Check groupby ``quantile`` results when the key column contains NaN or is empty.

    Both ``quantile(0.5)`` and the no-argument ``quantile()`` are compared
    against the same expected frame.
    """
    # GH 28662, GH 33200, GH 33569
    df = DataFrame({"key": key, "val": val})

    expected = DataFrame(
        expected_val, index=Index(expected_key, name="key"), columns=["val"]
    )

    grp = df.groupby("key")

    result = grp.quantile(0.5)
    tm.assert_frame_equal(result, expected)

    result = grp.quantile()
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values",
    [
        pd.array([1, 0, None] * 2, dtype="Int64"),
        pd.array([True, False, None] * 2, dtype="boolean"),
    ],
)
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
def test_groupby_quantile_nullable_array(values, q):
    # https://github.com/pandas-dev/pandas/issues/33136
    frame = DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values})
    result = frame.groupby("a")["b"].quantile(q)

    # A scalar q yields one row per group; a list of q adds an index level.
    if not isinstance(q, list):
        idx = Index(["x", "y"], name="a")
        true_quantiles = [0.5]
    else:
        idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None])
        true_quantiles = [0.0, 0.5, 1.0]

    expected = pd.Series(true_quantiles * 2, index=idx, name="b", dtype="Float64")
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
@pytest.mark.parametrize("numeric_only", [True, False])
def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
    frame = DataFrame({"a": [1], "b": [2.0], "c": ["x"]})

    if not numeric_only:
        # With the string column "c" included, quantile must raise.
        with pytest.raises(
            TypeError, match="'quantile' cannot be performed against 'object' dtypes!"
        ):
            frame.groupby("a").quantile(q, numeric_only=numeric_only)
        return

    # numeric_only=True must match selecting column "b" by hand.
    result = frame.groupby("a").quantile(q, numeric_only=numeric_only)
    expected = frame.groupby("a")[["b"]].quantile(q)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_quantile_NA_float(any_float_dtype):
    # GH#42849
    frame = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype)
    result = frame.groupby("x")["y"].quantile(0.5)
    exp_index = Index([1.0], dtype=any_float_dtype, name="x")

    # "Float32"/"Float64" are passed through as the expected dtype; for any
    # other parametrization the expected Series is built with dtype=None.
    expected_dtype = (
        any_float_dtype if any_float_dtype in ["Float32", "Float64"] else None
    )

    expected = pd.Series([0.2], dtype=expected_dtype, index=exp_index, name="y")
    tm.assert_series_equal(result, expected)

    # Same check with a list of quantiles.
    result = frame.groupby("x")["y"].quantile([0.5, 0.75])
    list_index = pd.MultiIndex.from_product(
        (exp_index, [0.5, 0.75]), names=["x", None]
    )
    expected = pd.Series(
        [0.2] * 2, index=list_index, name="y", dtype=expected_dtype
    )
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_quantile_NA_int(any_int_ea_dtype):
    # GH#42849
    # The 0.5 quantile of [2, 5] in the single group is 3.5, expected as
    # Float64 for both the Series and the DataFrame groupby.
    df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype)
    result = df.groupby("x")["y"].quantile(0.5)
    expected = pd.Series(
        [3.5],
        dtype="Float64",
        index=Index([1], name="x", dtype=any_int_ea_dtype),
        name="y",
    )
    # Pass (result, expected) like the other assertions in this module so a
    # failure report labels the left/right operands consistently.
    tm.assert_series_equal(result, expected)

    result = df.groupby("x").quantile(0.5)
    expected = DataFrame(
        {"y": 3.5}, dtype="Float64", index=Index([1], name="x", dtype=any_int_ea_dtype)
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "interpolation, val1, val2", [("lower", 2, 2), ("higher", 2, 3), ("nearest", 2, 2)]
)
def test_groupby_quantile_all_na_group_masked(
    interpolation, val1, val2, any_numeric_ea_dtype
):
    # GH#37493
    # Group 2 holds only pd.NA, so both of its quantiles are expected as NA.
    quantiles = [0.5, 0.7]
    frame = DataFrame(
        {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype
    )
    result = frame.groupby("a").quantile(q=quantiles, interpolation=interpolation)

    group_level = pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype)
    exp_index = pd.MultiIndex.from_arrays(
        [group_level, quantiles * 2], names=["a", None]
    )
    expected = DataFrame(
        {"b": [val1, val2, pd.NA, pd.NA]},
        dtype=any_numeric_ea_dtype,
        index=exp_index,
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("interpolation", ["midpoint", "linear"])
def test_groupby_quantile_all_na_group_masked_interp(
    interpolation, any_numeric_ea_dtype
):
    # GH#37493
    quantiles = [0.5, 0.75]
    frame = DataFrame(
        {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype
    )
    result = frame.groupby("a").quantile(q=quantiles, interpolation=interpolation)

    # Only "Float32" is expected back unchanged; everything else is expected
    # as "Float64".
    expected_dtype = "Float64"
    if any_numeric_ea_dtype == "Float32":
        expected_dtype = any_numeric_ea_dtype

    group_level = pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype)
    exp_index = pd.MultiIndex.from_arrays(
        [group_level, quantiles * 2], names=["a", None]
    )
    expected = DataFrame(
        {"b": [2.0, 2.5, pd.NA, pd.NA]}, dtype=expected_dtype, index=exp_index
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["Float64", "Float32"])
def test_groupby_quantile_allNA_column(dtype):
    # GH#42849
    # Column "y" holds only pd.NA, so the group's quantile is expected as a
    # missing value of the same dtype.
    df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype)
    result = df.groupby("x")["y"].quantile(0.5)
    expected = pd.Series(
        [np.nan], dtype=dtype, index=Index([1.0], dtype=dtype), name="y"
    )
    expected.index.name = "x"
    # Pass (result, expected) like the other assertions in this module so a
    # failure report labels the left/right operands consistently.
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_timedelta_quantile():
    # GH: 29485
    seconds = pd.to_timedelta(np.arange(4), unit="s")
    frame = DataFrame({"value": seconds, "group": [1, 1, 2, 2]})
    result = frame.groupby("group").quantile(0.99)

    want_values = [
        pd.Timedelta(text)
        for text in ("0 days 00:00:00.990000", "0 days 00:00:02.990000")
    ]
    expected = DataFrame({"value": want_values}, index=Index([1, 2], name="group"))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_columns_groupby_quantile():
|
||||
# GH 33795
|
||||
df = DataFrame(
|
||||
np.arange(12).reshape(3, -1),
|
||||
index=list("XYZ"),
|
||||
columns=pd.Series(list("ABAB"), name="col"),
|
||||
)
|
||||
msg = "DataFrame.groupby with axis=1 is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
gb = df.groupby("col", axis=1)
|
||||
result = gb.quantile(q=[0.8, 0.2])
|
||||
expected = DataFrame(
|
||||
[
|
||||
[1.6, 0.4, 2.6, 1.4],
|
||||
[5.6, 4.4, 6.6, 5.4],
|
||||
[9.6, 8.4, 10.6, 9.4],
|
||||
],
|
||||
index=list("XYZ"),
|
||||
columns=pd.MultiIndex.from_tuples(
|
||||
[("A", 0.8), ("A", 0.2), ("B", 0.8), ("B", 0.2)], names=["col", None]
|
||||
),
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_timestamp_groupby_quantile(unit):
    # GH 33168
    # 100 one-minute stamps are floored to the hour and used as group keys.
    minutes = pd.date_range(
        start="2020-04-19 00:00:00", freq="1min", periods=100, tz="UTC", unit=unit
    )
    dti = minutes.floor("1h")
    frame = DataFrame(
        {
            "timestamp": dti,
            "category": list(range(1, 101)),
            "value": list(range(101, 201)),
        }
    )

    result = frame.groupby("timestamp").quantile([0.2, 0.8])

    mi = pd.MultiIndex.from_product([dti[::99], [0.2, 0.8]], names=("timestamp", None))
    exp_pairs = ((12.8, 112.8), (48.2, 148.2), (68.8, 168.8), (92.2, 192.2))
    expected = DataFrame(
        [{"category": category, "value": value} for category, value in exp_pairs],
        index=mi,
    )

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_quantile_dt64tz_period():
    # GH#51373
    dti = pd.date_range("2016-01-01", periods=1000)
    df = pd.Series(dti).to_frame().copy()

    # Columns 1-3: tz-aware datetimes, periods, and timedeltas from the start.
    extra_columns = [dti.tz_localize("US/Pacific"), dti.to_period("D"), dti - dti[0]]
    for position, column_values in enumerate(extra_columns, start=1):
        df[position] = column_values
    df.iloc[-1] = pd.NaT

    by = np.tile(np.arange(5), 200)
    result = df.groupby(by).quantile(0.5)

    # Check that we match the group-by-group result
    per_group = {}
    for label in range(5):
        per_group[label] = df.iloc[label::5].quantile(0.5)
    expected = DataFrame(per_group).T.infer_objects()
    expected.index = expected.index.astype(int)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_quantile_nonmulti_levels_order():
|
||||
# Non-regression test for GH #53009
|
||||
ind = pd.MultiIndex.from_tuples(
|
||||
[
|
||||
(0, "a", "B"),
|
||||
(0, "a", "A"),
|
||||
(0, "b", "B"),
|
||||
(0, "b", "A"),
|
||||
(1, "a", "B"),
|
||||
(1, "a", "A"),
|
||||
(1, "b", "B"),
|
||||
(1, "b", "A"),
|
||||
],
|
||||
names=["sample", "cat0", "cat1"],
|
||||
)
|
||||
ser = pd.Series(range(8), index=ind)
|
||||
result = ser.groupby(level="cat1", sort=False).quantile([0.2, 0.8])
|
||||
|
||||
qind = pd.MultiIndex.from_tuples(
|
||||
[("B", 0.2), ("B", 0.8), ("A", 0.2), ("A", 0.8)], names=["cat1", None]
|
||||
)
|
||||
expected = pd.Series([1.2, 4.8, 2.2, 5.8], index=qind)
|
||||
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
# We need to check that index levels are not sorted
|
||||
expected_levels = pd.core.indexes.frozen.FrozenList([["B", "A"], [0.2, 0.8]])
|
||||
tm.assert_equal(result.index.levels, expected_levels)
|
@ -0,0 +1,721 @@
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
NaT,
|
||||
Series,
|
||||
concat,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_rank_unordered_categorical_typeerror():
    # GH#51034 should be TypeError, not NotImplementedError
    cat = pd.Categorical([], ordered=False)
    ser = Series(cat)
    df = ser.to_frame()

    msg = "Cannot perform rank with non-ordered Categorical"

    # Same expectation for the Series groupby and the DataFrame groupby.
    for obj in (ser, df):
        gb = obj.groupby(cat, observed=False)
        with pytest.raises(TypeError, match=msg):
            gb.rank()
|
||||
|
||||
|
||||
def test_rank_apply():
    lev1 = np.array(["a" * 10] * 100, dtype=object)
    lev2 = np.array(["b" * 10] * 130, dtype=object)
    lab1 = np.random.default_rng(2).integers(0, 100, size=500, dtype=int)
    lab2 = np.random.default_rng(2).integers(0, 130, size=500, dtype=int)

    df = DataFrame(
        {
            "value": np.random.default_rng(2).standard_normal(500),
            "key1": lev1.take(lab1),
            "key2": lev2.take(lab2),
        }
    )
    keys = ["key1", "key2"]

    # The groupby rank must equal ranking each group's piece separately,
    # both for plain ranks and for pct=True.
    for rank_kwargs in ({}, {"pct": True}):
        result = df.groupby(keys).value.rank(**rank_kwargs)

        pieces = [piece.value.rank(**rank_kwargs) for _, piece in df.groupby(keys)]
        expected = concat(pieces, axis=0).reindex(result.index)
        tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals",
    [
        np.array([2, 2, 8, 2, 6], dtype=dtype)
        for dtype in ["i8", "i4", "i2", "i1", "u8", "u4", "u2", "u1", "f8", "f4", "f2"]
    ]
    + [
        # The same five days as timestamps, tz-aware timestamps, timedeltas
        # since Timestamp(0), and daily periods.
        [
            build(day)
            for day in (
                "2018-01-02",
                "2018-01-02",
                "2018-01-08",
                "2018-01-02",
                "2018-01-06",
            )
        ]
        for build in (
            pd.Timestamp,
            lambda day: pd.Timestamp(day, tz="US/Pacific"),
            lambda day: pd.Timestamp(day) - pd.Timestamp(0),
            lambda day: pd.Timestamp(day).to_period("D"),
        )
    ],
    ids=lambda x: type(x[0]),
)
@pytest.mark.parametrize(
    "ties_method,ascending,pct,exp",
    [
        ("average", True, False, [2.0, 2.0, 5.0, 2.0, 4.0]),
        ("average", True, True, [0.4, 0.4, 1.0, 0.4, 0.8]),
        ("average", False, False, [4.0, 4.0, 1.0, 4.0, 2.0]),
        ("average", False, True, [0.8, 0.8, 0.2, 0.8, 0.4]),
        ("min", True, False, [1.0, 1.0, 5.0, 1.0, 4.0]),
        ("min", True, True, [0.2, 0.2, 1.0, 0.2, 0.8]),
        ("min", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
        ("min", False, True, [0.6, 0.6, 0.2, 0.6, 0.4]),
        ("max", True, False, [3.0, 3.0, 5.0, 3.0, 4.0]),
        ("max", True, True, [0.6, 0.6, 1.0, 0.6, 0.8]),
        ("max", False, False, [5.0, 5.0, 1.0, 5.0, 2.0]),
        ("max", False, True, [1.0, 1.0, 0.2, 1.0, 0.4]),
        ("first", True, False, [1.0, 2.0, 5.0, 3.0, 4.0]),
        ("first", True, True, [0.2, 0.4, 1.0, 0.6, 0.8]),
        ("first", False, False, [3.0, 4.0, 1.0, 5.0, 2.0]),
        ("first", False, True, [0.6, 0.8, 0.2, 1.0, 0.4]),
        ("dense", True, False, [1.0, 1.0, 3.0, 1.0, 2.0]),
        ("dense", True, True, [1.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0]),
        ("dense", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
        ("dense", False, True, [3.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0]),
    ],
)
def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
    n_groups = len(grps)
    key = np.repeat(grps, len(vals))

    # Repeat the values once per group; ndarray inputs keep their dtype.
    repeated = list(vals) * n_groups
    if isinstance(vals, np.ndarray):
        repeated = np.array(repeated, dtype=vals.dtype)

    frame = DataFrame({"key": key, "val": repeated})
    result = frame.groupby("key").rank(
        method=ties_method, ascending=ascending, pct=pct
    )

    exp_df = DataFrame(exp * n_groups, columns=["val"])
    tm.assert_frame_equal(result, exp_df)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals", [[-np.inf, -np.inf, np.nan, 1.0, np.nan, np.inf, np.inf]]
)
@pytest.mark.parametrize(
    "ties_method,ascending,na_option,exp",
    [
        ("average", True, "keep", [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]),
        ("average", True, "top", [3.5, 3.5, 1.5, 5.0, 1.5, 6.5, 6.5]),
        ("average", True, "bottom", [1.5, 1.5, 6.5, 3.0, 6.5, 4.5, 4.5]),
        ("average", False, "keep", [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]),
        ("average", False, "top", [6.5, 6.5, 1.5, 5.0, 1.5, 3.5, 3.5]),
        ("average", False, "bottom", [4.5, 4.5, 6.5, 3.0, 6.5, 1.5, 1.5]),
        ("min", True, "keep", [1.0, 1.0, np.nan, 3.0, np.nan, 4.0, 4.0]),
        ("min", True, "top", [3.0, 3.0, 1.0, 5.0, 1.0, 6.0, 6.0]),
        ("min", True, "bottom", [1.0, 1.0, 6.0, 3.0, 6.0, 4.0, 4.0]),
        ("min", False, "keep", [4.0, 4.0, np.nan, 3.0, np.nan, 1.0, 1.0]),
        ("min", False, "top", [6.0, 6.0, 1.0, 5.0, 1.0, 3.0, 3.0]),
        ("min", False, "bottom", [4.0, 4.0, 6.0, 3.0, 6.0, 1.0, 1.0]),
        ("max", True, "keep", [2.0, 2.0, np.nan, 3.0, np.nan, 5.0, 5.0]),
        ("max", True, "top", [4.0, 4.0, 2.0, 5.0, 2.0, 7.0, 7.0]),
        ("max", True, "bottom", [2.0, 2.0, 7.0, 3.0, 7.0, 5.0, 5.0]),
        ("max", False, "keep", [5.0, 5.0, np.nan, 3.0, np.nan, 2.0, 2.0]),
        ("max", False, "top", [7.0, 7.0, 2.0, 5.0, 2.0, 4.0, 4.0]),
        ("max", False, "bottom", [5.0, 5.0, 7.0, 3.0, 7.0, 2.0, 2.0]),
        ("first", True, "keep", [1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0]),
        ("first", True, "top", [3.0, 4.0, 1.0, 5.0, 2.0, 6.0, 7.0]),
        ("first", True, "bottom", [1.0, 2.0, 6.0, 3.0, 7.0, 4.0, 5.0]),
        ("first", False, "keep", [4.0, 5.0, np.nan, 3.0, np.nan, 1.0, 2.0]),
        ("first", False, "top", [6.0, 7.0, 1.0, 5.0, 2.0, 3.0, 4.0]),
        ("first", False, "bottom", [4.0, 5.0, 6.0, 3.0, 7.0, 1.0, 2.0]),
        ("dense", True, "keep", [1.0, 1.0, np.nan, 2.0, np.nan, 3.0, 3.0]),
        ("dense", True, "top", [2.0, 2.0, 1.0, 3.0, 1.0, 4.0, 4.0]),
        ("dense", True, "bottom", [1.0, 1.0, 4.0, 2.0, 4.0, 3.0, 3.0]),
        ("dense", False, "keep", [3.0, 3.0, np.nan, 2.0, np.nan, 1.0, 1.0]),
        ("dense", False, "top", [4.0, 4.0, 1.0, 3.0, 1.0, 2.0, 2.0]),
        ("dense", False, "bottom", [3.0, 3.0, 4.0, 2.0, 4.0, 1.0, 1.0]),
    ],
)
def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
    # GH 20561
    n_groups = len(grps)
    key = np.repeat(grps, len(vals))
    # One copy of the inf/nan values per group.
    frame = DataFrame({"key": key, "val": vals * n_groups})

    rank_kwargs = {
        "method": ties_method,
        "ascending": ascending,
        "na_option": na_option,
    }
    result = frame.groupby("key").rank(**rank_kwargs)

    exp_df = DataFrame(exp * n_groups, columns=["val"])
    tm.assert_frame_equal(result, exp_df)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals",
    [
        np.array([2, 2, np.nan, 8, 2, 6, np.nan, np.nan], dtype=dtype)
        for dtype in ["f8", "f4", "f2"]
    ]
    + [
        # The same days (None marks a missing slot filled with np.nan) as
        # timestamps, tz-aware timestamps, timedeltas since Timestamp(0), and
        # daily periods.
        [
            np.nan if day is None else build(day)
            for day in (
                "2018-01-02",
                "2018-01-02",
                None,
                "2018-01-08",
                "2018-01-02",
                "2018-01-06",
                None,
                None,
            )
        ]
        for build in (
            pd.Timestamp,
            lambda day: pd.Timestamp(day, tz="US/Pacific"),
            lambda day: pd.Timestamp(day) - pd.Timestamp(0),
            lambda day: pd.Timestamp(day).to_period("D"),
        )
    ],
    ids=lambda x: type(x[0]),
)
@pytest.mark.parametrize(
    "ties_method,ascending,na_option,pct,exp",
    [
        # fmt: off
        ("average", True, "keep", False, [2.0, 2.0, np.nan, 5.0, 2.0, 4.0, np.nan, np.nan]),
        ("average", True, "keep", True, [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan]),
        ("average", False, "keep", False, [4.0, 4.0, np.nan, 1.0, 4.0, 2.0, np.nan, np.nan]),
        ("average", False, "keep", True, [0.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan]),
        ("min", True, "keep", False, [1.0, 1.0, np.nan, 5.0, 1.0, 4.0, np.nan, np.nan]),
        ("min", True, "keep", True, [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]),
        ("min", False, "keep", False, [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan]),
        ("min", False, "keep", True, [0.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
        ("max", True, "keep", False, [3.0, 3.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan]),
        ("max", True, "keep", True, [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
        ("max", False, "keep", False, [5.0, 5.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan]),
        ("max", False, "keep", True, [1.0, 1.0, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan]),
        ("first", True, "keep", False, [1.0, 2.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan]),
        ("first", True, "keep", True, [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
        ("first", False, "keep", False, [3.0, 4.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan]),
        ("first", False, "keep", True, [0.6, 0.8, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan]),
        ("dense", True, "keep", False, [1.0, 1.0, np.nan, 3.0, 1.0, 2.0, np.nan, np.nan]),
        (
            "dense", True, "keep", True,
            [1.0 / 3.0, 1.0 / 3.0, np.nan, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0, np.nan, np.nan],
        ),
        ("dense", False, "keep", False, [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan]),
        (
            "dense", False, "keep", True,
            [3.0 / 3.0, 3.0 / 3.0, np.nan, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0, np.nan, np.nan],
        ),
        ("average", True, "bottom", False, [2.0, 2.0, 7.0, 5.0, 2.0, 4.0, 7.0, 7.0]),
        ("average", True, "bottom", True, [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]),
        ("average", False, "bottom", False, [4.0, 4.0, 7.0, 1.0, 4.0, 2.0, 7.0, 7.0]),
        ("average", False, "bottom", True, [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875]),
        ("min", True, "bottom", False, [1.0, 1.0, 6.0, 5.0, 1.0, 4.0, 6.0, 6.0]),
        ("min", True, "bottom", True, [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75]),
        ("min", False, "bottom", False, [3.0, 3.0, 6.0, 1.0, 3.0, 2.0, 6.0, 6.0]),
        ("min", False, "bottom", True, [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75]),
        ("max", True, "bottom", False, [3.0, 3.0, 8.0, 5.0, 3.0, 4.0, 8.0, 8.0]),
        ("max", True, "bottom", True, [0.375, 0.375, 1.0, 0.625, 0.375, 0.5, 1.0, 1.0]),
        ("max", False, "bottom", False, [5.0, 5.0, 8.0, 1.0, 5.0, 2.0, 8.0, 8.0]),
        ("max", False, "bottom", True, [0.625, 0.625, 1.0, 0.125, 0.625, 0.25, 1.0, 1.0]),
        ("first", True, "bottom", False, [1.0, 2.0, 6.0, 5.0, 3.0, 4.0, 7.0, 8.0]),
        ("first", True, "bottom", True, [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.0]),
        ("first", False, "bottom", False, [3.0, 4.0, 6.0, 1.0, 5.0, 2.0, 7.0, 8.0]),
        ("first", False, "bottom", True, [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.0]),
        ("dense", True, "bottom", False, [1.0, 1.0, 4.0, 3.0, 1.0, 2.0, 4.0, 4.0]),
        ("dense", True, "bottom", True, [0.25, 0.25, 1.0, 0.75, 0.25, 0.5, 1.0, 1.0]),
        ("dense", False, "bottom", False, [3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 4.0, 4.0]),
        ("dense", False, "bottom", True, [0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 1.0, 1.0]),
        # fmt: on
    ],
)
def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp):
    n_groups = len(grps)
    key = np.repeat(grps, len(vals))

    # Repeat the values once per group; ndarray inputs keep their dtype.
    repeated = list(vals) * n_groups
    if isinstance(vals, np.ndarray):
        repeated = np.array(repeated, dtype=vals.dtype)

    frame = DataFrame({"key": key, "val": repeated})
    result = frame.groupby("key").rank(
        method=ties_method, ascending=ascending, na_option=na_option, pct=pct
    )

    exp_df = DataFrame(exp * n_groups, columns=["val"])
    tm.assert_frame_equal(result, exp_df)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "pct,exp", [(False, [3.0, 3.0, 3.0, 3.0, 3.0]), (True, [0.6, 0.6, 0.6, 0.6, 0.6])]
)
def test_rank_resets_each_group(pct, exp):
    # Two groups of five identical values; each group is expected to get the
    # same ranks.
    keys = ["a"] * 5 + ["b"] * 5
    frame = DataFrame({"key": keys, "val": [1] * 10})
    result = frame.groupby("key").rank(pct=pct)
    tm.assert_frame_equal(result, DataFrame(exp * 2, columns=["val"]))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dtype", ["int64", "int32", "uint64", "uint32", "float64", "float32"]
)
@pytest.mark.parametrize("upper", [True, False])
def test_rank_avg_even_vals(dtype, upper):
    if upper:
        # use IntegerDtype/FloatingDtype
        dtype = (dtype[0].upper() + dtype[1:]).replace("Ui", "UI")

    frame = DataFrame({"key": ["a"] * 4, "val": [1] * 4})
    frame["val"] = frame["val"].astype(dtype)
    assert frame["val"].dtype == dtype

    result = frame.groupby("key").rank()

    # Four tied values share the average rank 2.5.
    exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=["val"])
    if upper:
        exp_df = exp_df.astype("Float64")
    tm.assert_frame_equal(result, exp_df)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
    "vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]]
)
def test_rank_object_dtype(ties_method, ascending, na_option, pct, vals):
    rank_kwargs = {
        "method": ties_method,
        "ascending": ascending,
        "na_option": na_option,
        "pct": pct,
    }
    df = DataFrame({"key": ["foo"] * 5, "val": vals})
    mask = df["val"].isna()

    res = df.groupby("key").rank(**rank_kwargs)

    # construct our expected by using numeric values with the same ordering
    numeric_vals = [0, np.nan, 2, np.nan, 1] if mask.any() else [0, 0, 2, 0, 1]
    df2 = DataFrame({"key": ["foo"] * 5, "val": numeric_vals})

    alt = df2.groupby("key").rank(**rank_kwargs)

    tm.assert_frame_equal(res, alt)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na_option", [True, "bad", 1])
@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
    "vals",
    [
        ["bar", "bar", "foo", "bar", "baz"],
        ["bar", np.nan, "foo", np.nan, "baz"],
        [1, np.nan, 2, np.nan, 3],
    ],
)
def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals):
    # Any na_option other than 'keep', 'top' or 'bottom' must raise.
    frame = DataFrame({"key": ["foo"] * 5, "val": vals})
    msg = "na_option must be one of 'keep', 'top', or 'bottom'"
    rank_kwargs = {
        "method": ties_method,
        "ascending": ascending,
        "na_option": na_option,
        "pct": pct,
    }

    with pytest.raises(ValueError, match=msg):
        frame.groupby("key").rank(**rank_kwargs)
|
||||
|
||||
|
||||
def test_rank_empty_group():
    # see gh-22519
    # Group 1 contains only NaN in "B", so its pct rank is expected as NaN.
    column = "A"
    df = DataFrame({"A": [0, 1, 0], "B": [1.0, np.nan, 2.0]})
    pct_ranks = [0.5, np.nan, 1.0]

    series_result = df.groupby(column).B.rank(pct=True)
    tm.assert_series_equal(series_result, Series(pct_ranks, name="B"))

    frame_result = df.groupby(column).rank(pct=True)
    tm.assert_frame_equal(frame_result, DataFrame({"B": pct_ranks}))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "input_key,input_value,output_value",
    [
        ([1, 2], [1, 1], [1.0, 1.0]),
        ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]),
        ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]),
        ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]),
    ],
)
def test_rank_zero_div(input_key, input_value, output_value):
    # GH 23666
    frame = DataFrame({"A": input_key, "B": input_value})
    grouped = frame.groupby("A")

    result = grouped.rank(method="dense", pct=True)
    tm.assert_frame_equal(result, DataFrame({"B": output_value}))
|
||||
|
||||
|
||||
def test_rank_min_int():
    # GH-32859
    # int_col uses the int64 minimum and maximum; both are expected to be
    # ranked as ordinary values.
    int64_info = np.iinfo(np.int64)
    lowest, highest = int64_info.min, int64_info.max
    df = DataFrame(
        {
            "grp": [1, 1, 2],
            "int_col": [lowest, highest, lowest],
            "datetimelike": [NaT, datetime(2001, 1, 1), NaT],
        }
    )

    result = df.groupby("grp").rank()

    expected_columns = {
        "int_col": [1.0, 2.0, 1.0],
        "datetimelike": [np.nan, 1.0, np.nan],
    }
    tm.assert_frame_equal(result, DataFrame(expected_columns))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_nan", [True, False])
def test_rank_pct_equal_values_on_group_transition(use_nan):
    # GH#40518
    # The last two rows share one value (NaN or 3) but belong to different
    # groups.
    if use_nan:
        fill_value = np.nan
        expected = Series([0.5, 1, np.nan, np.nan], name="val")
    else:
        fill_value = 3
        expected = Series([1 / 3, 2 / 3, 1, 1], name="val")

    rows = [[-1, 1], [-1, 2], [1, fill_value], [-1, fill_value]]
    frame = DataFrame(rows, columns=["group", "val"])

    result = frame.groupby(["group"])["val"].rank(method="dense", pct=True)

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_rank_multiindex():
|
||||
# GH27721
|
||||
df = concat(
|
||||
{
|
||||
"a": DataFrame({"col1": [3, 4], "col2": [1, 2]}),
|
||||
"b": DataFrame({"col3": [5, 6], "col4": [7, 8]}),
|
||||
},
|
||||
axis=1,
|
||||
)
|
||||
|
||||
msg = "DataFrame.groupby with axis=1 is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
gb = df.groupby(level=0, axis=1)
|
||||
msg = "DataFrameGroupBy.rank with axis=1 is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = gb.rank(axis=1)
|
||||
|
||||
expected = concat(
|
||||
[
|
||||
df["a"].rank(axis=1),
|
||||
df["b"].rank(axis=1),
|
||||
],
|
||||
axis=1,
|
||||
keys=["a", "b"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_groupby_axis0_rank_axis1():
|
||||
# GH#41320
|
||||
df = DataFrame(
|
||||
{0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]},
|
||||
index=["a", "a", "b", "b"],
|
||||
)
|
||||
msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
gb = df.groupby(level=0, axis=0)
|
||||
|
||||
msg = "DataFrameGroupBy.rank with axis=1 is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
res = gb.rank(axis=1)
|
||||
|
||||
# This should match what we get when "manually" operating group-by-group
|
||||
expected = concat([df.loc["a"].rank(axis=1), df.loc["b"].rank(axis=1)], axis=0)
|
||||
tm.assert_frame_equal(res, expected)
|
||||
|
||||
# check that we haven't accidentally written a case that coincidentally
|
||||
# matches rank(axis=0)
|
||||
msg = "The 'axis' keyword in DataFrameGroupBy.rank"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
alt = gb.rank(axis=0)
|
||||
assert not alt.equals(expected)
|
||||
|
||||
|
||||
def test_groupby_axis0_cummax_axis1():
|
||||
# case where groupby axis is 0 and axis keyword in transform is 1
|
||||
|
||||
# df has mixed dtype -> multiple blocks
|
||||
df = DataFrame(
|
||||
{0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]},
|
||||
index=["a", "a", "b", "b"],
|
||||
)
|
||||
msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
gb = df.groupby(level=0, axis=0)
|
||||
|
||||
msg = "DataFrameGroupBy.cummax with axis=1 is deprecated"
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
cmax = gb.cummax(axis=1)
|
||||
expected = df[[0, 1]].astype(np.float64)
|
||||
expected[2] = expected[1]
|
||||
tm.assert_frame_equal(cmax, expected)
|
||||
|
||||
|
||||
def test_non_unique_index():
    # GH 16577: percentage rank when grouping on a fully duplicated,
    # timezone-aware index together with a column containing NaN.
    frame = DataFrame(
        {"A": [1.0, 2.0, 3.0, np.nan], "value": 1.0},
        index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
    )

    grouped_values = frame.groupby([frame.index, "A"])["value"]
    ranked = grouped_values.rank(ascending=True, pct=True)

    expected = Series(
        [1.0, 1.0, 1.0, np.nan],
        index=[pd.Timestamp("20170101", tz="US/Eastern")] * 4,
        name="value",
    )
    tm.assert_series_equal(ranked, expected)
|
||||
|
||||
|
||||
def test_rank_categorical():
    # Ranking ordered categoricals within groups must match ranking the same
    # data after casting the whole frame to object dtype.
    str_cat = pd.Categorical(["a", "a", "b", np.nan, "c", "b"], ordered=True)
    num_cat = pd.Categorical([1, 2, 3, np.nan, 4, 5], ordered=True)
    frame = DataFrame(
        {"col1": [0, 1, 0, 1, 0, 1], "col2": str_cat, "col3": num_cat}
    )

    ranked = frame.groupby("col1").rank()

    as_object = frame.astype(object)
    expected = as_object.groupby("col1").rank()
    tm.assert_frame_equal(ranked, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na_option", ["top", "bottom"])
def test_groupby_op_with_nullables(na_option):
    # GH 54206: rank a single missing Float64 value inside its own NA group.
    frame = DataFrame({"x": [None]}, dtype="Float64")
    grouped_x = frame.groupby("x", dropna=False)["x"]

    ranked = grouped_x.rank(method="min", na_option=na_option)

    expected = Series([1.0], dtype="Float64", name=ranked.name)
    tm.assert_series_equal(ranked, expected)
|
@ -0,0 +1,154 @@
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n, frac", [(2, None), (None, 0.2)])
def test_groupby_sample_balanced_groups_shape(n, frac):
    # Two groups of ten rows each; both parametrizations are expected to
    # draw two rows per group.
    population = [1] * 10 + [2] * 10
    frame = DataFrame({"a": population, "b": population})
    drawn = [1] * 2 + [2] * 2

    # DataFrameGroupBy.sample
    frame_sample = frame.groupby("a").sample(n=n, frac=frac)
    expected_frame = DataFrame({"a": drawn, "b": drawn}, index=frame_sample.index)
    tm.assert_frame_equal(frame_sample, expected_frame)

    # SeriesGroupBy.sample
    series_sample = frame.groupby("a")["b"].sample(n=n, frac=frac)
    expected_series = Series(drawn, name="b", index=series_sample.index)
    tm.assert_series_equal(series_sample, expected_series)
|
||||
|
||||
|
||||
def test_groupby_sample_unbalanced_groups_shape():
    # Groups of different sizes (10 vs. 20 rows) each contribute exactly
    # n=5 rows to the sample.
    population = [1] * 10 + [2] * 20
    frame = DataFrame({"a": population, "b": population})
    drawn = [1] * 5 + [2] * 5

    # DataFrameGroupBy.sample
    frame_sample = frame.groupby("a").sample(n=5)
    expected_frame = DataFrame({"a": drawn, "b": drawn}, index=frame_sample.index)
    tm.assert_frame_equal(frame_sample, expected_frame)

    # SeriesGroupBy.sample
    series_sample = frame.groupby("a")["b"].sample(n=5)
    expected_series = Series(drawn, name="b", index=series_sample.index)
    tm.assert_series_equal(series_sample, expected_series)
|
||||
|
||||
|
||||
def test_groupby_sample_index_value_spans_groups():
    # The index label 2 occurs in both groups; sampling must still return
    # two rows from each group.
    population = [1] * 3 + [2] * 3
    frame = DataFrame(
        {"a": population, "b": population},
        index=[1, 2, 2, 2, 2, 2],
    )
    drawn = [1] * 2 + [2] * 2

    # DataFrameGroupBy.sample
    frame_sample = frame.groupby("a").sample(n=2)
    expected_frame = DataFrame({"a": drawn, "b": drawn}, index=frame_sample.index)
    tm.assert_frame_equal(frame_sample, expected_frame)

    # SeriesGroupBy.sample
    series_sample = frame.groupby("a")["b"].sample(n=2)
    expected_series = Series(drawn, name="b", index=series_sample.index)
    tm.assert_series_equal(series_sample, expected_series)
|
||||
|
||||
|
||||
def test_groupby_sample_n_and_frac_raises():
    # Passing both `n` and `frac` is rejected for frame and series groupbys.
    frame = DataFrame({"a": [1, 2], "b": [1, 2]})
    conflicting = {"n": 1, "frac": 1.0}
    msg = "Please enter a value for `frac` OR `n`, not both"

    with pytest.raises(ValueError, match=msg):
        frame.groupby("a").sample(**conflicting)

    with pytest.raises(ValueError, match=msg):
        frame.groupby("a")["b"].sample(**conflicting)
|
||||
|
||||
|
||||
def test_groupby_sample_frac_gt_one_without_replacement_raises():
    # frac > 1 with replace=False is rejected for frame and series groupbys.
    frame = DataFrame({"a": [1, 2], "b": [1, 2]})
    upsample = {"frac": 1.5, "replace": False}
    msg = "Replace has to be set to `True` when upsampling the population `frac` > 1."

    with pytest.raises(ValueError, match=msg):
        frame.groupby("a").sample(**upsample)

    with pytest.raises(ValueError, match=msg):
        frame.groupby("a")["b"].sample(**upsample)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n", [-1, 1.5])
def test_groupby_sample_invalid_n_raises(n):
    # Negative and non-integer `n` each raise ValueError with their own message.
    frame = DataFrame({"a": [1, 2], "b": [1, 2]})

    msg = (
        "A negative number of rows requested. Please provide `n` >= 0."
        if n < 0
        else "Only integers accepted as `n` values"
    )

    with pytest.raises(ValueError, match=msg):
        frame.groupby("a").sample(n=n)

    with pytest.raises(ValueError, match=msg):
        frame.groupby("a")["b"].sample(n=n)
|
||||
|
||||
|
||||
def test_groupby_sample_oversample():
    # frac=2.0 with replacement doubles each ten-row group to twenty rows.
    population = [1] * 10 + [2] * 10
    frame = DataFrame({"a": population, "b": population})
    drawn = [1] * 20 + [2] * 20

    # DataFrameGroupBy.sample
    frame_sample = frame.groupby("a").sample(frac=2.0, replace=True)
    expected_frame = DataFrame({"a": drawn, "b": drawn}, index=frame_sample.index)
    tm.assert_frame_equal(frame_sample, expected_frame)

    # SeriesGroupBy.sample
    series_sample = frame.groupby("a")["b"].sample(frac=2.0, replace=True)
    expected_series = Series(drawn, name="b", index=series_sample.index)
    tm.assert_series_equal(series_sample, expected_series)
|
||||
|
||||
|
||||
def test_groupby_sample_without_n_or_frac():
    # With neither `n` nor `frac` given, one row per group is expected.
    population = [1] * 10 + [2] * 10
    frame = DataFrame({"a": population, "b": population})

    # DataFrameGroupBy.sample
    frame_sample = frame.groupby("a").sample(n=None, frac=None)
    expected_frame = DataFrame(
        {"a": [1, 2], "b": [1, 2]},
        index=frame_sample.index,
    )
    tm.assert_frame_equal(frame_sample, expected_frame)

    # SeriesGroupBy.sample
    series_sample = frame.groupby("a")["b"].sample(n=None, frac=None)
    expected_series = Series([1, 2], name="b", index=series_sample.index)
    tm.assert_series_equal(series_sample, expected_series)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "index, expected_index",
    [(["w", "x", "y", "z"], ["w", "w", "y", "y"]), ([3, 4, 5, 6], [3, 3, 5, 5])],
)
def test_groupby_sample_with_weights(index, expected_index):
    # GH 39927 - tests for integer index needed
    # The weights put all mass on the first row of each group, so that row
    # is the only one that can be drawn.
    group_values = [1] * 2 + [2] * 2
    frame = DataFrame({"a": group_values, "b": group_values}, index=Index(index))

    # DataFrameGroupBy.sample
    frame_sample = frame.groupby("a").sample(
        n=2, replace=True, weights=[1, 0, 1, 0]
    )
    expected_frame = DataFrame(
        {"a": group_values, "b": group_values}, index=Index(expected_index)
    )
    tm.assert_frame_equal(frame_sample, expected_frame)

    # SeriesGroupBy.sample
    series_sample = frame.groupby("a")["b"].sample(
        n=2, replace=True, weights=[1, 0, 1, 0]
    )
    expected_series = Series(group_values, name="b", index=Index(expected_index))
    tm.assert_series_equal(series_sample, expected_series)
|
||||
|
||||
|
||||
def test_groupby_sample_with_selections():
    # GH 39928: sampling after selecting a list of columns returns only
    # those columns.
    population = [1] * 10 + [2] * 10
    frame = DataFrame({"a": population, "b": population, "c": population})

    selected = frame.groupby("a")[["b", "c"]]
    sampled = selected.sample(n=None, frac=None)

    expected = DataFrame({"b": [1, 2], "c": [1, 2]}, index=sampled.index)
    tm.assert_frame_equal(sampled, expected)
|
||||
|
||||
|
||||
def test_groupby_sample_with_empty_inputs():
    # GH48459: sampling a groupby built on an empty frame gives the frame back.
    empty = DataFrame({"a": [], "b": []})
    grouped = empty.groupby("a")

    sampled = grouped.sample()

    tm.assert_frame_equal(sampled, empty)
|
@ -0,0 +1,130 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas.core.dtypes.common import is_integer_dtype
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
PeriodIndex,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
def test_size(df, by):
    # size() must report, for every group key, the length of that group.
    grouped = df.groupby(by=by)
    sizes = grouped.size()
    for group_key, members in grouped:
        assert sizes[group_key] == len(members)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "by",
    [
        [0, 0, 0, 0],
        [0, 1, 1, 1],
        [1, 0, 1, 1],
        [0, None, None, None],
        pytest.param([None, None, None, None], marks=pytest.mark.xfail),
    ],
)
def test_size_axis_1(df, axis_1, by, sort, dropna):
    # GH#45715
    # Build the expected sizes by counting each label, keeping first-seen order.
    counts = {label: by.count(label) for label in dict.fromkeys(by)}
    if dropna:
        # The None group is dropped when dropna is set.
        counts.pop(None, None)

    expected = Series(counts, dtype="int64")
    if sort:
        expected = expected.sort_index()

    has_no_missing = all(label is not None for label in by)
    if is_integer_dtype(expected.index.dtype) and has_no_missing:
        expected.index = expected.index.astype(int)

    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        grouped = df.groupby(by=by, axis=axis_1, sort=sort, dropna=dropna)

    tm.assert_series_equal(grouped.size(), expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("by", ["A", "B", ["A", "B"]])
@pytest.mark.parametrize("sort", [True, False])
def test_size_sort(sort, by):
    # size() must agree with counting the rows of each group through apply.
    rng = np.random.default_rng(2)
    data = rng.choice(20, (1000, 3))
    frame = DataFrame(data, columns=["A", "B", "C"])

    via_size = frame.groupby(by=by, sort=sort).size()
    via_apply = frame.groupby(by=by, sort=sort)["C"].apply(
        lambda col: col.shape[0]
    )

    tm.assert_series_equal(via_size, via_apply, check_names=False)
|
||||
|
||||
|
||||
def test_size_series_dataframe():
    # https://github.com/pandas-dev/pandas/issues/11699
    # size() on a frame with columns but no rows yields an empty int64 Series.
    empty_frame = DataFrame(columns=["A", "B"])
    sizes = empty_frame.groupby("A").size()

    empty_index = Index([], name="A")
    expected = Series(dtype="int64", index=empty_index)
    tm.assert_series_equal(sizes, expected)
|
||||
|
||||
|
||||
def test_size_groupby_all_null():
    # https://github.com/pandas-dev/pandas/issues/23050
    # Assert no 'Value Error : Length of passed values is 2, index implies 0'
    all_null = DataFrame({"A": [None, None]})  # every grouping key is null

    sizes = all_null.groupby("A").size()

    empty_index = Index([], name="A")
    expected = Series(dtype="int64", index=empty_index)
    tm.assert_series_equal(sizes, expected)
|
||||
|
||||
|
||||
def test_size_period_index():
    # https://github.com/pandas-dev/pandas/issues/34010
    # One row per period, so the group sizes equal the input series itself.
    periods = PeriodIndex(["2000"], name="A", freq="D")
    ser = Series([1], index=periods)

    sizes = ser.groupby(level="A").size()

    tm.assert_series_equal(sizes, ser)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("as_index", [True, False])
def test_size_on_categorical(as_index):
    # With observed=False every (A, B) combination is reported, including
    # the unobserved ones with size 0.
    frame = DataFrame([[1, 1], [2, 2]], columns=["A", "B"])
    frame["A"] = frame["A"].astype("category")
    grouped = frame.groupby(["A", "B"], as_index=as_index, observed=False)
    sizes = grouped.size()

    size_rows = [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]]
    expected = DataFrame(size_rows, columns=["A", "B", "size"])
    expected["A"] = expected["A"].astype("category")
    if as_index:
        # as_index=True gives an unnamed Series indexed by the group keys.
        indexed = expected.set_index(["A", "B"])
        expected = indexed["size"].rename(None)

    tm.assert_equal(sizes, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
def test_size_series_masked_type_returns_Int64(dtype):
    # GH 54132: size() of a masked-dtype Series is expected as Int64.
    labels = ["a", "a", "b"]
    ser = Series([1, 1, 1], index=labels, dtype=dtype)

    sizes = ser.groupby(level=0).size()

    expected = Series([2, 1], dtype="Int64", index=["a", "b"])
    tm.assert_series_equal(sizes, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dtype",
    [
        object,
        pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
        pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
    ],
)
def test_size_strings(dtype):
    # GH#55627
    frame = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype)
    sizes = frame.groupby("a")["b"].size()

    # Only the "string[pyarrow]" case is expected to give nullable Int64 sizes.
    if dtype == "string[pyarrow]":
        size_dtype = "Int64"
    else:
        size_dtype = "int64"

    group_labels = Index(["a", "b"], name="a", dtype=dtype)
    expected = Series([2, 1], index=group_labels, name="b", dtype=size_dtype)
    tm.assert_series_equal(sizes, expected)
|
@ -0,0 +1,27 @@
|
||||
import numpy as np
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_groupby_skew_equivalence():
    # Test that the groupby skew method (which uses libgroupby.group_skew)
    # matches the results of operating group-by-group (which uses nanops.nanskew)
    nrows, ngroups, ncols = 1000, 3, 2
    nan_frac = 0.05

    # Each draw below uses its own generator seeded with 2.
    values = np.random.default_rng(2).standard_normal((nrows, ncols))
    nan_rows = np.random.default_rng(2).random(nrows) < nan_frac
    values[nan_rows] = np.nan

    frame = pd.DataFrame(values)
    group_ids = np.random.default_rng(2).integers(0, ngroups, size=nrows)
    grouped = frame.groupby(group_ids)

    result = grouped.skew()

    # Reference: skew each group separately, one row per group label.
    rows = []
    for label, members in grouped:
        rows.append(members.skew().to_frame(label).T)
    expected = pd.concat(rows, axis=0)
    expected.index = expected.index.astype(result.index.dtype)  # 32bit builds
    tm.assert_frame_equal(result, expected)
|
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user