forked from Alsan/Post_finder
venv
This commit is contained in:
@ -0,0 +1,7 @@
|
||||
"""
|
||||
Test files dedicated to individual (stand-alone) DataFrame methods
|
||||
|
||||
Ideally these files/tests should correspond 1-to-1 with tests.series.methods
|
||||
|
||||
These may also present opportunities for sharing/de-duplicating test code.
|
||||
"""
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,49 @@
|
||||
import pytest
|
||||
|
||||
from pandas import Index
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def test_add_prefix_suffix(float_frame):
    """Check ``add_prefix``/``add_suffix`` relabel columns, including with ``%``."""
    labels = float_frame.columns

    # Multi-character prefix and suffix.
    prefixed = float_frame.add_prefix("foo#")
    wanted = Index([f"foo#{name}" for name in labels])
    tm.assert_index_equal(prefixed.columns, wanted)

    suffixed = float_frame.add_suffix("#foo")
    wanted = Index([f"{name}#foo" for name in labels])
    tm.assert_index_equal(suffixed.columns, wanted)

    # A literal percent sign must be applied verbatim.
    pct_prefixed = float_frame.add_prefix("%")
    wanted = Index([f"%{name}" for name in labels])
    tm.assert_index_equal(pct_prefixed.columns, wanted)

    pct_suffixed = float_frame.add_suffix("%")
    wanted = Index([f"{name}%" for name in labels])
    tm.assert_index_equal(pct_suffixed.columns, wanted)
|
||||
|
||||
|
||||
def test_add_prefix_suffix_axis(float_frame):
    """``axis`` selects whether row labels (0) or column labels (1) are renamed."""
    # GH 47819
    row_labels = float_frame.index
    col_labels = float_frame.columns

    # Prefix along each axis.
    prefixed_rows = float_frame.add_prefix("foo#", axis=0)
    wanted = Index([f"foo#{label}" for label in row_labels])
    tm.assert_index_equal(prefixed_rows.index, wanted)

    prefixed_cols = float_frame.add_prefix("foo#", axis=1)
    wanted = Index([f"foo#{label}" for label in col_labels])
    tm.assert_index_equal(prefixed_cols.columns, wanted)

    # Suffix along each axis.
    suffixed_rows = float_frame.add_suffix("#foo", axis=0)
    wanted = Index([f"{label}#foo" for label in row_labels])
    tm.assert_index_equal(suffixed_rows.index, wanted)

    suffixed_cols = float_frame.add_suffix("#foo", axis=1)
    wanted = Index([f"{label}#foo" for label in col_labels])
    tm.assert_index_equal(suffixed_cols.columns, wanted)
|
||||
|
||||
|
||||
def test_add_prefix_suffix_invalid_axis(float_frame):
    """Both methods raise ``ValueError`` when called with ``axis=2``."""
    bad_axis_msg = "No axis named 2 for object type DataFrame"

    with pytest.raises(ValueError, match=bad_axis_msg):
        float_frame.add_prefix("foo#", axis=2)

    with pytest.raises(ValueError, match=bad_axis_msg):
        float_frame.add_suffix("foo#", axis=2)
|
@ -0,0 +1,484 @@
|
||||
from datetime import timezone
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameAlign:
    """Tests for ``DataFrame.align`` against other DataFrames and Series."""

    def test_align_asfreq_method_raises(self):
        """``method="asfreq"`` is rejected with ``ValueError``."""
        df = DataFrame({"A": [1, np.nan, 2]})
        msg = "Invalid fill method"
        msg2 = "The 'method', 'limit', and 'fill_axis' keywords"
        with pytest.raises(ValueError, match=msg):
            with tm.assert_produces_warning(FutureWarning, match=msg2):
                df.align(df.iloc[::-1], method="asfreq")

    def test_frame_align_aware(self):
        """Same-tz inputs keep their tz; inputs in different tzs align to UTC."""
        idx1 = date_range("2001", periods=5, freq="h", tz="US/Eastern")
        idx2 = date_range("2001", periods=5, freq="2h", tz="US/Eastern")
        df1 = DataFrame(np.random.default_rng(2).standard_normal((len(idx1), 3)), idx1)
        df2 = DataFrame(np.random.default_rng(2).standard_normal((len(idx2), 3)), idx2)
        new1, new2 = df1.align(df2)
        assert df1.index.tz == new1.index.tz
        assert df2.index.tz == new2.index.tz

        # different timezones convert to UTC

        # frame with frame
        df1_central = df1.tz_convert("US/Central")
        new1, new2 = df1.align(df1_central)
        assert new1.index.tz is timezone.utc
        assert new2.index.tz is timezone.utc

        # frame with Series
        new1, new2 = df1.align(df1_central[0], axis=0)
        assert new1.index.tz is timezone.utc
        assert new2.index.tz is timezone.utc

        # Series with frame: capture the result so the assertions below check
        # this call rather than re-checking the previous one.
        new1, new2 = df1[0].align(df1_central, axis=0)
        assert new1.index.tz is timezone.utc
        assert new2.index.tz is timezone.utc

    def test_align_float(self, float_frame, using_copy_on_write):
        """Exercise self-alignment, per-axis joins, fill keywords and a bad axis."""
        af, bf = float_frame.align(float_frame)
        assert af._mgr is not float_frame._mgr

        # copy=False only shares the manager when copy-on-write is disabled
        af, bf = float_frame.align(float_frame, copy=False)
        if not using_copy_on_write:
            assert af._mgr is float_frame._mgr
        else:
            assert af._mgr is not float_frame._mgr

        # axis = 0
        other = float_frame.iloc[:-5, :3]
        af, bf = float_frame.align(other, axis=0, fill_value=-1)

        tm.assert_index_equal(bf.columns, other.columns)

        # test fill value
        # NOTE(review): Index.join presumably defaults to a left join, which
        # would make diff_a empty and this check vacuous -- confirm.
        join_idx = float_frame.index.join(other.index)
        diff_a = float_frame.index.difference(join_idx)
        diff_a_vals = af.reindex(diff_a).values
        assert (diff_a_vals == -1).all()

        af, bf = float_frame.align(other, join="right", axis=0)
        tm.assert_index_equal(bf.columns, other.columns)
        tm.assert_index_equal(bf.index, other.index)
        tm.assert_index_equal(af.index, other.index)

        # axis = 1
        other = float_frame.iloc[:-5, :3].copy()
        af, bf = float_frame.align(other, axis=1)
        tm.assert_index_equal(bf.columns, float_frame.columns)
        tm.assert_index_equal(bf.index, other.index)

        # test fill value
        # NOTE(review): same caveat as the axis=0 check above -- confirm.
        join_idx = float_frame.index.join(other.index)
        diff_a = float_frame.index.difference(join_idx)
        diff_a_vals = af.reindex(diff_a).values

        assert (diff_a_vals == -1).all()

        af, bf = float_frame.align(other, join="inner", axis=1)
        tm.assert_index_equal(bf.columns, other.columns)

        # The same deprecation message applies to the next three calls.
        msg = (
            "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
            "are deprecated"
        )
        with tm.assert_produces_warning(FutureWarning, match=msg):
            af, bf = float_frame.align(other, join="inner", axis=1, method="pad")
        tm.assert_index_equal(bf.columns, other.columns)

        with tm.assert_produces_warning(FutureWarning, match=msg):
            af, bf = float_frame.align(
                other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=None
            )
        tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype))

        with tm.assert_produces_warning(FutureWarning, match=msg):
            af, bf = float_frame.align(
                other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0
            )
        tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype))

        # Try to align DataFrame to Series along bad axis
        msg = "No axis named 2 for object type DataFrame"
        with pytest.raises(ValueError, match=msg):
            float_frame.align(af.iloc[0, :3], join="inner", axis=2)

    def test_align_frame_with_series(self, float_frame):
        """Align a frame to a Series on axis 0, with and without broadcasting."""
        # align dataframe to series with broadcast or not
        idx = float_frame.index
        s = Series(range(len(idx)), index=idx)

        left, right = float_frame.align(s, axis=0)
        tm.assert_index_equal(left.index, float_frame.index)
        tm.assert_index_equal(right.index, float_frame.index)
        assert isinstance(right, Series)

        msg = "The 'broadcast_axis' keyword in DataFrame.align is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            left, right = float_frame.align(s, broadcast_axis=1)
        tm.assert_index_equal(left.index, float_frame.index)
        # With broadcast_axis=1 the Series is expected back as a frame holding
        # one copy of ``s`` per column of ``float_frame``.
        expected = {c: s for c in float_frame.columns}
        expected = DataFrame(
            expected, index=float_frame.index, columns=float_frame.columns
        )
        tm.assert_frame_equal(right, expected)

    def test_align_series_condition(self):
        """Boolean-Series row selection and ``where`` give the expected frames."""
        # see gh-9558
        df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
        result = df[df["a"] == 2]
        expected = DataFrame([[2, 5]], index=[1], columns=["a", "b"])
        tm.assert_frame_equal(result, expected)

        result = df.where(df["a"] == 2, 0)
        expected = DataFrame({"a": [0, 2, 0], "b": [0, 5, 0]})
        tm.assert_frame_equal(result, expected)

    def test_align_int(self, int_frame):
        """Inner column alignment with ``method="pad"`` on an integer frame."""
        # test other non-float types
        other = DataFrame(index=range(5), columns=["A", "B", "C"])

        msg = (
            "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
            "are deprecated"
        )
        with tm.assert_produces_warning(FutureWarning, match=msg):
            af, bf = int_frame.align(other, join="inner", axis=1, method="pad")
        tm.assert_index_equal(bf.columns, other.columns)

    def test_align_mixed_type(self, float_string_frame):
        """Inner column self-alignment with ``method="pad"`` keeps the columns."""
        msg = (
            "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
            "are deprecated"
        )
        with tm.assert_produces_warning(FutureWarning, match=msg):
            af, bf = float_string_frame.align(
                float_string_frame, join="inner", axis=1, method="pad"
            )
        tm.assert_index_equal(bf.columns, float_string_frame.columns)

    def test_align_mixed_float(self, mixed_float_frame):
        """Inner alignment with a Series on axis 1 yields an empty Series index."""
        # mixed floats/ints
        other = DataFrame(index=range(5), columns=["A", "B", "C"])

        msg = (
            "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
            "are deprecated"
        )
        with tm.assert_produces_warning(FutureWarning, match=msg):
            af, bf = mixed_float_frame.align(
                other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0
            )
        tm.assert_index_equal(bf.index, Index([]))

    def test_align_mixed_int(self, mixed_int_frame):
        """Same check as ``test_align_mixed_float`` for the mixed-int fixture."""
        other = DataFrame(index=range(5), columns=["A", "B", "C"])

        msg = (
            "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
            "are deprecated"
        )
        with tm.assert_produces_warning(FutureWarning, match=msg):
            af, bf = mixed_int_frame.align(
                other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0
            )
        tm.assert_index_equal(bf.index, Index([]))

    @pytest.mark.parametrize(
        "l_ordered,r_ordered,expected",
        [
            [True, True, pd.CategoricalIndex],
            [True, False, Index],
            [False, True, Index],
            [False, False, pd.CategoricalIndex],
        ],
    )
    def test_align_categorical(self, l_ordered, r_ordered, expected):
        """Aligned categorical indexes match and are instances of ``expected``."""
        # GH-28397
        df_1 = DataFrame(
            {
                "A": np.arange(6, dtype="int64"),
                "B": Series(list("aabbca")).astype(
                    pd.CategoricalDtype(list("cab"), ordered=l_ordered)
                ),
            }
        ).set_index("B")
        df_2 = DataFrame(
            {
                "A": np.arange(5, dtype="int64"),
                "B": Series(list("babca")).astype(
                    pd.CategoricalDtype(list("cab"), ordered=r_ordered)
                ),
            }
        ).set_index("B")

        aligned_1, aligned_2 = df_1.align(df_2)
        assert isinstance(aligned_1.index, expected)
        assert isinstance(aligned_2.index, expected)
        tm.assert_index_equal(aligned_1.index, aligned_2.index)

    def test_align_multiindex(self):
        """Left/right joins between a MultiIndex frame and a one-level frame."""
        # GH#10665
        # same test cases as test_align_multiindex in test_series.py

        midx = pd.MultiIndex.from_product(
            [range(2), range(3), range(2)], names=("a", "b", "c")
        )
        idx = Index(range(2), name="b")
        df1 = DataFrame(np.arange(12, dtype="int64"), index=midx)
        df2 = DataFrame(np.arange(2, dtype="int64"), index=idx)

        # these must be the same results (but flipped)
        res1l, res1r = df1.align(df2, join="left")
        res2l, res2r = df2.align(df1, join="right")

        expl = df1
        tm.assert_frame_equal(expl, res1l)
        tm.assert_frame_equal(expl, res2r)
        expr = DataFrame([0, 0, 1, 1, np.nan, np.nan] * 2, index=midx)
        tm.assert_frame_equal(expr, res1r)
        tm.assert_frame_equal(expr, res2l)

        res1l, res1r = df1.align(df2, join="right")
        res2l, res2r = df2.align(df1, join="left")

        exp_idx = pd.MultiIndex.from_product(
            [range(2), range(2), range(2)], names=("a", "b", "c")
        )
        expl = DataFrame([0, 1, 2, 3, 6, 7, 8, 9], index=exp_idx)
        tm.assert_frame_equal(expl, res1l)
        tm.assert_frame_equal(expl, res2r)
        expr = DataFrame([0, 0, 1, 1] * 2, index=exp_idx)
        tm.assert_frame_equal(expr, res1r)
        tm.assert_frame_equal(expr, res2l)

    def test_align_series_combinations(self):
        """``df.align(s)`` and ``s.align(df)`` give the same pair, swapped."""
        df = DataFrame({"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE"))
        s = Series([1, 2, 4], index=list("ABD"), name="x")

        # frame + series
        res1, res2 = df.align(s, axis=0)
        exp1 = DataFrame(
            {"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]},
            index=list("ABCDE"),
        )
        exp2 = Series([1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x")

        tm.assert_frame_equal(res1, exp1)
        tm.assert_series_equal(res2, exp2)

        # series + frame
        res1, res2 = s.align(df)
        tm.assert_series_equal(res1, exp2)
        tm.assert_frame_equal(res2, exp1)

    def test_multiindex_align_to_series_with_common_index_level(self):
        """A Series sharing the "bar" level is repeated across the frame's index."""
        # GH-46001
        foo_index = Index([1, 2, 3], name="foo")
        bar_index = Index([1, 2], name="bar")

        series = Series([1, 2], index=bar_index, name="foo_series")
        df = DataFrame(
            {"col": np.arange(6)},
            index=pd.MultiIndex.from_product([foo_index, bar_index]),
        )

        expected_r = Series([1, 2] * 3, index=df.index, name="foo_series")
        result_l, result_r = df.align(series, axis=0)

        tm.assert_frame_equal(result_l, df)
        tm.assert_series_equal(result_r, expected_r)

    def test_multiindex_align_to_series_with_common_index_level_missing_in_left(self):
        """Series "bar" labels absent from the frame do not appear in the result."""
        # GH-46001
        foo_index = Index([1, 2, 3], name="foo")
        bar_index = Index([1, 2], name="bar")

        series = Series(
            [1, 2, 3, 4], index=Index([1, 2, 3, 4], name="bar"), name="foo_series"
        )
        df = DataFrame(
            {"col": np.arange(6)},
            index=pd.MultiIndex.from_product([foo_index, bar_index]),
        )

        expected_r = Series([1, 2] * 3, index=df.index, name="foo_series")
        result_l, result_r = df.align(series, axis=0)

        tm.assert_frame_equal(result_l, df)
        tm.assert_series_equal(result_r, expected_r)

    def test_multiindex_align_to_series_with_common_index_level_missing_in_right(self):
        """Frame "bar" labels absent from the Series become NaN in the Series."""
        # GH-46001
        foo_index = Index([1, 2, 3], name="foo")
        bar_index = Index([1, 2, 3, 4], name="bar")

        series = Series([1, 2], index=Index([1, 2], name="bar"), name="foo_series")
        df = DataFrame(
            {"col": np.arange(12)},
            index=pd.MultiIndex.from_product([foo_index, bar_index]),
        )

        expected_r = Series(
            [1, 2, np.nan, np.nan] * 3, index=df.index, name="foo_series"
        )
        result_l, result_r = df.align(series, axis=0)

        tm.assert_frame_equal(result_l, df)
        tm.assert_series_equal(result_r, expected_r)

    def test_multiindex_align_to_series_with_common_index_level_missing_in_both(self):
        """"bar" labels differ on both sides; the frame's index is kept."""
        # GH-46001
        foo_index = Index([1, 2, 3], name="foo")
        bar_index = Index([1, 3, 4], name="bar")

        series = Series(
            [1, 2, 3], index=Index([1, 2, 4], name="bar"), name="foo_series"
        )
        df = DataFrame(
            {"col": np.arange(9)},
            index=pd.MultiIndex.from_product([foo_index, bar_index]),
        )

        expected_r = Series([1, np.nan, 3] * 3, index=df.index, name="foo_series")
        result_l, result_r = df.align(series, axis=0)

        tm.assert_frame_equal(result_l, df)
        tm.assert_series_equal(result_r, expected_r)

    def test_multiindex_align_to_series_with_common_index_level_non_unique_cols(self):
        """Duplicate column labels on the frame are preserved by the alignment."""
        # GH-46001
        foo_index = Index([1, 2, 3], name="foo")
        bar_index = Index([1, 2], name="bar")

        series = Series([1, 2], index=bar_index, name="foo_series")
        df = DataFrame(
            np.arange(18).reshape(6, 3),
            index=pd.MultiIndex.from_product([foo_index, bar_index]),
        )
        df.columns = ["cfoo", "cbar", "cfoo"]

        expected = Series([1, 2] * 3, index=df.index, name="foo_series")
        result_left, result_right = df.align(series, axis=0)

        tm.assert_series_equal(result_right, expected)
        tm.assert_index_equal(result_left.columns, df.columns)

    def test_missing_axis_specification_exception(self):
        """Aligning a frame with a Series without ``axis`` raises ``ValueError``."""
        df = DataFrame(np.arange(50).reshape((10, 5)))
        series = Series(np.arange(5))

        with pytest.raises(ValueError, match=r"axis=0 or 1"):
            df.align(series)

    @pytest.mark.parametrize("method", ["pad", "bfill"])
    @pytest.mark.parametrize("axis", [0, 1, None])
    @pytest.mark.parametrize("fill_axis", [0, 1])
    @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
    @pytest.mark.parametrize(
        "left_slice",
        [
            [slice(4), slice(10)],
            [slice(0), slice(0)],
        ],
    )
    @pytest.mark.parametrize(
        "right_slice",
        [
            [slice(2, None), slice(6, None)],
            [slice(0), slice(0)],
        ],
    )
    @pytest.mark.parametrize("limit", [1, None])
    def test_align_fill_method(
        self, how, method, axis, fill_axis, float_frame, left_slice, right_slice, limit
    ):
        """``align`` with fill keywords equals reindexing then ``fillna``."""
        frame = float_frame
        left = frame.iloc[left_slice[0], left_slice[1]]
        right = frame.iloc[right_slice[0], right_slice[1]]

        msg = (
            "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
            "are deprecated"
        )

        with tm.assert_produces_warning(FutureWarning, match=msg):
            aa, ab = left.align(
                right,
                axis=axis,
                join=how,
                method=method,
                limit=limit,
                fill_axis=fill_axis,
            )

        join_index, join_columns = None, None

        # Build the expected result by hand: join the requested axes, reindex
        # both operands, then fill.
        ea, eb = left, right
        if axis is None or axis == 0:
            join_index = left.index.join(right.index, how=how)
            ea = ea.reindex(index=join_index)
            eb = eb.reindex(index=join_index)

        if axis is None or axis == 1:
            join_columns = left.columns.join(right.columns, how=how)
            ea = ea.reindex(columns=join_columns)
            eb = eb.reindex(columns=join_columns)

        msg = "DataFrame.fillna with 'method' is deprecated"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            ea = ea.fillna(axis=fill_axis, method=method, limit=limit)
            eb = eb.fillna(axis=fill_axis, method=method, limit=limit)

        tm.assert_frame_equal(aa, ea)
        tm.assert_frame_equal(ab, eb)

    def test_align_series_check_copy(self):
        """Mutating the input Series afterwards must not change the aligned one."""
        # GH#
        df = DataFrame({0: [1, 2]})
        ser = Series([1], name=0)
        expected = ser.copy()
        result, other = df.align(ser, axis=1)
        ser.iloc[0] = 100
        tm.assert_series_equal(other, expected)

    def test_align_identical_different_object(self):
        """With identical row indexes, ``align`` still returns new objects."""
        # GH#51032
        df = DataFrame({"a": [1, 2]})
        ser = Series([3, 4])
        result, result2 = df.align(ser, axis=0)
        tm.assert_frame_equal(result, df)
        tm.assert_series_equal(result2, ser)
        assert df is not result
        assert ser is not result2

    def test_align_identical_different_object_columns(self):
        """Same check as above, aligning the Series against the columns."""
        # GH#51032
        df = DataFrame({"a": [1, 2]})
        ser = Series([1], index=["a"])
        result, result2 = df.align(ser, axis=1)
        tm.assert_frame_equal(result, df)
        tm.assert_series_equal(result2, ser)
        assert df is not result
        assert ser is not result2
|
@ -0,0 +1,263 @@
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs.tslibs.offsets import MonthEnd
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
Series,
|
||||
date_range,
|
||||
period_range,
|
||||
to_datetime,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.tseries import offsets
|
||||
|
||||
|
||||
class TestAsFreq:
    """Tests for ``asfreq`` on DataFrame (and, where parametrized, Series)."""

    @pytest.fixture(params=["s", "ms", "us", "ns"])
    def unit(self, request):
        """Parametrize a test over the four datetime resolutions listed above."""
        return request.param

    def test_asfreq2(self, frame_or_series):
        """Upsample to business days and back to month-end; check fill_value."""
        ts = frame_or_series(
            [0.0, 1.0, 2.0],
            index=DatetimeIndex(
                [
                    datetime(2009, 10, 30),
                    datetime(2009, 11, 30),
                    datetime(2009, 12, 31),
                ],
                dtype="M8[ns]",
                freq="BME",
            ),
        )

        daily_ts = ts.asfreq("B")
        monthly_ts = daily_ts.asfreq("BME")
        tm.assert_equal(monthly_ts, ts)

        daily_ts = ts.asfreq("B", method="pad")
        monthly_ts = daily_ts.asfreq("BME")
        tm.assert_equal(monthly_ts, ts)

        daily_ts = ts.asfreq(offsets.BDay())
        monthly_ts = daily_ts.asfreq(offsets.BMonthEnd())
        tm.assert_equal(monthly_ts, ts)

        # An empty selection yields an empty result that is a new object.
        result = ts[:0].asfreq("ME")
        assert len(result) == 0
        assert result is not ts

        if frame_or_series is Series:
            # Every inserted day carries the fill value; originals are kept.
            daily_ts = ts.asfreq("D", fill_value=-1)
            result = daily_ts.value_counts().sort_index()
            expected = Series(
                [60, 1, 1, 1], index=[-1.0, 2.0, 1.0, 0.0], name="count"
            ).sort_index()
            tm.assert_series_equal(result, expected)

    def test_asfreq_datetimeindex_empty(self, frame_or_series):
        """An object built without data gets the same index as one with data."""
        # GH#14320
        index = DatetimeIndex(["2016-09-29 11:00"])
        expected = frame_or_series(index=index, dtype=object).asfreq("h")
        result = frame_or_series([3], index=index.copy()).asfreq("h")
        tm.assert_index_equal(expected.index, result.index)

    @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"])
    def test_tz_aware_asfreq_smoke(self, tz, frame_or_series):
        """Smoke test: ``asfreq`` on a tz-aware index runs without raising."""
        dr = date_range("2011-12-01", "2012-07-20", freq="D", tz=tz)

        obj = frame_or_series(
            np.random.default_rng(2).standard_normal(len(dr)), index=dr
        )

        # it works!
        obj.asfreq("min")

    def test_asfreq_normalize(self, frame_or_series):
        """``normalize=True`` gives the same data on a midnight-based index."""
        rng = date_range("1/1/2000 09:30", periods=20)
        norm = date_range("1/1/2000", periods=20)

        vals = np.random.default_rng(2).standard_normal((20, 3))

        obj = DataFrame(vals, index=rng)
        expected = DataFrame(vals, index=norm)
        if frame_or_series is Series:
            obj = obj[0]
            expected = expected[0]

        result = obj.asfreq("D", normalize=True)
        tm.assert_equal(result, expected)

    def test_asfreq_keep_index_name(self, frame_or_series):
        """The index name survives ``asfreq``."""
        # GH#9854
        index_name = "bar"
        index = date_range("20130101", periods=20, name=index_name)
        obj = DataFrame(list(range(20)), columns=["foo"], index=index)
        obj = tm.get_obj(obj, frame_or_series)

        assert index_name == obj.index.name
        assert index_name == obj.asfreq("10D").index.name

    def test_asfreq_ts(self, frame_or_series):
        """On a PeriodIndex, ``asfreq`` matches converting the index directly."""
        index = period_range(freq="Y", start="1/1/2001", end="12/31/2010")
        obj = DataFrame(
            np.random.default_rng(2).standard_normal((len(index), 3)), index=index
        )
        obj = tm.get_obj(obj, frame_or_series)

        result = obj.asfreq("D", how="end")
        exp_index = index.asfreq("D", how="end")
        assert len(result) == len(obj)
        tm.assert_index_equal(result.index, exp_index)

        result = obj.asfreq("D", how="start")
        exp_index = index.asfreq("D", how="start")
        assert len(result) == len(obj)
        tm.assert_index_equal(result.index, exp_index)

    def test_asfreq_resample_set_correct_freq(self, frame_or_series):
        """Both ``asfreq`` and ``resample().asfreq()`` set ``index.freq``."""
        # GH#5613
        # we test if .asfreq() and .resample() set the correct value for .freq
        dti = to_datetime(["2012-01-01", "2012-01-02", "2012-01-03"])
        obj = DataFrame({"col": [1, 2, 3]}, index=dti)
        obj = tm.get_obj(obj, frame_or_series)

        # testing the settings before calling .asfreq() and .resample()
        assert obj.index.freq is None
        assert obj.index.inferred_freq == "D"

        # does .asfreq() set .freq correctly?
        assert obj.asfreq("D").index.freq == "D"

        # does .resample() set .freq correctly?
        assert obj.resample("D").asfreq().index.freq == "D"

    def test_asfreq_empty(self, datetime_frame):
        """``asfreq`` on a length-0 frame returns a new object without raising."""
        # test does not blow up on length-0 DataFrame
        zero_length = datetime_frame.reindex([])
        result = zero_length.asfreq("BME")
        assert result is not zero_length

    def test_asfreq(self, datetime_frame):
        """An offset object and its string alias give the same result."""
        offset_monthly = datetime_frame.asfreq(offsets.BMonthEnd())
        rule_monthly = datetime_frame.asfreq("BME")

        tm.assert_frame_equal(offset_monthly, rule_monthly)

        rule_monthly.asfreq("B", method="pad")
        # TODO: actually check that this worked.

        # don't forget!
        rule_monthly.asfreq("B", method="pad")

    def test_asfreq_datetimeindex(self):
        """A list-of-datetimes index comes back as ``DatetimeIndex``."""
        df = DataFrame(
            {"A": [1, 2, 3]},
            index=[datetime(2011, 11, 1), datetime(2011, 11, 2), datetime(2011, 11, 3)],
        )
        df = df.asfreq("B")
        assert isinstance(df.index, DatetimeIndex)

        ts = df["A"].asfreq("B")
        assert isinstance(ts.index, DatetimeIndex)

    def test_asfreq_fillvalue(self):
        """``fill_value`` fills only newly created rows, not pre-existing NaN."""
        # test for fill value during upsampling, related to issue 3715

        # setup
        rng = date_range("1/1/2016", periods=10, freq="2s")
        # Explicit cast to 'float' to avoid implicit cast when setting None
        ts = Series(np.arange(len(rng)), index=rng, dtype="float")
        df = DataFrame({"one": ts})

        # insert pre-existing missing value
        df.loc["2016-01-01 00:00:08", "one"] = None

        actual_df = df.asfreq(freq="1s", fill_value=9.0)
        expected_df = df.asfreq(freq="1s").fillna(9.0)
        # fillna also replaced the pre-existing gap; restore it for comparison
        expected_df.loc["2016-01-01 00:00:08", "one"] = None
        tm.assert_frame_equal(expected_df, actual_df)

        expected_series = ts.asfreq(freq="1s").fillna(9.0)
        actual_series = ts.asfreq(freq="1s", fill_value=9.0)
        tm.assert_series_equal(expected_series, actual_series)

    def test_asfreq_with_date_object_index(self, frame_or_series):
        """An index of ``date`` objects gives the same result as timestamps."""
        rng = date_range("1/1/2000", periods=20)
        ts = frame_or_series(np.random.default_rng(2).standard_normal(20), index=rng)

        ts2 = ts.copy()
        ts2.index = [x.date() for x in ts2.index]

        result = ts2.asfreq("4h", method="ffill")
        expected = ts.asfreq("4h", method="ffill")
        tm.assert_equal(result, expected)

    def test_asfreq_with_unsorted_index(self, frame_or_series):
        """Rows are kept, in sorted order, when the input index is unsorted."""
        # GH#39805
        # Test that rows are not dropped when the datetime index is out of order
        index = to_datetime(["2021-01-04", "2021-01-02", "2021-01-03", "2021-01-01"])
        result = frame_or_series(range(4), index=index)

        expected = result.reindex(sorted(index))
        expected.index = expected.index._with_freq("infer")

        result = result.asfreq("D")
        tm.assert_equal(result, expected)

    def test_asfreq_after_normalize(self, unit):
        """A normalized index in any ``unit`` accepts ``freq="D"``."""
        # https://github.com/pandas-dev/pandas/issues/50727
        result = DatetimeIndex(
            date_range("2000", periods=2).as_unit(unit).normalize(), freq="D"
        )
        expected = DatetimeIndex(["2000-01-01", "2000-01-02"], freq="D").as_unit(unit)
        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize(
        "freq, freq_half",
        [
            ("2ME", "ME"),
            (MonthEnd(2), MonthEnd(1)),
        ],
    )
    def test_asfreq_2ME(self, freq, freq_half):
        """Converting month-end data to two-month-end keeps every other row."""
        index = date_range("1/1/2000", periods=6, freq=freq_half)
        df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0, 4.0, 5.0], index=index)})
        expected = df.asfreq(freq=freq)

        index = date_range("1/1/2000", periods=3, freq=freq)
        result = DataFrame({"s": Series([0.0, 2.0, 4.0], index=index)})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "freq, freq_depr",
        [
            ("2ME", "2M"),
            ("2QE", "2Q"),
            ("2QE-SEP", "2Q-SEP"),
            ("1BQE", "1BQ"),
            ("2BQE-SEP", "2BQ-SEP"),
            ("1YE", "1Y"),
            ("2YE-MAR", "2Y-MAR"),
            ("1YE", "1A"),
            ("2YE-MAR", "2A-MAR"),
            ("2BYE-MAR", "2BA-MAR"),
        ],
    )
    def test_asfreq_frequency_M_Q_Y_A_deprecated(self, freq, freq_depr):
        """Deprecated aliases warn and give the same result as the new alias."""
        # GH#9586, #55978
        # Parenthesised so both f-strings concatenate into one pattern; without
        # the parentheses the second literal is a no-op statement and only the
        # first half of the message is matched.
        depr_msg = (
            f"'{freq_depr[1:]}' is deprecated and will be removed "
            f"in a future version, please use '{freq[1:]}' instead."
        )

        # ``[1:]`` drops the leading single-digit multiplier from the alias.
        index = date_range("1/1/2000", periods=4, freq=f"{freq[1:]}")
        df = DataFrame({"s": Series([0.0, 1.0, 2.0, 3.0], index=index)})
        expected = df.asfreq(freq=freq)
        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            result = df.asfreq(freq=freq_depr)
        tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,198 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs.tslibs import IncompatibleFrequency
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Period,
|
||||
Series,
|
||||
Timestamp,
|
||||
date_range,
|
||||
period_range,
|
||||
to_datetime,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.fixture
def date_range_frame():
    """
    Provide a 50-row DataFrame indexed by a 53-second ``date_range``.

    Columns 'A' and 'B' both hold ``np.arange(50)``.
    """
    n_rows = 50
    stamps = date_range("1/1/1990", periods=n_rows, freq="53s")
    columns = {"A": np.arange(n_rows), "B": np.arange(n_rows)}
    return DataFrame(columns, index=stamps)
|
||||
|
||||
|
||||
class TestFrameAsof:
    """Tests for ``DataFrame.asof``, plus one ``Series.asof`` dtype check."""

    def test_basic(self, date_range_frame):
        """Lookups over a NaN window return fully non-null rows."""
        n_rows = 50
        # Explicitly cast to float to avoid implicit cast when setting np.nan
        frame = date_range_frame.astype({"A": "float"})
        frame.loc[frame.index[15:30], "A"] = np.nan
        lookups = date_range("1/1/1990", periods=n_rows * 3, freq="25s")

        from_index = frame.asof(lookups)
        assert from_index.notna().all(1).all()

        # Same lookups, passed as a plain list instead of an index.
        from_list = frame.asof(list(lookups))
        assert from_list.notna().all(1).all()

        # Lookups between index[14] and index[30] must all report the value 14.
        lower, upper = frame.index[14], frame.index[30]
        in_window = (from_list.index >= lower) & (from_list.index < upper)
        assert (from_list[in_window] == 14).all(1).all()

    def test_subset(self, date_range_frame):
        """The ``subset`` argument controls which columns drive the NaN skip."""
        n_rows = 10
        # explicitly cast to float to avoid implicit upcast when setting to np.nan
        frame = date_range_frame.iloc[:n_rows].copy().astype({"A": "float"})
        frame.loc[frame.index[4:8], "A"] = np.nan
        lookups = date_range("1/1/1990", periods=n_rows * 3, freq="25s")

        # with a subset of A should be the same
        restricted = frame.asof(lookups, subset="A")
        unrestricted = frame.asof(lookups)
        tm.assert_frame_equal(restricted, unrestricted)

        # same with A/B
        restricted = frame.asof(lookups, subset=["A", "B"])
        unrestricted = frame.asof(lookups)
        tm.assert_frame_equal(restricted, unrestricted)

        # B gives df.asof
        restricted = frame.asof(lookups, subset="B")
        resampled = frame.resample("25s", closed="right").ffill().reindex(lookups)
        resampled.iloc[20:] = 9
        # no "missing", so "B" can retain int dtype (df["A"].dtype platform-dependent)
        resampled["B"] = resampled["B"].astype(frame["B"].dtype)

        tm.assert_frame_equal(restricted, resampled)

    def test_missing(self, date_range_frame):
        """A lookup before the first index entry yields all-NaN output."""
        # GH 15118
        # no match found - `where` value before earliest date in index
        n_rows = 10
        # Cast to 'float64' to avoid upcast when introducing nan in df.asof
        frame = date_range_frame.iloc[:n_rows].copy().astype("float64")

        scalar_result = frame.asof("1989-12-31")
        scalar_expected = Series(
            index=["A", "B"], name=Timestamp("1989-12-31"), dtype=np.float64
        )
        tm.assert_series_equal(scalar_result, scalar_expected)

        listlike_result = frame.asof(to_datetime(["1989-12-31"]))
        listlike_expected = DataFrame(
            index=to_datetime(["1989-12-31"]), columns=["A", "B"], dtype="float64"
        )
        tm.assert_frame_equal(listlike_result, listlike_expected)

        # Check that we handle PeriodIndex correctly, dont end up with
        # period.ordinal for series name
        period_frame = frame.to_period("D")
        period_result = period_frame.asof("1989-12-31")
        assert isinstance(period_result.name, Period)

    def test_asof_all_nans(self, frame_or_series):
        """An all-NaN DataFrame/Series stays all-NaN under asof."""
        # GH 15713
        # DataFrame/Series is all nans
        looked_up = frame_or_series([np.nan]).asof([0])
        reference = frame_or_series([np.nan])
        tm.assert_equal(looked_up, reference)

    def test_all_nans(self, date_range_frame):
        """All-NaN frames return all-NaN output for list and scalar lookups."""
        # GH 15713
        # DataFrame is all nans

        # testing non-default indexes, multiple inputs
        n_lookups = 150
        source_index = date_range_frame.index
        lookups = date_range("1/1/1990", periods=n_lookups, freq="25s")
        one_col = DataFrame(np.nan, index=source_index, columns=["A"]).asof(lookups)
        tm.assert_frame_equal(
            one_col, DataFrame(np.nan, index=lookups, columns=["A"])
        )

        # testing multiple columns
        lookups = date_range("1/1/1990", periods=n_lookups, freq="25s")
        labels = ["A", "B", "C"]
        many_cols = DataFrame(np.nan, index=source_index, columns=labels).asof(lookups)
        tm.assert_frame_equal(
            many_cols, DataFrame(np.nan, index=lookups, columns=labels)
        )

        # testing scalar input
        listlike = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof([3])
        tm.assert_frame_equal(
            listlike, DataFrame(np.nan, index=[3], columns=["A", "B"])
        )

        scalar = DataFrame(np.nan, index=[1, 2], columns=["A", "B"]).asof(3)
        tm.assert_series_equal(scalar, Series(np.nan, index=["A", "B"], name=3))

    @pytest.mark.parametrize(
        "stamp,expected",
        [
            (
                Timestamp("2018-01-01 23:22:43.325+00:00"),
                Series(2, name=Timestamp("2018-01-01 23:22:43.325+00:00")),
            ),
            (
                Timestamp("2018-01-01 22:33:20.682+01:00"),
                Series(1, name=Timestamp("2018-01-01 22:33:20.682+01:00")),
            ),
        ],
    )
    def test_time_zone_aware_index(self, stamp, expected):
        """Lookups honour the UTC offset of a tz-aware stamp."""
        # GH21194
        # Testing awareness of DataFrame index considering different
        # UTC and timezone
        utc_stamps = [
            Timestamp("2018-01-01 21:00:05.001+00:00"),
            Timestamp("2018-01-01 22:35:10.550+00:00"),
        ]
        frame = DataFrame(data=[1, 2], index=utc_stamps)

        tm.assert_series_equal(frame.asof(stamp), expected)

    def test_is_copy(self, date_range_frame):
        """Assigning a column on the asof result emits no warning."""
        # GH-27357, GH-30784: ensure the result of asof is an actual copy and
        # doesn't track the parent dataframe / doesn't give SettingWithCopy warnings
        n_rows = 50
        frame = date_range_frame.astype({"A": "float"})
        frame.loc[frame.index[15:30], "A"] = np.nan
        lookups = date_range("1/1/1990", periods=n_rows * 3, freq="25s")

        looked_up = frame.asof(lookups)

        with tm.assert_produces_warning(None):
            looked_up["C"] = 1

    def test_asof_periodindex_mismatched_freq(self):
        """A PeriodIndex lookup with a different freq raises."""
        n_rows = 50
        hourly = period_range("1/1/1990", periods=n_rows, freq="h")
        values = np.random.default_rng(2).standard_normal(n_rows)
        frame = DataFrame(values, index=hourly)

        # Mismatched freq
        with pytest.raises(IncompatibleFrequency, match="Input has different freq"):
            frame.asof(hourly.asfreq("D"))

    def test_asof_preserves_bool_dtype(self):
        """Series.asof keeps bool values as bool."""
        # GH#16063 was casting bools to floats
        month_starts = date_range("2017-01-01", freq="MS", periods=4)
        flags = Series([True, False, True], index=month_starts[:-1])

        last_stamp = month_starts[-1]
        looked_up = flags.asof([last_stamp])

        tm.assert_series_equal(looked_up, Series([True], index=[last_stamp]))
|
@ -0,0 +1,84 @@
|
||||
import pytest
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestAssign:
|
||||
def test_assign(self):
|
||||
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
|
||||
original = df.copy()
|
||||
result = df.assign(C=df.B / df.A)
|
||||
expected = df.copy()
|
||||
expected["C"] = [4, 2.5, 2]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# lambda syntax
|
||||
result = df.assign(C=lambda x: x.B / x.A)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# original is unmodified
|
||||
tm.assert_frame_equal(df, original)
|
||||
|
||||
# Non-Series array-like
|
||||
result = df.assign(C=[4, 2.5, 2])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
# original is unmodified
|
||||
tm.assert_frame_equal(df, original)
|
||||
|
||||
result = df.assign(B=df.B / df.A)
|
||||
expected = expected.drop("B", axis=1).rename(columns={"C": "B"})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# overwrite
|
||||
result = df.assign(A=df.A + df.B)
|
||||
expected = df.copy()
|
||||
expected["A"] = [5, 7, 9]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# lambda
|
||||
result = df.assign(A=lambda x: x.A + x.B)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_assign_multiple(self):
|
||||
df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=["A", "B"])
|
||||
result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B)
|
||||
expected = DataFrame(
|
||||
[[1, 4, 7, 1, 4], [2, 5, 8, 2, 5], [3, 6, 9, 3, 6]], columns=list("ABCDE")
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_assign_order(self):
|
||||
# GH 9818
|
||||
df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
|
||||
result = df.assign(D=df.A + df.B, C=df.A - df.B)
|
||||
|
||||
expected = DataFrame([[1, 2, 3, -1], [3, 4, 7, -1]], columns=list("ABDC"))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
result = df.assign(C=df.A - df.B, D=df.A + df.B)
|
||||
|
||||
expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], columns=list("ABCD"))
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_assign_bad(self):
|
||||
df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
|
||||
|
||||
# non-keyword argument
|
||||
msg = r"assign\(\) takes 1 positional argument but 2 were given"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
df.assign(lambda x: x.A)
|
||||
msg = "'DataFrame' object has no attribute 'C'"
|
||||
with pytest.raises(AttributeError, match=msg):
|
||||
df.assign(C=df.A, D=df.A + df.C)
|
||||
|
||||
def test_assign_dependent(self):
|
||||
df = DataFrame({"A": [1, 2], "B": [3, 4]})
|
||||
|
||||
result = df.assign(C=df.A, D=lambda x: x["A"] + x["C"])
|
||||
expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD"))
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.assign(C=lambda df: df.A, D=lambda df: df["A"] + df["C"])
|
||||
expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], columns=list("ABCD"))
|
||||
tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,911 @@
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical,
|
||||
CategoricalDtype,
|
||||
DataFrame,
|
||||
DatetimeTZDtype,
|
||||
Index,
|
||||
Interval,
|
||||
IntervalDtype,
|
||||
NaT,
|
||||
Series,
|
||||
Timedelta,
|
||||
Timestamp,
|
||||
concat,
|
||||
date_range,
|
||||
option_context,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def _check_cast(df, v):
    """
    Assert that every column of df has a dtype whose name equals v
    """
    dtype_names = [column.dtype.name for _, column in df.items()]
    assert all(name == v for name in dtype_names)
|
||||
|
||||
|
||||
class TestAstype:
|
||||
def test_astype_float(self, float_frame):
|
||||
casted = float_frame.astype(int)
|
||||
expected = DataFrame(
|
||||
float_frame.values.astype(int),
|
||||
index=float_frame.index,
|
||||
columns=float_frame.columns,
|
||||
)
|
||||
tm.assert_frame_equal(casted, expected)
|
||||
|
||||
casted = float_frame.astype(np.int32)
|
||||
expected = DataFrame(
|
||||
float_frame.values.astype(np.int32),
|
||||
index=float_frame.index,
|
||||
columns=float_frame.columns,
|
||||
)
|
||||
tm.assert_frame_equal(casted, expected)
|
||||
|
||||
float_frame["foo"] = "5"
|
||||
casted = float_frame.astype(int)
|
||||
expected = DataFrame(
|
||||
float_frame.values.astype(int),
|
||||
index=float_frame.index,
|
||||
columns=float_frame.columns,
|
||||
)
|
||||
tm.assert_frame_equal(casted, expected)
|
||||
|
||||
def test_astype_mixed_float(self, mixed_float_frame):
    """Columns A and B can be cast to float32 and to float16."""
    # mixed casting
    for target in ("float32", "float16"):
        casted = mixed_float_frame.reindex(columns=["A", "B"]).astype(target)
        _check_cast(casted, target)
|
||||
|
||||
def test_astype_mixed_type(self):
    """The numeric part of a mixed frame casts uniformly to each target dtype."""
    # mixed casting
    source = DataFrame(
        {
            "a": 1.0,
            "b": 2,
            "c": "foo",
            "float32": np.array([1.0] * 10, dtype="float32"),
            "int32": np.array([1] * 10, dtype="int32"),
        },
        index=np.arange(10),
    )
    numeric = source._get_numeric_data().copy()
    numeric["little_float"] = np.array(12345.0, dtype="float16")
    numeric["big_float"] = np.array(123456789101112.0, dtype="float64")

    for target in ("float64", "int64"):
        _check_cast(numeric.astype(target), target)

    # float16 is only checked on the column that was created as float16.
    only_little = numeric.reindex(columns=["little_float"]).astype("float16")
    _check_cast(only_little, "float16")

    for target in ("float32", "int32"):
        _check_cast(numeric.astype(target), target)

    # to object
    _check_cast(numeric.astype("O"), "object")
|
||||
|
||||
def test_astype_with_exclude_string(self, float_frame):
|
||||
df = float_frame.copy()
|
||||
expected = float_frame.astype(int)
|
||||
df["string"] = "foo"
|
||||
casted = df.astype(int, errors="ignore")
|
||||
|
||||
expected["string"] = "foo"
|
||||
tm.assert_frame_equal(casted, expected)
|
||||
|
||||
df = float_frame.copy()
|
||||
expected = float_frame.astype(np.int32)
|
||||
df["string"] = "foo"
|
||||
casted = df.astype(np.int32, errors="ignore")
|
||||
|
||||
expected["string"] = "foo"
|
||||
tm.assert_frame_equal(casted, expected)
|
||||
|
||||
def test_astype_with_view_float(self, float_frame):
|
||||
# this is the only real reason to do it this way
|
||||
tf = np.round(float_frame).astype(np.int32)
|
||||
tf.astype(np.float32, copy=False)
|
||||
|
||||
# TODO(wesm): verification?
|
||||
tf = float_frame.astype(np.float64)
|
||||
tf.astype(np.int64, copy=False)
|
||||
|
||||
def test_astype_with_view_mixed_float(self, mixed_float_frame):
|
||||
tf = mixed_float_frame.reindex(columns=["A", "B", "C"])
|
||||
|
||||
tf.astype(np.int64)
|
||||
tf.astype(np.float32)
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.int32, np.int64])
@pytest.mark.parametrize("val", [np.nan, np.inf])
def test_astype_cast_nan_inf_int(self, val, dtype):
    """Casting a NaN or inf value to an integer dtype raises ValueError."""
    # see GH#14265
    #
    # Check NaN and inf --> raise error when converting to int.
    non_finite = DataFrame([val])
    msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"

    with pytest.raises(ValueError, match=msg):
        non_finite.astype(dtype)
|
||||
|
||||
def test_astype_str(self):
|
||||
# see GH#9757
|
||||
a = Series(date_range("2010-01-04", periods=5))
|
||||
b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
|
||||
c = Series([Timedelta(x, unit="d") for x in range(5)])
|
||||
d = Series(range(5))
|
||||
e = Series([0.0, 0.2, 0.4, 0.6, 0.8])
|
||||
|
||||
df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e})
|
||||
|
||||
# Datetime-like
|
||||
result = df.astype(str)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": list(map(str, (Timestamp(x)._date_repr for x in a._values))),
|
||||
"b": list(map(str, map(Timestamp, b._values))),
|
||||
"c": [Timedelta(x)._repr_base() for x in c._values],
|
||||
"d": list(map(str, d._values)),
|
||||
"e": list(map(str, e._values)),
|
||||
},
|
||||
dtype="object",
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_astype_str_float(self):
|
||||
# see GH#11302
|
||||
result = DataFrame([np.nan]).astype(str)
|
||||
expected = DataFrame(["nan"], dtype="object")
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
result = DataFrame([1.12345678901234567890]).astype(str)
|
||||
|
||||
val = "1.1234567890123457"
|
||||
expected = DataFrame([val], dtype="object")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("dtype_class", [dict, Series])
def test_astype_dict_like(self, dtype_class):
    """astype accepts a column-to-dtype mapping given as a dict or a Series."""
    # GH7271 & GH16717
    dates = Series(date_range("2010-01-04", periods=5))
    ints = Series(range(5))
    floats = Series([0.0, 0.2, 0.4, 0.6, 0.8])
    texts = Series(["1.0", "2", "3.14", "4", "5.4"])
    frame = DataFrame({"a": dates, "b": ints, "c": floats, "d": texts})
    pristine = frame.copy(deep=True)

    # change type of a subset of columns
    subset_mapping = dtype_class({"b": "str", "d": "float32"})
    result = frame.astype(subset_mapping)
    expected = DataFrame(
        {
            "a": dates,
            "b": Series(["0", "1", "2", "3", "4"], dtype="object"),
            "c": floats,
            "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"),
        }
    )
    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(frame, pristine)

    float_mapping = dtype_class({"b": np.float32, "c": "float32", "d": np.float64})
    result = frame.astype(float_mapping)
    expected = DataFrame(
        {
            "a": dates,
            "b": Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"),
            "c": Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"),
            "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"),
        }
    )
    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(frame, pristine)

    # change all columns
    all_str_mapping = dtype_class({"a": str, "b": str, "c": str, "d": str})
    tm.assert_frame_equal(frame.astype(all_str_mapping), frame.astype(str))
    tm.assert_frame_equal(frame, pristine)

    # error should be raised when using something other than column labels
    # in the keys of the dtype dict
    bad_mappings = [
        (dtype_class({"b": str, 2: str}), 2),
        (dtype_class({"e": str}), "e"),
    ]
    msg_frame = (
        "Only a column name can be used for the key in a dtype mappings argument. "
        "'{}' not found in columns."
    )
    for mapping, missing_key in bad_mappings:
        with pytest.raises(KeyError, match=msg_frame.format(missing_key)):
            frame.astype(mapping)
    tm.assert_frame_equal(frame, pristine)

    # if the dtypes provided are the same as the original dtypes, the
    # resulting DataFrame should be the same as the original DataFrame
    identity_mapping = dtype_class({col: frame[col].dtype for col in frame.columns})
    equiv = frame.astype(identity_mapping)
    tm.assert_frame_equal(frame, equiv)
    tm.assert_frame_equal(frame, pristine)

    # GH#16717
    # if dtypes provided is empty, the resulting DataFrame
    # should be the same as the original DataFrame
    if dtype_class is dict:
        empty_mapping = dtype_class({})
    else:
        empty_mapping = dtype_class({}, dtype=object)
    equiv = frame.astype(empty_mapping)
    tm.assert_frame_equal(frame, equiv)
    tm.assert_frame_equal(frame, pristine)
|
||||
|
||||
def test_astype_duplicate_col(self):
|
||||
a1 = Series([1, 2, 3, 4, 5], name="a")
|
||||
b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name="b")
|
||||
a2 = Series([0, 1, 2, 3, 4], name="a")
|
||||
df = concat([a1, b, a2], axis=1)
|
||||
|
||||
result = df.astype(str)
|
||||
a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a")
|
||||
b_str = Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b")
|
||||
a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a")
|
||||
expected = concat([a1_str, b_str, a2_str], axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.astype({"a": "str"})
|
||||
expected = concat([a1_str, b, a2_str], axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_astype_duplicate_col_series_arg(self):
|
||||
# GH#44417
|
||||
vals = np.random.default_rng(2).standard_normal((3, 4))
|
||||
df = DataFrame(vals, columns=["A", "B", "C", "A"])
|
||||
dtypes = df.dtypes
|
||||
dtypes.iloc[0] = str
|
||||
dtypes.iloc[2] = "Float64"
|
||||
|
||||
result = df.astype(dtypes)
|
||||
expected = DataFrame(
|
||||
{
|
||||
0: Series(vals[:, 0].astype(str), dtype=object),
|
||||
1: vals[:, 1],
|
||||
2: pd.array(vals[:, 2], dtype="Float64"),
|
||||
3: vals[:, 3],
|
||||
}
|
||||
)
|
||||
expected.columns = df.columns
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
    "dtype",
    [
        "category",
        CategoricalDtype(),
        CategoricalDtype(ordered=True),
        CategoricalDtype(ordered=False),
        CategoricalDtype(categories=list("abcdef")),
        CategoricalDtype(categories=list("edba"), ordered=False),
        CategoricalDtype(categories=list("edcb"), ordered=True),
    ],
    ids=repr,
)
def test_astype_categorical(self, dtype):
    """A frame-wide categorical cast equals building each column as Categorical."""
    # GH#18099
    raw = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")}
    result = DataFrame(raw).astype(dtype)
    expected = DataFrame(
        {label: Categorical(values, dtype=dtype) for label, values in raw.items()}
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("cls", [CategoricalDtype, DatetimeTZDtype, IntervalDtype])
def test_astype_categoricaldtype_class_raises(self, cls):
    """Passing a dtype class instead of an instance raises TypeError."""
    frame = DataFrame({"A": ["a", "a", "b", "c"]})
    xpr = f"Expected an instance of {cls.__name__}"

    # Frame-level cast through a mapping.
    with pytest.raises(TypeError, match=xpr):
        frame.astype({"A": cls})

    # Series-level cast on the same column.
    column = frame["A"]
    with pytest.raises(TypeError, match=xpr):
        column.astype(cls)
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
def test_astype_extension_dtypes(self, dtype):
    """Float frames round-trip through the masked integer dtypes."""
    # GH#22578

    def make_floats():
        # Fresh two-column float frame for each scenario.
        return DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"])

    frame = make_floats()
    all_masked = DataFrame(
        {
            "a": pd.array([1, 3, 5], dtype=dtype),
            "b": pd.array([2, 4, 6], dtype=dtype),
        }
    )
    tm.assert_frame_equal(frame.astype(dtype), all_masked)
    tm.assert_frame_equal(frame.astype("int64").astype(dtype), all_masked)
    tm.assert_frame_equal(frame.astype(dtype).astype("float64"), frame)

    # Same checks starting from a frame where only "b" is already masked.
    frame = make_floats()
    frame["b"] = frame["b"].astype(dtype)
    half_masked = DataFrame(
        {"a": [1.0, 3.0, 5.0], "b": pd.array([2, 4, 6], dtype=dtype)}
    )
    tm.assert_frame_equal(frame, half_masked)

    tm.assert_frame_equal(frame.astype(dtype), all_masked)
    tm.assert_frame_equal(frame.astype("int64").astype(dtype), all_masked)
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"])
def test_astype_extension_dtypes_1d(self, dtype):
    """Single-column variant of the masked integer cast checks."""
    # GH#22578
    frame = DataFrame({"a": [1.0, 2.0, 3.0]})
    masked = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)})
    tm.assert_frame_equal(frame.astype(dtype), masked)
    tm.assert_frame_equal(frame.astype("int64").astype(dtype), masked)

    # Cast the column in place first, then repeat the frame-level casts.
    frame = DataFrame({"a": [1.0, 2.0, 3.0]})
    frame["a"] = frame["a"].astype(dtype)
    after_column_cast = DataFrame({"a": pd.array([1, 2, 3], dtype=dtype)})
    tm.assert_frame_equal(frame, after_column_cast)

    tm.assert_frame_equal(frame.astype(dtype), masked)
    tm.assert_frame_equal(frame.astype("int64").astype(dtype), masked)
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["category", "Int64"])
def test_astype_extension_dtypes_duplicate_col(self, dtype):
    """Extension casts on duplicated labels equal casting each column alone."""
    # GH#24704
    left = Series([0, np.nan, 4], name="a")
    right = Series([np.nan, 3, 5], name="a")

    result = concat([left, right], axis=1).astype(dtype)
    expected = concat([col.astype(dtype) for col in (left, right)], axis=1)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
    "dtype", [{100: "float64", 200: "uint64"}, "category", "float64"]
)
def test_astype_column_metadata(self, dtype):
    """The columns Index (dtype and name) is unchanged by astype."""
    # GH#19920
    labels = Index([100, 200, 300], dtype=np.uint64, name="foo")
    frame = DataFrame(np.arange(15).reshape(5, 3), columns=labels)
    casted = frame.astype(dtype)
    tm.assert_index_equal(casted.columns, labels)
|
||||
|
||||
@pytest.mark.parametrize("unit", ["Y", "M", "W", "D", "h", "m"])
def test_astype_from_object_to_datetime_unit(self, unit):
    """Casting object strings to an unsupported datetime64 unit raises."""
    date_strings = [
        ["2015-01-01", "2015-01-02", "2015-01-03"],
        ["2017-01-01", "2017-01-02", "2017-02-03"],
    ]
    frame = DataFrame(date_strings, dtype=object)
    target = f"M8[{unit}]"
    msg = (
        rf"Unexpected value for 'dtype': 'datetime64\[{unit}\]'. "
        r"Must be 'datetime64\[s\]', 'datetime64\[ms\]', 'datetime64\[us\]', "
        r"'datetime64\[ns\]' or DatetimeTZDtype"
    )
    with pytest.raises(ValueError, match=msg):
        frame.astype(target)
|
||||
|
||||
@pytest.mark.parametrize("unit", ["Y", "M", "W", "D", "h", "m"])
def test_astype_from_object_to_timedelta_unit(self, unit):
    """Casting object strings to an unsupported timedelta64 unit raises."""
    duration_strings = [
        ["1 Day", "2 Days", "3 Days"],
        ["4 Days", "5 Days", "6 Days"],
    ]
    frame = DataFrame(duration_strings, dtype=object)
    target = f"m8[{unit}]"
    msg = (
        r"Cannot convert from timedelta64\[ns\] to timedelta64\[.*\]. "
        "Supported resolutions are 's', 'ms', 'us', 'ns'"
    )
    with pytest.raises(ValueError, match=msg):
        # TODO: this is ValueError while for DatetimeArray it is TypeError;
        # get these consistent
        frame.astype(target)
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["M8", "m8"])
@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
def test_astype_from_datetimelike_to_object(self, dtype, unit):
    """Casting datetimelike columns to object yields Timestamp/Timedelta scalars."""
    # tests astype to object dtype
    # GH#19223 / GH#12425
    full_dtype = f"{dtype}[{unit}]"
    frame = DataFrame(np.array([[1, 2, 3]], dtype=full_dtype))
    as_object = frame.astype(object)
    assert (as_object.dtypes == object).all()

    # "M8" is the datetime prefix, "m8" the timedelta prefix.
    scalar_type = Timestamp if full_dtype.startswith("M8") else Timedelta
    assert as_object.iloc[0, 0] == scalar_type(1, unit=unit)
|
||||
|
||||
@pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
@pytest.mark.parametrize("dtype", ["M8", "m8"])
@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
    """A numeric frame cast to a datetimelike dtype matches the numpy cast."""
    # tests all units from numeric origination
    # GH#19223 / GH#12425
    target = f"{dtype}[{unit}]"
    numbers = np.array([[1, 2, 3]], dtype=arr_dtype)

    result = DataFrame(numbers).astype(target)
    expected = DataFrame(numbers.astype(target))

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
def test_astype_to_datetime_unit(self, unit):
    """
    Cast datetime data to its own unit at frame, series, index and array level.

    Units "ns", "us", "ms", "s" must cast cleanly; the others must raise
    TypeError on every container.
    """
    # tests all units from datetime origination
    # GH#19223
    dtype = f"M8[{unit}]"
    arr = np.array([[1, 2, 3]], dtype=dtype)
    df = DataFrame(arr)
    ser = df.iloc[:, 0]
    idx = Index(ser)
    dta = ser._values

    if unit not in ["ns", "us", "ms", "s"]:
        # we use the nearest supported dtype (i.e. M8[s])
        msg = rf"Cannot cast DatetimeArray to dtype datetime64\[{unit}\]"
        with pytest.raises(TypeError, match=msg):
            df.astype(dtype)

        with pytest.raises(TypeError, match=msg):
            ser.astype(dtype)

        with pytest.raises(TypeError, match=msg.replace("Array", "Index")):
            idx.astype(dtype)

        with pytest.raises(TypeError, match=msg):
            dta.astype(dtype)

        return

    # GH#48928
    res_df = df.astype(dtype)

    exp_df = DataFrame(arr.astype(dtype))
    assert (exp_df.dtypes == dtype).all()
    tm.assert_frame_equal(res_df, exp_df)

    res_ser = ser.astype(dtype)
    exp_ser = exp_df.iloc[:, 0]
    assert exp_ser.dtype == dtype
    tm.assert_series_equal(res_ser, exp_ser)

    exp_dta = exp_ser._values

    res_index = idx.astype(dtype)
    exp_index = Index(exp_ser)
    assert exp_index.dtype == dtype
    tm.assert_index_equal(res_index, exp_index)

    res_dta = dta.astype(dtype)
    assert exp_dta.dtype == dtype
    tm.assert_extension_array_equal(res_dta, exp_dta)
|
||||
|
||||
@pytest.mark.parametrize("unit", ["ns"])
def test_astype_to_timedelta_unit_ns(self, unit):
    """Casting nanosecond timedelta data to the same dtype matches numpy."""
    # preserver the timedelta conversion
    # GH#19223
    target = f"m8[{unit}]"
    deltas = np.array([[1, 2, 3]], dtype=target)

    result = DataFrame(deltas).astype(target)
    expected = DataFrame(deltas.astype(target))

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"])
def test_astype_to_timedelta_unit(self, unit):
    """
    Cast timedelta data to its own unit at frame, series, index and array level.

    Units "us", "ms", "s" must be kept by the constructor and cast as a no-op;
    for the others the frame must hold "m8[s]" and every cast must raise
    ValueError.
    """
    # GH#19223 until 2.0 used to coerce to float
    dtype = f"m8[{unit}]"
    arr = np.array([[1, 2, 3]], dtype=dtype)
    df = DataFrame(arr)
    ser = df.iloc[:, 0]
    tdi = Index(ser)
    tda = tdi._values

    if unit not in ["us", "ms", "s"]:
        # We get the nearest supported unit, i.e. "s"
        assert (df.dtypes == "m8[s]").all()

        msg = (
            rf"Cannot convert from timedelta64\[s\] to timedelta64\[{unit}\]. "
            "Supported resolutions are 's', 'ms', 'us', 'ns'"
        )
        with pytest.raises(ValueError, match=msg):
            df.astype(dtype)
        with pytest.raises(ValueError, match=msg):
            ser.astype(dtype)
        with pytest.raises(ValueError, match=msg):
            tdi.astype(dtype)
        with pytest.raises(ValueError, match=msg):
            tda.astype(dtype)

        return

    assert (df.dtypes == dtype).all()

    # Cast once; the earlier duplicate call inside the branch was redundant.
    result = df.astype(dtype)
    # The conversion is a no-op, so we just get a copy
    expected = df
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
def test_astype_to_incorrect_datetimelike(self, unit):
    """Casting between datetime64 and timedelta64 raises in both directions."""
    # trying to astype a m to a M, or vice-versa
    # GH#19224
    datetime_dtype = f"M8[{unit}]"
    timedelta_dtype = f"m8[{unit}]"

    datetime_frame = DataFrame(np.array([[1, 2, 3]], dtype=datetime_dtype))
    # BlockManager path
    blockmanager_msg = rf"Cannot cast DatetimeArray to dtype timedelta64\[{unit}\]"
    # ArrayManager path
    arraymanager_msg = (
        "cannot astype a datetimelike from "
        rf"\[datetime64\[ns\]\] to \[timedelta64\[{unit}\]\]"
    )
    msg = "|".join([blockmanager_msg, arraymanager_msg])
    with pytest.raises(TypeError, match=msg):
        datetime_frame.astype(timedelta_dtype)

    # BlockManager path
    blockmanager_msg = rf"Cannot cast TimedeltaArray to dtype datetime64\[{unit}\]"
    # ArrayManager path
    arraymanager_msg = (
        "cannot astype a timedelta from "
        rf"\[timedelta64\[ns\]\] to \[datetime64\[{unit}\]\]"
    )
    msg = "|".join([blockmanager_msg, arraymanager_msg])
    timedelta_frame = DataFrame(np.array([[1, 2, 3]], dtype=timedelta_dtype))
    with pytest.raises(TypeError, match=msg):
        timedelta_frame.astype(datetime_dtype)
|
||||
|
||||
def test_astype_arg_for_errors(self):
    """An invalid ``errors`` value raises; errors="ignore" is accepted."""
    # GH#14878
    frame = DataFrame([1, 2, 3])

    msg = (
        "Expected value of kwarg 'errors' to be one of "
        "['raise', 'ignore']. Supplied value is 'True'"
    )
    # The message contains regex metacharacters, so escape it before matching.
    pattern = re.escape(msg)
    with pytest.raises(ValueError, match=pattern):
        frame.astype(np.float64, errors=True)

    frame.astype(np.int8, errors="ignore")
|
||||
|
||||
def test_astype_invalid_conversion(self):
    """A failed cast names the offending column in its error message."""
    # GH#47571
    frame = DataFrame({"a": [1, 2, "text"], "b": [1, 2, 3]})

    msg = (
        "invalid literal for int() with base 10: 'text': "
        "Error while type casting for column 'a'"
    )
    pattern = re.escape(msg)

    with pytest.raises(ValueError, match=pattern):
        frame.astype({"a": int})
|
||||
|
||||
def test_astype_arg_for_errors_dictlist(self):
|
||||
# GH#25905
|
||||
df = DataFrame(
|
||||
[
|
||||
{"a": "1", "b": "16.5%", "c": "test"},
|
||||
{"a": "2.2", "b": "15.3", "c": "another_test"},
|
||||
]
|
||||
)
|
||||
expected = DataFrame(
|
||||
[
|
||||
{"a": 1.0, "b": "16.5%", "c": "test"},
|
||||
{"a": 2.2, "b": "15.3", "c": "another_test"},
|
||||
]
|
||||
)
|
||||
expected["c"] = expected["c"].astype("object")
|
||||
type_dict = {"a": "float64", "b": "float64", "c": "object"}
|
||||
|
||||
result = df.astype(dtype=type_dict, errors="ignore")
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_astype_dt64tz(self, timezone_frame):
|
||||
# astype
|
||||
expected = np.array(
|
||||
[
|
||||
[
|
||||
Timestamp("2013-01-01 00:00:00"),
|
||||
Timestamp("2013-01-02 00:00:00"),
|
||||
Timestamp("2013-01-03 00:00:00"),
|
||||
],
|
||||
[
|
||||
Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
|
||||
NaT,
|
||||
Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
|
||||
],
|
||||
[
|
||||
Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
|
||||
NaT,
|
||||
Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
|
||||
],
|
||||
],
|
||||
dtype=object,
|
||||
).T
|
||||
expected = DataFrame(
|
||||
expected,
|
||||
index=timezone_frame.index,
|
||||
columns=timezone_frame.columns,
|
||||
dtype=object,
|
||||
)
|
||||
result = timezone_frame.astype(object)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
msg = "Cannot use .astype to convert from timezone-aware dtype to timezone-"
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
# dt64tz->dt64 deprecated
|
||||
timezone_frame.astype("datetime64[ns]")
|
||||
|
||||
def test_astype_dt64tz_to_str(self, timezone_frame):
    """Check ``astype(str)`` and ``str()`` output for a frame with tz-aware columns."""
    # str formatting
    result = timezone_frame.astype(str)
    # Expected cell text per row: naive date, then the two tz-aware columns;
    # missing values are rendered as the string "NaT".
    expected = DataFrame(
        [
            [
                "2013-01-01",
                "2013-01-01 00:00:00-05:00",
                "2013-01-01 00:00:00+01:00",
            ],
            ["2013-01-02", "NaT", "NaT"],
            [
                "2013-01-03",
                "2013-01-03 00:00:00-05:00",
                "2013-01-03 00:00:00+01:00",
            ],
        ],
        columns=timezone_frame.columns,
        dtype="object",
    )
    tm.assert_frame_equal(result, expected)

    # Widen the display limit so str() is not column-truncated.
    with option_context("display.max_columns", 20):
        result = str(timezone_frame)
        # NOTE(review): these fragments are matched verbatim against the
        # column-aligned repr, so the number of spaces matters. The literals
        # below contain single spaces only — confirm the padding against the
        # upstream file before relying on them.
        assert (
            "0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00"
        ) in result
        assert (
            "1 2013-01-02 NaT NaT"
        ) in result
        assert (
            "2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00"
        ) in result
|
||||
|
||||
def test_astype_empty_dtype_dict(self):
|
||||
# issue mentioned further down in the following issue's thread
|
||||
# https://github.com/pandas-dev/pandas/issues/33113
|
||||
df = DataFrame()
|
||||
result = df.astype({})
|
||||
tm.assert_frame_equal(result, df)
|
||||
assert result is not df
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data, dtype",
|
||||
[
|
||||
(["x", "y", "z"], "string[python]"),
|
||||
pytest.param(
|
||||
["x", "y", "z"],
|
||||
"string[pyarrow]",
|
||||
marks=td.skip_if_no("pyarrow"),
|
||||
),
|
||||
(["x", "y", "z"], "category"),
|
||||
(3 * [Timestamp("2020-01-01", tz="UTC")], None),
|
||||
(3 * [Interval(0, 1)], None),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("errors", ["raise", "ignore"])
|
||||
def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors):
|
||||
# https://github.com/pandas-dev/pandas/issues/35471
|
||||
df = DataFrame(Series(data, dtype=dtype))
|
||||
if errors == "ignore":
|
||||
expected = df
|
||||
result = df.astype(float, errors=errors)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
else:
|
||||
msg = "(Cannot cast)|(could not convert)"
|
||||
with pytest.raises((ValueError, TypeError), match=msg):
|
||||
df.astype(float, errors=errors)
|
||||
|
||||
def test_astype_tz_conversion(self):
|
||||
# GH 35973
|
||||
val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")}
|
||||
df = DataFrame(val)
|
||||
result = df.astype({"tz": "datetime64[ns, Europe/Berlin]"})
|
||||
|
||||
expected = df
|
||||
expected["tz"] = expected["tz"].dt.tz_convert("Europe/Berlin")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("tz", ["UTC", "Europe/Berlin"])
|
||||
def test_astype_tz_object_conversion(self, tz):
|
||||
# GH 35973
|
||||
val = {"tz": date_range("2020-08-30", freq="d", periods=2, tz="Europe/London")}
|
||||
expected = DataFrame(val)
|
||||
|
||||
# convert expected to object dtype from other tz str (independently tested)
|
||||
result = expected.astype({"tz": f"datetime64[ns, {tz}]"})
|
||||
result = result.astype({"tz": "object"})
|
||||
|
||||
# do real test: object dtype to a specified tz, different from construction tz.
|
||||
result = result.astype({"tz": "datetime64[ns, Europe/London]"})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_astype_dt64_to_string(
|
||||
self, frame_or_series, tz_naive_fixture, using_infer_string
|
||||
):
|
||||
# GH#41409
|
||||
tz = tz_naive_fixture
|
||||
|
||||
dti = date_range("2016-01-01", periods=3, tz=tz)
|
||||
dta = dti._data
|
||||
dta[0] = NaT
|
||||
|
||||
obj = frame_or_series(dta)
|
||||
result = obj.astype("string")
|
||||
|
||||
# Check that Series/DataFrame.astype matches DatetimeArray.astype
|
||||
expected = frame_or_series(dta.astype("string"))
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
item = result.iloc[0]
|
||||
if frame_or_series is DataFrame:
|
||||
item = item.iloc[0]
|
||||
if using_infer_string:
|
||||
assert item is np.nan
|
||||
else:
|
||||
assert item is pd.NA
|
||||
|
||||
# For non-NA values, we should match what we get for non-EA str
|
||||
alt = obj.astype(str)
|
||||
assert np.all(alt.iloc[1:] == result.iloc[1:])
|
||||
|
||||
def test_astype_td64_to_string(self, frame_or_series):
|
||||
# GH#41409
|
||||
tdi = pd.timedelta_range("1 Day", periods=3)
|
||||
obj = frame_or_series(tdi)
|
||||
|
||||
expected = frame_or_series(["1 days", "2 days", "3 days"], dtype="string")
|
||||
result = obj.astype("string")
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_astype_bytes(self):
|
||||
# GH#39474
|
||||
result = DataFrame(["foo", "bar", "baz"]).astype(bytes)
|
||||
assert result.dtypes[0] == np.dtype("S3")
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"index_slice",
|
||||
[
|
||||
np.s_[:2, :2],
|
||||
np.s_[:1, :2],
|
||||
np.s_[:2, :1],
|
||||
np.s_[::2, ::2],
|
||||
np.s_[::1, ::2],
|
||||
np.s_[::2, ::1],
|
||||
],
|
||||
)
|
||||
def test_astype_noncontiguous(self, index_slice):
|
||||
# GH#42396
|
||||
data = np.arange(16).reshape(4, 4)
|
||||
df = DataFrame(data)
|
||||
|
||||
result = df.iloc[index_slice].astype("int16")
|
||||
expected = df.iloc[index_slice]
|
||||
tm.assert_frame_equal(result, expected, check_dtype=False)
|
||||
|
||||
def test_astype_retain_attrs(self, any_numpy_dtype):
|
||||
# GH#44414
|
||||
df = DataFrame({"a": [0, 1, 2], "b": [3, 4, 5]})
|
||||
df.attrs["Location"] = "Michigan"
|
||||
|
||||
result = df.astype({"a": any_numpy_dtype}).attrs
|
||||
expected = df.attrs
|
||||
|
||||
tm.assert_dict_equal(expected, result)
|
||||
|
||||
|
||||
class TestAstypeCategorical:
|
||||
def test_astype_from_categorical3(self):
|
||||
df = DataFrame({"cats": [1, 2, 3, 4, 5, 6], "vals": [1, 2, 3, 4, 5, 6]})
|
||||
cats = Categorical([1, 2, 3, 4, 5, 6])
|
||||
exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
|
||||
df["cats"] = df["cats"].astype("category")
|
||||
tm.assert_frame_equal(exp_df, df)
|
||||
|
||||
def test_astype_from_categorical4(self):
|
||||
df = DataFrame(
|
||||
{"cats": ["a", "b", "b", "a", "a", "d"], "vals": [1, 2, 3, 4, 5, 6]}
|
||||
)
|
||||
cats = Categorical(["a", "b", "b", "a", "a", "d"])
|
||||
exp_df = DataFrame({"cats": cats, "vals": [1, 2, 3, 4, 5, 6]})
|
||||
df["cats"] = df["cats"].astype("category")
|
||||
tm.assert_frame_equal(exp_df, df)
|
||||
|
||||
def test_categorical_astype_to_int(self, any_int_dtype):
|
||||
# GH#39402
|
||||
|
||||
df = DataFrame(data={"col1": pd.array([2.0, 1.0, 3.0])})
|
||||
df.col1 = df.col1.astype("category")
|
||||
df.col1 = df.col1.astype(any_int_dtype)
|
||||
expected = DataFrame({"col1": pd.array([2, 1, 3], dtype=any_int_dtype)})
|
||||
tm.assert_frame_equal(df, expected)
|
||||
|
||||
def test_astype_categorical_to_string_missing(self):
|
||||
# https://github.com/pandas-dev/pandas/issues/41797
|
||||
df = DataFrame(["a", "b", np.nan])
|
||||
expected = df.astype(str)
|
||||
cat = df.astype("category")
|
||||
result = cat.astype(str)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
class IntegerArrayNoCopy(pd.core.arrays.IntegerArray):
    """IntegerArray subclass whose ``copy`` always fails (GH 42501).

    Any call to ``copy`` raises, so a test using this array type fails if the
    code under test copies it.
    """

    def copy(self):
        """Raise ``AssertionError`` unconditionally.

        Raises
        ------
        AssertionError
            Always.
        """
        # ``assert False`` is removed under ``python -O``, which would turn
        # this guard into a silent ``return None``; raise explicitly instead.
        raise AssertionError("IntegerArrayNoCopy.copy() must not be called")
|
||||
|
||||
|
||||
class Int16DtypeNoCopy(pd.Int16Dtype):
    """``Int16Dtype`` variant backed by ``IntegerArrayNoCopy`` (GH 42501)."""

    @classmethod
    def construct_array_type(cls):
        """Return the array class associated with this dtype."""
        array_type = IntegerArrayNoCopy
        return array_type
|
||||
|
||||
|
||||
def test_frame_astype_no_copy():
    """``astype(..., copy=False)`` casts column "a" and shares memory for "b" (GH 42501)."""
    source = DataFrame({"a": [1, 4, None, 5], "b": [6, 7, 8, 9]}, dtype=object)
    converted = source.astype({"a": Int16DtypeNoCopy()}, copy=False)

    assert converted.a.dtype == pd.Int16Dtype()
    # Column "b" is not in the dtype mapping and must not have been copied.
    untouched_before = source.b.values
    untouched_after = converted.b.values
    assert np.shares_memory(untouched_before, untouched_after)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", ["int64", "Int64"])
def test_astype_copies(dtype):
    """With ``copy=True`` a pyarrow cast is unaffected by later writes to the source (GH#50984)."""
    pytest.importorskip("pyarrow")
    source = DataFrame({"a": [1, 2, 3]}, dtype=dtype)
    converted = source.astype("int64[pyarrow]", copy=True)

    # Mutate the source after the cast; the converted frame must not change.
    source.iloc[0, 0] = 100

    expected = DataFrame({"a": [1, 2, 3]}, dtype="int64[pyarrow]")
    tm.assert_frame_equal(converted, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("val", [None, 1, 1.5, np.nan, NaT])
|
||||
def test_astype_to_string_not_modifying_input(string_storage, val):
|
||||
# GH#51073
|
||||
df = DataFrame({"a": ["a", "b", val]})
|
||||
expected = df.copy()
|
||||
with option_context("mode.string_storage", string_storage):
|
||||
df.astype("string", copy=False)
|
||||
tm.assert_frame_equal(df, expected)
|
@ -0,0 +1,132 @@
|
||||
from datetime import time
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import pytz
|
||||
|
||||
from pandas._libs.tslibs import timezones
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestAtTime:
|
||||
@pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"])
|
||||
def test_localized_at_time(self, tzstr, frame_or_series):
|
||||
tz = timezones.maybe_get_tz(tzstr)
|
||||
|
||||
rng = date_range("4/16/2012", "5/1/2012", freq="h")
|
||||
ts = frame_or_series(
|
||||
np.random.default_rng(2).standard_normal(len(rng)), index=rng
|
||||
)
|
||||
|
||||
ts_local = ts.tz_localize(tzstr)
|
||||
|
||||
result = ts_local.at_time(time(10, 0))
|
||||
expected = ts.at_time(time(10, 0)).tz_localize(tzstr)
|
||||
tm.assert_equal(result, expected)
|
||||
assert timezones.tz_compare(result.index.tz, tz)
|
||||
|
||||
def test_at_time(self, frame_or_series):
|
||||
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
|
||||
ts = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng
|
||||
)
|
||||
ts = tm.get_obj(ts, frame_or_series)
|
||||
rs = ts.at_time(rng[1])
|
||||
assert (rs.index.hour == rng[1].hour).all()
|
||||
assert (rs.index.minute == rng[1].minute).all()
|
||||
assert (rs.index.second == rng[1].second).all()
|
||||
|
||||
result = ts.at_time("9:30")
|
||||
expected = ts.at_time(time(9, 30))
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
def test_at_time_midnight(self, frame_or_series):
|
||||
# midnight, everything
|
||||
rng = date_range("1/1/2000", "1/31/2000")
|
||||
ts = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(rng), 3)), index=rng
|
||||
)
|
||||
ts = tm.get_obj(ts, frame_or_series)
|
||||
|
||||
result = ts.at_time(time(0, 0))
|
||||
tm.assert_equal(result, ts)
|
||||
|
||||
def test_at_time_nonexistent(self, frame_or_series):
|
||||
# time doesn't exist
|
||||
rng = date_range("1/1/2012", freq="23Min", periods=384)
|
||||
ts = DataFrame(np.random.default_rng(2).standard_normal(len(rng)), rng)
|
||||
ts = tm.get_obj(ts, frame_or_series)
|
||||
rs = ts.at_time("16:00")
|
||||
assert len(rs) == 0
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"hour", ["1:00", "1:00AM", time(1), time(1, tzinfo=pytz.UTC)]
|
||||
)
|
||||
def test_at_time_errors(self, hour):
|
||||
# GH#24043
|
||||
dti = date_range("2018", periods=3, freq="h")
|
||||
df = DataFrame(list(range(len(dti))), index=dti)
|
||||
if getattr(hour, "tzinfo", None) is None:
|
||||
result = df.at_time(hour)
|
||||
expected = df.iloc[1:2]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
else:
|
||||
with pytest.raises(ValueError, match="Index must be timezone"):
|
||||
df.at_time(hour)
|
||||
|
||||
def test_at_time_tz(self):
|
||||
# GH#24043
|
||||
dti = date_range("2018", periods=3, freq="h", tz="US/Pacific")
|
||||
df = DataFrame(list(range(len(dti))), index=dti)
|
||||
result = df.at_time(time(4, tzinfo=pytz.timezone("US/Eastern")))
|
||||
expected = df.iloc[1:2]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_at_time_raises(self, frame_or_series):
|
||||
# GH#20725
|
||||
obj = DataFrame([[1, 2, 3], [4, 5, 6]])
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
msg = "Index must be DatetimeIndex"
|
||||
with pytest.raises(TypeError, match=msg): # index is not a DatetimeIndex
|
||||
obj.at_time("00:00")
|
||||
|
||||
@pytest.mark.parametrize("axis", ["index", "columns", 0, 1])
|
||||
def test_at_time_axis(self, axis):
|
||||
# issue 8839
|
||||
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
|
||||
ts = DataFrame(np.random.default_rng(2).standard_normal((len(rng), len(rng))))
|
||||
ts.index, ts.columns = rng, rng
|
||||
|
||||
indices = rng[(rng.hour == 9) & (rng.minute == 30) & (rng.second == 0)]
|
||||
|
||||
if axis in ["index", 0]:
|
||||
expected = ts.loc[indices, :]
|
||||
elif axis in ["columns", 1]:
|
||||
expected = ts.loc[:, indices]
|
||||
|
||||
result = ts.at_time("9:30", axis=axis)
|
||||
|
||||
# Without clearing freq, result has freq 1440T and expected 5T
|
||||
result.index = result.index._with_freq(None)
|
||||
expected.index = expected.index._with_freq(None)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_at_time_datetimeindex(self):
|
||||
index = date_range("2012-01-01", "2012-01-05", freq="30min")
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(index), 5)), index=index
|
||||
)
|
||||
akey = time(12, 0, 0)
|
||||
ainds = [24, 72, 120, 168]
|
||||
|
||||
result = df.at_time(akey)
|
||||
expected = df.loc[akey]
|
||||
expected2 = df.iloc[ainds]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
tm.assert_frame_equal(result, expected2)
|
||||
assert len(result) == 4
|
@ -0,0 +1,227 @@
|
||||
from datetime import (
|
||||
datetime,
|
||||
time,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs.tslibs import timezones
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestBetweenTime:
|
||||
@td.skip_if_not_us_locale
|
||||
def test_between_time_formats(self, frame_or_series):
|
||||
# GH#11818
|
||||
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
|
||||
ts = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng
|
||||
)
|
||||
ts = tm.get_obj(ts, frame_or_series)
|
||||
|
||||
strings = [
|
||||
("2:00", "2:30"),
|
||||
("0200", "0230"),
|
||||
("2:00am", "2:30am"),
|
||||
("0200am", "0230am"),
|
||||
("2:00:00", "2:30:00"),
|
||||
("020000", "023000"),
|
||||
("2:00:00am", "2:30:00am"),
|
||||
("020000am", "023000am"),
|
||||
]
|
||||
expected_length = 28
|
||||
|
||||
for time_string in strings:
|
||||
assert len(ts.between_time(*time_string)) == expected_length
|
||||
|
||||
@pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"])
|
||||
def test_localized_between_time(self, tzstr, frame_or_series):
|
||||
tz = timezones.maybe_get_tz(tzstr)
|
||||
|
||||
rng = date_range("4/16/2012", "5/1/2012", freq="h")
|
||||
ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
|
||||
if frame_or_series is DataFrame:
|
||||
ts = ts.to_frame()
|
||||
|
||||
ts_local = ts.tz_localize(tzstr)
|
||||
|
||||
t1, t2 = time(10, 0), time(11, 0)
|
||||
result = ts_local.between_time(t1, t2)
|
||||
expected = ts.between_time(t1, t2).tz_localize(tzstr)
|
||||
tm.assert_equal(result, expected)
|
||||
assert timezones.tz_compare(result.index.tz, tz)
|
||||
|
||||
def test_between_time_types(self, frame_or_series):
|
||||
# GH11818
|
||||
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
|
||||
obj = DataFrame({"A": 0}, index=rng)
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
|
||||
msg = r"Cannot convert arg \[datetime\.datetime\(2010, 1, 2, 1, 0\)\] to a time"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
obj.between_time(datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5))
|
||||
|
||||
def test_between_time(self, inclusive_endpoints_fixture, frame_or_series):
|
||||
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
|
||||
ts = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng
|
||||
)
|
||||
ts = tm.get_obj(ts, frame_or_series)
|
||||
|
||||
stime = time(0, 0)
|
||||
etime = time(1, 0)
|
||||
inclusive = inclusive_endpoints_fixture
|
||||
|
||||
filtered = ts.between_time(stime, etime, inclusive=inclusive)
|
||||
exp_len = 13 * 4 + 1
|
||||
|
||||
if inclusive in ["right", "neither"]:
|
||||
exp_len -= 5
|
||||
if inclusive in ["left", "neither"]:
|
||||
exp_len -= 4
|
||||
|
||||
assert len(filtered) == exp_len
|
||||
for rs in filtered.index:
|
||||
t = rs.time()
|
||||
if inclusive in ["left", "both"]:
|
||||
assert t >= stime
|
||||
else:
|
||||
assert t > stime
|
||||
|
||||
if inclusive in ["right", "both"]:
|
||||
assert t <= etime
|
||||
else:
|
||||
assert t < etime
|
||||
|
||||
result = ts.between_time("00:00", "01:00")
|
||||
expected = ts.between_time(stime, etime)
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
# across midnight
|
||||
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
|
||||
ts = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng
|
||||
)
|
||||
ts = tm.get_obj(ts, frame_or_series)
|
||||
stime = time(22, 0)
|
||||
etime = time(9, 0)
|
||||
|
||||
filtered = ts.between_time(stime, etime, inclusive=inclusive)
|
||||
exp_len = (12 * 11 + 1) * 4 + 1
|
||||
if inclusive in ["right", "neither"]:
|
||||
exp_len -= 4
|
||||
if inclusive in ["left", "neither"]:
|
||||
exp_len -= 4
|
||||
|
||||
assert len(filtered) == exp_len
|
||||
for rs in filtered.index:
|
||||
t = rs.time()
|
||||
if inclusive in ["left", "both"]:
|
||||
assert (t >= stime) or (t <= etime)
|
||||
else:
|
||||
assert (t > stime) or (t <= etime)
|
||||
|
||||
if inclusive in ["right", "both"]:
|
||||
assert (t <= etime) or (t >= stime)
|
||||
else:
|
||||
assert (t < etime) or (t >= stime)
|
||||
|
||||
def test_between_time_raises(self, frame_or_series):
|
||||
# GH#20725
|
||||
obj = DataFrame([[1, 2, 3], [4, 5, 6]])
|
||||
obj = tm.get_obj(obj, frame_or_series)
|
||||
|
||||
msg = "Index must be DatetimeIndex"
|
||||
with pytest.raises(TypeError, match=msg): # index is not a DatetimeIndex
|
||||
obj.between_time(start_time="00:00", end_time="12:00")
|
||||
|
||||
def test_between_time_axis(self, frame_or_series):
|
||||
# GH#8839
|
||||
rng = date_range("1/1/2000", periods=100, freq="10min")
|
||||
ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
|
||||
if frame_or_series is DataFrame:
|
||||
ts = ts.to_frame()
|
||||
|
||||
stime, etime = ("08:00:00", "09:00:00")
|
||||
expected_length = 7
|
||||
|
||||
assert len(ts.between_time(stime, etime)) == expected_length
|
||||
assert len(ts.between_time(stime, etime, axis=0)) == expected_length
|
||||
msg = f"No axis named {ts.ndim} for object type {type(ts).__name__}"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
ts.between_time(stime, etime, axis=ts.ndim)
|
||||
|
||||
def test_between_time_axis_aliases(self, axis):
|
||||
# GH#8839
|
||||
rng = date_range("1/1/2000", periods=100, freq="10min")
|
||||
ts = DataFrame(np.random.default_rng(2).standard_normal((len(rng), len(rng))))
|
||||
stime, etime = ("08:00:00", "09:00:00")
|
||||
exp_len = 7
|
||||
|
||||
if axis in ["index", 0]:
|
||||
ts.index = rng
|
||||
assert len(ts.between_time(stime, etime)) == exp_len
|
||||
assert len(ts.between_time(stime, etime, axis=0)) == exp_len
|
||||
|
||||
if axis in ["columns", 1]:
|
||||
ts.columns = rng
|
||||
selected = ts.between_time(stime, etime, axis=1).columns
|
||||
assert len(selected) == exp_len
|
||||
|
||||
def test_between_time_axis_raises(self, axis):
|
||||
# issue 8839
|
||||
rng = date_range("1/1/2000", periods=100, freq="10min")
|
||||
mask = np.arange(0, len(rng))
|
||||
rand_data = np.random.default_rng(2).standard_normal((len(rng), len(rng)))
|
||||
ts = DataFrame(rand_data, index=rng, columns=rng)
|
||||
stime, etime = ("08:00:00", "09:00:00")
|
||||
|
||||
msg = "Index must be DatetimeIndex"
|
||||
if axis in ["columns", 1]:
|
||||
ts.index = mask
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
ts.between_time(stime, etime)
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
ts.between_time(stime, etime, axis=0)
|
||||
|
||||
if axis in ["index", 0]:
|
||||
ts.columns = mask
|
||||
with pytest.raises(TypeError, match=msg):
|
||||
ts.between_time(stime, etime, axis=1)
|
||||
|
||||
def test_between_time_datetimeindex(self):
|
||||
index = date_range("2012-01-01", "2012-01-05", freq="30min")
|
||||
df = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(index), 5)), index=index
|
||||
)
|
||||
bkey = slice(time(13, 0, 0), time(14, 0, 0))
|
||||
binds = [26, 27, 28, 74, 75, 76, 122, 123, 124, 170, 171, 172]
|
||||
|
||||
result = df.between_time(bkey.start, bkey.stop)
|
||||
expected = df.loc[bkey]
|
||||
expected2 = df.iloc[binds]
|
||||
tm.assert_frame_equal(result, expected)
|
||||
tm.assert_frame_equal(result, expected2)
|
||||
assert len(result) == 12
|
||||
|
||||
def test_between_time_incorrect_arg_inclusive(self):
|
||||
# GH40245
|
||||
rng = date_range("1/1/2000", "1/5/2000", freq="5min")
|
||||
ts = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((len(rng), 2)), index=rng
|
||||
)
|
||||
|
||||
stime = time(0, 0)
|
||||
etime = time(1, 0)
|
||||
inclusive = "bad_string"
|
||||
msg = "Inclusive has to be either 'both', 'neither', 'left' or 'right'"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
ts.between_time(stime, etime, inclusive=inclusive)
|
@ -0,0 +1,199 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameClip:
|
||||
def test_clip(self, float_frame):
|
||||
median = float_frame.median().median()
|
||||
original = float_frame.copy()
|
||||
|
||||
double = float_frame.clip(upper=median, lower=median)
|
||||
assert not (double.values != median).any()
|
||||
|
||||
# Verify that float_frame was not changed inplace
|
||||
assert (float_frame.values == original.values).all()
|
||||
|
||||
def test_inplace_clip(self, float_frame):
|
||||
# GH#15388
|
||||
median = float_frame.median().median()
|
||||
frame_copy = float_frame.copy()
|
||||
|
||||
return_value = frame_copy.clip(upper=median, lower=median, inplace=True)
|
||||
assert return_value is None
|
||||
assert not (frame_copy.values != median).any()
|
||||
|
||||
def test_dataframe_clip(self):
|
||||
# GH#2747
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((1000, 2)))
|
||||
|
||||
for lb, ub in [(-1, 1), (1, -1)]:
|
||||
clipped_df = df.clip(lb, ub)
|
||||
|
||||
lb, ub = min(lb, ub), max(ub, lb)
|
||||
lb_mask = df.values <= lb
|
||||
ub_mask = df.values >= ub
|
||||
mask = ~lb_mask & ~ub_mask
|
||||
assert (clipped_df.values[lb_mask] == lb).all()
|
||||
assert (clipped_df.values[ub_mask] == ub).all()
|
||||
assert (clipped_df.values[mask] == df.values[mask]).all()
|
||||
|
||||
def test_clip_mixed_numeric(self):
|
||||
# clip on mixed integer or floats
|
||||
# GH#24162, clipping now preserves numeric types per column
|
||||
df = DataFrame({"A": [1, 2, 3], "B": [1.0, np.nan, 3.0]})
|
||||
result = df.clip(1, 2)
|
||||
expected = DataFrame({"A": [1, 2, 2], "B": [1.0, np.nan, 2.0]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame([[1, 2, 3.4], [3, 4, 5.6]], columns=["foo", "bar", "baz"])
|
||||
expected = df.dtypes
|
||||
result = df.clip(upper=3).dtypes
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("inplace", [True, False])
|
||||
def test_clip_against_series(self, inplace):
|
||||
# GH#6966
|
||||
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((1000, 2)))
|
||||
lb = Series(np.random.default_rng(2).standard_normal(1000))
|
||||
ub = lb + 1
|
||||
|
||||
original = df.copy()
|
||||
clipped_df = df.clip(lb, ub, axis=0, inplace=inplace)
|
||||
|
||||
if inplace:
|
||||
clipped_df = df
|
||||
|
||||
for i in range(2):
|
||||
lb_mask = original.iloc[:, i] <= lb
|
||||
ub_mask = original.iloc[:, i] >= ub
|
||||
mask = ~lb_mask & ~ub_mask
|
||||
|
||||
result = clipped_df.loc[lb_mask, i]
|
||||
tm.assert_series_equal(result, lb[lb_mask], check_names=False)
|
||||
assert result.name == i
|
||||
|
||||
result = clipped_df.loc[ub_mask, i]
|
||||
tm.assert_series_equal(result, ub[ub_mask], check_names=False)
|
||||
assert result.name == i
|
||||
|
||||
tm.assert_series_equal(clipped_df.loc[mask, i], df.loc[mask, i])
|
||||
|
||||
@pytest.mark.parametrize("inplace", [True, False])
|
||||
@pytest.mark.parametrize("lower", [[2, 3, 4], np.asarray([2, 3, 4])])
|
||||
@pytest.mark.parametrize(
|
||||
"axis,res",
|
||||
[
|
||||
(0, [[2.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 7.0, 7.0]]),
|
||||
(1, [[2.0, 3.0, 4.0], [4.0, 5.0, 6.0], [5.0, 6.0, 7.0]]),
|
||||
],
|
||||
)
|
||||
def test_clip_against_list_like(self, inplace, lower, axis, res):
|
||||
# GH#15390
|
||||
arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
|
||||
|
||||
original = DataFrame(
|
||||
arr, columns=["one", "two", "three"], index=["a", "b", "c"]
|
||||
)
|
||||
|
||||
result = original.clip(lower=lower, upper=[5, 6, 7], axis=axis, inplace=inplace)
|
||||
|
||||
expected = DataFrame(res, columns=original.columns, index=original.index)
|
||||
if inplace:
|
||||
result = original
|
||||
tm.assert_frame_equal(result, expected, check_exact=True)
|
||||
|
||||
@pytest.mark.parametrize("axis", [0, 1, None])
|
||||
def test_clip_against_frame(self, axis):
|
||||
df = DataFrame(np.random.default_rng(2).standard_normal((1000, 2)))
|
||||
lb = DataFrame(np.random.default_rng(2).standard_normal((1000, 2)))
|
||||
ub = lb + 1
|
||||
|
||||
clipped_df = df.clip(lb, ub, axis=axis)
|
||||
|
||||
lb_mask = df <= lb
|
||||
ub_mask = df >= ub
|
||||
mask = ~lb_mask & ~ub_mask
|
||||
|
||||
tm.assert_frame_equal(clipped_df[lb_mask], lb[lb_mask])
|
||||
tm.assert_frame_equal(clipped_df[ub_mask], ub[ub_mask])
|
||||
tm.assert_frame_equal(clipped_df[mask], df[mask])
|
||||
|
||||
def test_clip_against_unordered_columns(self):
|
||||
# GH#20911
|
||||
df1 = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((1000, 4)),
|
||||
columns=["A", "B", "C", "D"],
|
||||
)
|
||||
df2 = DataFrame(
|
||||
np.random.default_rng(2).standard_normal((1000, 4)),
|
||||
columns=["D", "A", "B", "C"],
|
||||
)
|
||||
df3 = DataFrame(df2.values - 1, columns=["B", "D", "C", "A"])
|
||||
result_upper = df1.clip(lower=0, upper=df2)
|
||||
expected_upper = df1.clip(lower=0, upper=df2[df1.columns])
|
||||
result_lower = df1.clip(lower=df3, upper=3)
|
||||
expected_lower = df1.clip(lower=df3[df1.columns], upper=3)
|
||||
result_lower_upper = df1.clip(lower=df3, upper=df2)
|
||||
expected_lower_upper = df1.clip(lower=df3[df1.columns], upper=df2[df1.columns])
|
||||
tm.assert_frame_equal(result_upper, expected_upper)
|
||||
tm.assert_frame_equal(result_lower, expected_lower)
|
||||
tm.assert_frame_equal(result_lower_upper, expected_lower_upper)
|
||||
|
||||
def test_clip_with_na_args(self, float_frame):
|
||||
"""Should process np.nan argument as None"""
|
||||
# GH#17276
|
||||
tm.assert_frame_equal(float_frame.clip(np.nan), float_frame)
|
||||
tm.assert_frame_equal(float_frame.clip(upper=np.nan, lower=np.nan), float_frame)
|
||||
|
||||
# GH#19992 and adjusted in GH#40420
|
||||
df = DataFrame({"col_0": [1, 2, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]})
|
||||
|
||||
msg = "Downcasting behavior in Series and DataFrame methods 'where'"
|
||||
# TODO: avoid this warning here? seems like we should never be upcasting
|
||||
# in the first place?
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = df.clip(lower=[4, 5, np.nan], axis=0)
|
||||
expected = DataFrame(
|
||||
{"col_0": [4, 5, 3], "col_1": [4, 5, 6], "col_2": [7, 8, 9]}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.clip(lower=[4, 5, np.nan], axis=1)
|
||||
expected = DataFrame(
|
||||
{"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [7, 8, 9]}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# GH#40420
|
||||
data = {"col_0": [9, -3, 0, -1, 5], "col_1": [-2, -7, 6, 8, -5]}
|
||||
df = DataFrame(data)
|
||||
t = Series([2, -4, np.nan, 6, 3])
|
||||
with tm.assert_produces_warning(FutureWarning, match=msg):
|
||||
result = df.clip(lower=t, axis=0)
|
||||
expected = DataFrame({"col_0": [9, -3, 0, 6, 5], "col_1": [2, -4, 6, 8, 3]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_clip_int_data_with_float_bound(self):
|
||||
# GH51472
|
||||
df = DataFrame({"a": [1, 2, 3]})
|
||||
result = df.clip(lower=1.5)
|
||||
expected = DataFrame({"a": [1.5, 2.0, 3.0]})
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_clip_with_list_bound(self):
|
||||
# GH#54817
|
||||
df = DataFrame([1, 5])
|
||||
expected = DataFrame([3, 5])
|
||||
result = df.clip([3])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame([1, 3])
|
||||
result = df.clip(upper=[3])
|
||||
tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,47 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestCombine:
|
||||
@pytest.mark.parametrize(
|
||||
"data",
|
||||
[
|
||||
pd.date_range("2000", periods=4),
|
||||
pd.date_range("2000", periods=4, tz="US/Central"),
|
||||
pd.period_range("2000", periods=4),
|
||||
pd.timedelta_range(0, periods=4),
|
||||
],
|
||||
)
|
||||
def test_combine_datetlike_udf(self, data):
|
||||
# GH#23079
|
||||
df = pd.DataFrame({"A": data})
|
||||
other = df.copy()
|
||||
df.iloc[1, 0] = None
|
||||
|
||||
def combiner(a, b):
|
||||
return b
|
||||
|
||||
result = df.combine(other, combiner)
|
||||
tm.assert_frame_equal(result, other)
|
||||
|
||||
def test_combine_generic(self, float_frame):
|
||||
df1 = float_frame
|
||||
df2 = float_frame.loc[float_frame.index[:-5], ["A", "B", "C"]]
|
||||
|
||||
combined = df1.combine(df2, np.add)
|
||||
combined2 = df2.combine(df1, np.add)
|
||||
assert combined["D"].isna().all()
|
||||
assert combined2["D"].isna().all()
|
||||
|
||||
chunk = combined.loc[combined.index[:-5], ["A", "B", "C"]]
|
||||
chunk2 = combined2.loc[combined2.index[:-5], ["A", "B", "C"]]
|
||||
|
||||
exp = (
|
||||
float_frame.loc[float_frame.index[:-5], ["A", "B", "C"]].reindex_like(chunk)
|
||||
* 2
|
||||
)
|
||||
tm.assert_frame_equal(chunk, exp)
|
||||
tm.assert_frame_equal(chunk2, exp)
|
@ -0,0 +1,556 @@
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes.cast import find_common_type
|
||||
from pandas.core.dtypes.common import is_dtype_equal
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameCombineFirst:
    """Tests for ``DataFrame.combine_first``."""

    def test_combine_first_mixed(self):
        """Frames with string and integer columns on disjoint indexes."""
        left = DataFrame(
            {
                "A": Series(["a", "b"], index=range(2)),
                "B": Series(range(2), index=range(2)),
            }
        )
        right = DataFrame(
            {
                "A": Series(["a", "b"], index=range(5, 7)),
                "B": Series(range(2), index=range(5, 7)),
            }
        )

        expected = DataFrame(
            {"A": list("abab"), "B": [0, 1, 0, 1]}, index=[0, 1, 5, 6]
        )
        tm.assert_frame_equal(left.combine_first(right), expected)

    def test_combine_first(self, float_frame, using_infer_string):
        """Disjoint, same-index, overlapping and empty operands."""
        # disjoint
        head, tail = float_frame[:5], float_frame[5:]

        combined = head.combine_first(tail)
        reordered_frame = float_frame.reindex(combined.index)
        tm.assert_frame_equal(combined, reordered_frame)
        tm.assert_index_equal(combined.columns, float_frame.columns)
        tm.assert_series_equal(combined["A"], reordered_frame["A"])

        # same index
        without_c = float_frame.copy()
        without_c["A"] = 1
        del without_c["C"]

        without_d = float_frame.copy()
        without_d["B"] = 0
        del without_d["D"]

        combined = without_c.combine_first(without_d)

        assert (combined["A"] == 1).all()
        tm.assert_series_equal(combined["B"], without_c["B"])
        tm.assert_series_equal(combined["C"], without_d["C"])
        tm.assert_series_equal(combined["D"], without_c["D"])

        # overlap
        head, tail = reordered_frame[:10].copy(), reordered_frame
        head["A"] = 1

        combined = head.combine_first(tail)
        assert (combined["A"][:10] == 1).all()

        # reverse overlap
        a_position = tail.columns.get_loc("A")
        tail.iloc[:10, a_position] = 0
        combined = tail.combine_first(head)
        assert (combined["A"][:10] == 0).all()

        # no overlap
        first_rows = float_frame[:10]
        remaining_rows = float_frame[10:]
        combined = first_rows.combine_first(remaining_rows)
        for part in (first_rows, remaining_rows):
            tm.assert_series_equal(combined["A"].reindex(part.index), part["A"])

        # corner cases
        warning = FutureWarning if using_infer_string else None
        with tm.assert_produces_warning(warning, match="empty entries"):
            comb = float_frame.combine_first(DataFrame())
        tm.assert_frame_equal(comb, float_frame)

        comb = DataFrame().combine_first(float_frame)
        tm.assert_frame_equal(comb, float_frame.sort_index())

        comb = float_frame.combine_first(DataFrame(index=["faz", "boo"]))
        assert "faz" in comb.index

        # #2525
        dated = DataFrame({"a": [1]}, index=[datetime(2012, 1, 1)])
        only_columns = DataFrame(columns=["b"])
        result = dated.combine_first(only_columns)
        assert "b" in result

    def test_combine_first_mixed_bug(self):
        """Combining frames with partly shared columns yields five columns."""
        idx1 = Index(["a", "b", "c", "e"])
        frame1 = DataFrame(
            {
                "col0": Series([5.0, -9.0, 4.0, 100.0], index=idx1),
                "col2": Series(["a", "b", "c", "e"], index=idx1),
                "col3": Series([12, 4, 5, 97], index=idx1),
            }
        )

        idx2 = Index(["a", "b", "c", "f"])
        frame2 = DataFrame(
            {
                "col1": Series([5.0, -9.0, 4.0, 100.0], index=idx2),
                "col2": Series(["a", "b", "c", "f"], index=idx2),
                "col5": Series([12, 4, 5, 97], index=idx2),
            }
        )

        combined = frame1.combine_first(frame2)
        assert len(combined.columns) == 5

    def test_combine_first_same_as_in_update(self):
        """Only missing cells are taken from ``other``; bool columns survive."""
        # gh 3016 (same as in update)
        df = DataFrame(
            [[1.0, 2.0, False, True], [4.0, 5.0, True, False]],
            columns=["A", "B", "bool1", "bool2"],
        )
        other = DataFrame([[45, 45]], index=[0], columns=["A", "B"])

        # Nothing is missing, so ``other`` contributes nothing.
        tm.assert_frame_equal(df.combine_first(other), df)

        # Knock out one cell, combine, then set the value the fill should give.
        df.loc[0, "A"] = np.nan
        result = df.combine_first(other)
        df.loc[0, "A"] = 45
        tm.assert_frame_equal(result, df)

    def test_combine_first_doc_example(self):
        """The example used in the documentation."""
        # doc example
        df1 = DataFrame(
            {
                "A": [1.0, np.nan, 3.0, 5.0, np.nan],
                "B": [np.nan, 2.0, 3.0, np.nan, 6.0],
            }
        )
        df2 = DataFrame(
            {
                "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0],
                "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0],
            }
        )

        expected = DataFrame(
            {"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]}
        )
        tm.assert_frame_equal(df1.combine_first(df2), expected)

    def test_combine_first_return_obj_type_with_bools(self):
        """The bool column comes back as bool in either combination order."""
        # GH3552
        df1 = DataFrame(
            [[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]]
        )
        df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2])

        expected = Series([True, True, False], name=2, dtype=bool)

        for first, second in ((df1, df2), (df2, df1)):
            tm.assert_series_equal(first.combine_first(second)[2], expected)

    @pytest.mark.parametrize(
        "data1, data2, data_expected",
        (
            (
                [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
                [pd.NaT, pd.NaT, pd.NaT],
                [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
            ),
            (
                [pd.NaT, pd.NaT, pd.NaT],
                [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
                [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
            ),
            (
                [datetime(2000, 1, 2), pd.NaT, pd.NaT],
                [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
                [datetime(2000, 1, 2), datetime(2000, 1, 2), datetime(2000, 1, 3)],
            ),
            (
                [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
                [datetime(2000, 1, 2), pd.NaT, pd.NaT],
                [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
            ),
        ),
    )
    def test_combine_first_convert_datatime_correctly(
        self, data1, data2, data_expected
    ):
        """Datetime / NaT columns combine into the expected datetime column."""
        # GH 3593
        left = DataFrame({"a": data1})
        right = DataFrame({"a": data2})
        expected = DataFrame({"a": data_expected})
        tm.assert_frame_equal(left.combine_first(right), expected)

    def test_combine_first_align_nan(self):
        """Dtypes after alignment introduces missing rows."""
        # GH 7509 (not fixed)
        dfa = DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"])
        dfb = DataFrame([[4], [5]], columns=["b"])
        assert dfa["a"].dtype == "datetime64[ns]"
        assert dfa["b"].dtype == "int64"

        res = dfa.combine_first(dfb)
        exp = DataFrame(
            {"a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2, 5]},
            columns=["a", "b"],
        )
        tm.assert_frame_equal(res, exp)
        assert res["a"].dtype == "datetime64[ns]"
        # the fully populated integer column is checked to be int64
        assert res["b"].dtype == "int64"

        # Same combination, starting from a zero-row slice of ``dfa``.
        res = dfa.iloc[:0].combine_first(dfb)
        exp = DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"])
        tm.assert_frame_equal(res, exp)
        # TODO: this must be datetime64
        assert res["a"].dtype == "float64"
        # the integer column is checked to be int64 here as well
        assert res["b"].dtype == "int64"

    def test_combine_first_timezone(self, unit):
        """tz-aware UTC columns keep ``datetime64[unit, UTC]``."""
        # see gh-7630
        data1 = pd.to_datetime("20100101 01:01").tz_localize("UTC").as_unit(unit)
        df1 = DataFrame(
            columns=["UTCdatetime", "abc"],
            data=data1,
            index=pd.date_range("20140627", periods=1),
        )
        data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC").as_unit(unit)
        df2 = DataFrame(
            columns=["UTCdatetime", "xyz"],
            data=data2,
            index=pd.date_range("20140628", periods=1),
        )

        res = df2[["UTCdatetime"]].combine_first(df1)

        utc_dtype = f"datetime64[{unit}, UTC]"
        exp = DataFrame(
            {
                "UTCdatetime": [
                    pd.Timestamp("2010-01-01 01:01", tz="UTC"),
                    pd.Timestamp("2012-12-12 12:12", tz="UTC"),
                ],
                "abc": [pd.Timestamp("2010-01-01 01:01:00", tz="UTC"), pd.NaT],
            },
            columns=["UTCdatetime", "abc"],
            index=pd.date_range("20140627", periods=2, freq="D"),
            dtype=utc_dtype,
        )
        assert res["UTCdatetime"].dtype == utc_dtype
        assert res["abc"].dtype == utc_dtype

        tm.assert_frame_equal(res, exp)

    def test_combine_first_timezone2(self, unit):
        """A complete tz-aware column is returned unchanged."""
        # see gh-10567
        longer = pd.date_range("2015-01-01", "2015-01-05", tz="UTC", unit=unit)
        df1 = DataFrame({"DATE": longer})
        shorter = pd.date_range("2015-01-03", "2015-01-05", tz="UTC", unit=unit)
        df2 = DataFrame({"DATE": shorter})

        res = df1.combine_first(df2)
        tm.assert_frame_equal(res, df1)
        assert res["DATE"].dtype == f"datetime64[{unit}, UTC]"

    def test_combine_first_timezone3(self, unit):
        """tz-aware columns on partly overlapping integer indexes."""
        dts1 = pd.DatetimeIndex(
            ["2011-01-01", "NaT", "2011-01-03", "2011-01-04"], tz="US/Eastern"
        ).as_unit(unit)
        df1 = DataFrame({"DATE": dts1}, index=[1, 3, 5, 7])
        dts2 = pd.DatetimeIndex(
            ["2012-01-01", "2012-01-02", "2012-01-03"], tz="US/Eastern"
        ).as_unit(unit)
        df2 = DataFrame({"DATE": dts2}, index=[2, 4, 5])

        exp_values = [
            "2011-01-01",
            "2012-01-01",
            "NaT",
            "2012-01-02",
            "2011-01-03",
            "2011-01-04",
        ]
        exp_dts = pd.DatetimeIndex(exp_values, tz="US/Eastern").as_unit(unit)
        exp = DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7])
        tm.assert_frame_equal(df1.combine_first(df2), exp)

    # FIXME: parametrizing over unit breaks on non-nano
    def test_combine_first_timezone4(self):
        """With differing timezones, a complete left column keeps its dtype."""
        # different tz
        aware = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern")
        df1 = DataFrame({"DATE": aware})
        naive = pd.date_range("2015-01-03", "2015-01-05")
        df2 = DataFrame({"DATE": naive})

        # if df1 doesn't have NaN, keep its dtype
        res = df1.combine_first(df2)
        tm.assert_frame_equal(res, df1)
        assert res["DATE"].dtype == "datetime64[ns, US/Eastern]"

    def test_combine_first_timezone5(self, unit):
        """Mixing tz-aware and naive values gives an object column."""
        aware = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern", unit=unit)
        df1 = DataFrame({"DATE": aware})
        naive = pd.date_range("2015-01-01", "2015-01-03", unit=unit)
        df2 = DataFrame({"DATE": naive})

        res = df1.combine_first(df2)
        exp = DataFrame(
            {
                "DATE": [
                    pd.Timestamp("2015-01-01", tz="US/Eastern"),
                    pd.Timestamp("2015-01-02", tz="US/Eastern"),
                    pd.Timestamp("2015-01-03"),
                ]
            }
        )
        tm.assert_frame_equal(res, exp)
        assert res["DATE"].dtype == "object"

    def test_combine_first_timedelta(self):
        """Timedelta columns stay ``timedelta64[ns]``."""
        data1 = pd.TimedeltaIndex(["1 day", "NaT", "3 day", "4day"])
        df1 = DataFrame({"TD": data1}, index=[1, 3, 5, 7])
        data2 = pd.TimedeltaIndex(["10 day", "11 day", "12 day"])
        df2 = DataFrame({"TD": data2}, index=[2, 4, 5])

        res = df1.combine_first(df2)
        exp_values = ["1 day", "10 day", "NaT", "11 day", "3 day", "4 day"]
        exp = DataFrame(
            {"TD": pd.TimedeltaIndex(exp_values)}, index=[1, 2, 3, 4, 5, 7]
        )
        tm.assert_frame_equal(res, exp)
        assert res["TD"].dtype == "timedelta64[ns]"

    def test_combine_first_period(self):
        """Same-freq periods keep their dtype; differing freqs give object."""
        data1 = pd.PeriodIndex(["2011-01", "NaT", "2011-03", "2011-04"], freq="M")
        df1 = DataFrame({"P": data1}, index=[1, 3, 5, 7])
        data2 = pd.PeriodIndex(["2012-01-01", "2012-02", "2012-03"], freq="M")
        df2 = DataFrame({"P": data2}, index=[2, 4, 5])
        out_index = [1, 2, 3, 4, 5, 7]

        res = df1.combine_first(df2)
        exp_periods = pd.PeriodIndex(
            ["2011-01", "2012-01", "NaT", "2012-02", "2011-03", "2011-04"], freq="M"
        )
        exp = DataFrame({"P": exp_periods}, index=out_index)
        tm.assert_frame_equal(res, exp)
        assert res["P"].dtype == data1.dtype

        # different freq
        daily = pd.PeriodIndex(["2012-01-01", "2012-01-02", "2012-01-03"], freq="D")
        df2 = DataFrame({"P": daily}, index=[2, 4, 5])

        res = df1.combine_first(df2)
        exp_mixed = [
            pd.Period("2011-01", freq="M"),
            pd.Period("2012-01-01", freq="D"),
            pd.NaT,
            pd.Period("2012-01-02", freq="D"),
            pd.Period("2011-03", freq="M"),
            pd.Period("2011-04", freq="M"),
        ]
        exp = DataFrame({"P": exp_mixed}, index=out_index)
        tm.assert_frame_equal(res, exp)
        assert res["P"].dtype == "object"

    def test_combine_first_int(self):
        """int64 frames of different lengths, combined in both orders."""
        # GH14687 - integer series that do no align exactly
        df1 = DataFrame({"a": [0, 1, 3, 5]}, dtype="int64")
        df2 = DataFrame({"a": [1, 4]}, dtype="int64")

        expected_12 = DataFrame({"a": [0, 1, 3, 5]})
        tm.assert_frame_equal(df1.combine_first(df2), expected_12)

        expected_21 = DataFrame({"a": [1, 4, 3, 5]})
        tm.assert_frame_equal(df2.combine_first(df1), expected_21)

    @pytest.mark.parametrize("val", [1, 1.0])
    def test_combine_first_with_asymmetric_other(self, val):
        """Frames with no column in common."""
        # see gh-20699
        numeric = DataFrame({"isNum": [val]})
        boolean = DataFrame({"isBool": [True]})

        exp = DataFrame({"isBool": [True], "isNum": [val]})
        tm.assert_frame_equal(numeric.combine_first(boolean), exp)

    def test_combine_first_string_dtype_only_na(self, nullable_string_dtype):
        """Nullable-string frames indexed on a level holding only ``pd.NA``."""
        # GH: 37519
        keys = ["a", "b"]
        df = DataFrame(
            {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype
        ).set_index(keys)
        df2 = DataFrame(
            {"a": ["85"], "b": [pd.NA]}, dtype=nullable_string_dtype
        ).set_index(keys)

        result = df.combine_first(df2)
        expected = DataFrame(
            {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype=nullable_string_dtype
        ).set_index(keys)
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "scalar1, scalar2",
    [
        (datetime(2020, 1, 1), datetime(2020, 1, 2)),
        (pd.Period("2020-01-01", "D"), pd.Period("2020-01-02", "D")),
        (pd.Timedelta("89 days"), pd.Timedelta("60 min")),
        (pd.Interval(left=0, right=1), pd.Interval(left=2, right=3, closed="left")),
    ],
)
def test_combine_first_timestamp_bug(scalar1, scalar2, nulls_fixture):
    """Fill an all-null column "b" from a frame holding datetime-like scalars."""
    # GH28481
    na_value = nulls_fixture

    frame = DataFrame([[na_value, na_value]], columns=["a", "b"])
    other = DataFrame([[scalar1, scalar2]], columns=["b", "c"])

    left_dtype = frame.dtypes["b"]
    right_dtype = other.dtypes["b"]
    common_dtype = find_common_type([left_dtype, right_dtype])

    # The scalar is expected in "b" when the common dtype is object or both
    # sides already share a dtype; otherwise the null value is expected.
    expects_scalar = is_dtype_equal(common_dtype, "object") or (
        left_dtype == right_dtype
    )
    val = scalar1 if expects_scalar else na_value

    result = frame.combine_first(other)

    expected = DataFrame([[na_value, val, scalar2]], columns=["a", "b", "c"])
    expected["b"] = expected["b"].astype(common_dtype)

    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_combine_first_timestamp_bug_NaT():
    """An all-``NaT`` column "b" is filled with the datetime from ``other``."""
    # GH28481
    first_day = datetime(2020, 1, 1)
    second_day = datetime(2020, 1, 2)

    frame = DataFrame([[pd.NaT, pd.NaT]], columns=["a", "b"])
    other = DataFrame([[first_day, second_day]], columns=["b", "c"])

    expected = DataFrame(
        [[pd.NaT, first_day, second_day]], columns=["a", "b", "c"]
    )

    tm.assert_frame_equal(frame.combine_first(other), expected)
|
||||
|
||||
|
||||
def test_combine_first_with_nan_multiindex():
    """Combine frames whose MultiIndex contains a NaN label in one level."""
    # gh-36562
    level_names = ["a", "b"]

    mi1 = MultiIndex.from_arrays(
        [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=level_names
    )
    left = DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1)

    mi2 = MultiIndex.from_arrays(
        [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=level_names
    )
    right = DataFrame({"d": Series([1, 2, 3, 4, 5, 6], index=mi2)})

    res = left.combine_first(right)

    mi_expected = MultiIndex.from_arrays(
        [
            ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan],
            [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6],
        ],
        names=level_names,
    )
    expected = DataFrame(
        {
            "c": [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan, 1, np.nan, 1],
            "d": [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan],
        },
        index=mi_expected,
    )
    tm.assert_frame_equal(res, expected)
|
||||
|
||||
|
||||
def test_combine_preserve_dtypes():
    """Disjoint rows with one shared integer column "B" are combined."""
    # GH7509
    df1 = DataFrame(
        {
            "A": Series(["a", "b"], index=range(2)),
            "B": Series(range(2), index=range(2)),
        }
    )
    df2 = DataFrame(
        {
            "B": Series(range(-1, 1), index=range(5, 7)),
            "C": Series(["a", "b"], index=range(5, 7)),
        }
    )

    expected = DataFrame(
        {
            "A": ["a", "b", np.nan, np.nan],
            "B": [0, 1, -1, 0],
            "C": [np.nan, np.nan, "a", "b"],
        },
        index=[0, 1, 5, 6],
    )
    tm.assert_frame_equal(df1.combine_first(df2), expected)
|
||||
|
||||
|
||||
def test_combine_first_duplicates_rows_for_nan_index_values():
|
||||
# GH39881
|
||||
df1 = DataFrame(
|
||||
{"x": [9, 10, 11]},
|
||||
index=MultiIndex.from_arrays([[1, 2, 3], [np.nan, 5, 6]], names=["a", "b"]),
|
||||
)
|
||||
|
||||
df2 = DataFrame(
|
||||
{"y": [12, 13, 14]},
|
||||
index=MultiIndex.from_arrays([[1, 2, 4], [np.nan, 5, 7]], names=["a", "b"]),
|
||||
)
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"x": [9.0, 10.0, 11.0, np.nan],
|
||||
"y": [12.0, 13.0, np.nan, 14.0],
|
||||
},
|
||||
index=MultiIndex.from_arrays(
|
||||
[[1, 2, 3, 4], [np.nan, 5, 6, 7]], names=["a", "b"]
|
||||
),
|
||||
)
|
||||
combined = df1.combine_first(df2)
|
||||
tm.assert_frame_equal(combined, expected)
|
||||
|
||||
|
||||
def test_combine_first_int64_not_cast_to_float64():
    """Complete integer columns plus an extra column "C" from ``other``."""
    # GH 28613
    left = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    right = DataFrame({"A": [1, 20, 30], "B": [40, 50, 60], "C": [12, 34, 65]})

    expected = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [12, 34, 65]})
    tm.assert_frame_equal(left.combine_first(right), expected)
|
||||
|
||||
|
||||
def test_midx_losing_dtype():
    """Combine frames whose MultiIndex second level consists only of NaN."""
    # GH#49830
    nan_pair = [np.nan, np.nan]
    df1 = DataFrame(
        {"a": [None, 4]}, index=MultiIndex.from_arrays([[0, 0], nan_pair])
    )
    df2 = DataFrame(
        {"a": [3, 3]}, index=MultiIndex.from_arrays([[1, 1], nan_pair])
    )

    result = df1.combine_first(df2)

    expected_midx = MultiIndex.from_arrays(
        [[0, 0, 1, 1], [np.nan, np.nan, np.nan, np.nan]]
    )
    expected = DataFrame({"a": [np.nan, 4, 3, 3]}, index=expected_midx)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_combine_first_empty_columns():
|
||||
left = DataFrame(columns=["a", "b"])
|
||||
right = DataFrame(columns=["a", "c"])
|
||||
result = left.combine_first(right)
|
||||
expected = DataFrame(columns=["a", "b", "c"])
|
||||
tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,305 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat.numpy import np_version_gte1p25
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
@pytest.mark.parametrize("align_axis", [0, 1, "index", "columns"])
def test_compare_axis(align_axis):
    """``align_axis`` puts the self/other labels on the columns or the index."""
    # GH#30429
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    right = left.copy()
    right.loc[0, "col1"] = "c"
    right.loc[2, "col3"] = 4.0

    result = left.compare(right, align_axis=align_axis)

    if align_axis not in (1, "columns"):
        # self/other become the inner level of the row index
        expected = pd.DataFrame(
            [["a", np.nan], ["c", np.nan], [np.nan, 3.0], [np.nan, 4.0]],
            index=pd.MultiIndex.from_product([[0, 2], ["self", "other"]]),
            columns=pd.Index(["col1", "col3"]),
        )
    else:
        # self/other become the inner level of the columns
        expected = pd.DataFrame(
            [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, 4.0]],
            index=pd.Index([0, 2]),
            columns=pd.MultiIndex.from_product(
                [["col1", "col3"], ["self", "other"]]
            ),
        )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "keep_shape, keep_equal",
    [
        (True, False),
        (False, True),
        (True, True),
        # False, False case is already covered in test_compare_axis
    ],
)
def test_compare_various_formats(keep_shape, keep_equal):
    """Output layout of ``compare`` for ``keep_shape`` / ``keep_equal``."""
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    right = left.copy()
    right.loc[0, "col1"] = "c"
    right.loc[2, "col3"] = 4.0

    result = left.compare(right, keep_shape=keep_shape, keep_equal=keep_equal)

    sides = ["self", "other"]
    if not keep_shape:
        # Only differing rows/columns remain; equal cells keep their values.
        expected = pd.DataFrame(
            [["a", "c", 1.0, 1.0], ["c", "c", 3.0, 4.0]],
            index=pd.Index([0, 2]),
            columns=pd.MultiIndex.from_product([["col1", "col3"], sides]),
        )
    else:
        if keep_equal:
            rows = [
                ["a", "c", 1.0, 1.0, 1.0, 1.0],
                ["b", "b", 2.0, 2.0, 2.0, 2.0],
                ["c", "c", np.nan, np.nan, 3.0, 4.0],
            ]
        else:
            rows = [
                ["a", "c", np.nan, np.nan, np.nan, np.nan],
                [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
                [np.nan, np.nan, np.nan, np.nan, 3.0, 4.0],
            ]
        expected = pd.DataFrame(
            rows,
            index=pd.Index([0, 1, 2]),
            columns=pd.MultiIndex.from_product([["col1", "col2", "col3"], sides]),
        )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_compare_with_equal_nulls():
    """NaNs in the same position on both sides are not reported as a difference."""
    # We want to make sure two NaNs are considered the same
    # and dropped where applicable
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    right = left.copy()
    right.loc[0, "col1"] = "c"

    expected = pd.DataFrame(
        [["a", "c"]],
        index=pd.Index([0]),
        columns=pd.MultiIndex.from_product([["col1"], ["self", "other"]]),
    )
    tm.assert_frame_equal(left.compare(right), expected)
|
||||
|
||||
|
||||
def test_compare_with_non_equal_nulls():
    """A NaN present on one side only is kept in the comparison output."""
    # We want to make sure the relevant NaNs do not get dropped
    # even if the entire row or column are NaNs
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    right = left.copy()
    right.loc[0, "col1"] = "c"
    right.loc[2, "col3"] = np.nan

    result = left.compare(right)

    expected = pd.DataFrame(
        [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, np.nan]],
        index=pd.Index([0, 2]),
        columns=pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]]),
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("align_axis", [0, 1])
def test_compare_multi_index(align_axis):
    """``compare`` on frames with MultiIndex rows and columns."""
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}
    )
    left.columns = pd.MultiIndex.from_arrays(
        [["a", "a", "b"], ["col1", "col2", "col3"]]
    )
    left.index = pd.MultiIndex.from_arrays([["x", "x", "y"], [0, 1, 2]])

    right = left.copy()
    right.iloc[0, 0] = "c"
    right.iloc[2, 2] = 4.0

    result = left.compare(right, align_axis=align_axis)

    if align_axis != 0:
        # self/other appended as an extra column level
        indices = pd.MultiIndex.from_arrays([["x", "y"], [0, 2]])
        columns = pd.MultiIndex.from_arrays(
            [
                ["a", "a", "b", "b"],
                ["col1", "col1", "col3", "col3"],
                ["self", "other", "self", "other"],
            ]
        )
        data = [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, 4.0]]
    else:
        # self/other appended as an extra row-index level
        indices = pd.MultiIndex.from_arrays(
            [["x", "x", "y", "y"], [0, 0, 2, 2], ["self", "other", "self", "other"]]
        )
        columns = pd.MultiIndex.from_arrays([["a", "b"], ["col1", "col3"]])
        data = [["a", np.nan], ["c", np.nan], [np.nan, 3.0], [np.nan, 4.0]]

    expected = pd.DataFrame(data=data, index=indices, columns=columns)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_compare_unaligned_objects():
    """``DataFrame.compare`` raises ``ValueError`` for unaligned operands.

    Two situations are covered: frames of equal shape with different index
    labels, and frames of different shapes.
    """
    # Both cases are expected to produce the same message.
    msg = (
        r"Can only compare identically-labeled \(both index and columns\) DataFrame "
        "objects"
    )

    # The frames are built outside the ``raises`` blocks so that only the
    # ``compare`` call itself can satisfy the expectation.

    # test DataFrames with different indices
    df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"])
    df2 = pd.DataFrame([1, 2, 3], index=["a", "b", "d"])
    with pytest.raises(ValueError, match=msg):
        df1.compare(df2)

    # test DataFrames with different shapes
    df1 = pd.DataFrame(np.ones((3, 3)))
    df2 = pd.DataFrame(np.zeros((2, 1)))
    with pytest.raises(ValueError, match=msg):
        df1.compare(df2)
|
||||
|
||||
|
||||
def test_compare_result_names():
    """``result_names`` replaces the default self/other labels in the output."""
    # GH 44354
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
    )
    right = pd.DataFrame(
        {
            "col1": ["c", "b", "c"],
            "col2": [1.0, 2.0, np.nan],
            "col3": [1.0, 2.0, np.nan],
        },
    )

    expected = pd.DataFrame(
        {
            ("col1", "left"): {0: "a", 2: np.nan},
            ("col1", "right"): {0: "c", 2: np.nan},
            ("col3", "left"): {0: np.nan, 2: 3.0},
            ("col3", "right"): {0: np.nan, 2: np.nan},
        }
    )
    result = left.compare(right, result_names=("left", "right"))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "result_names",
    [
        [1, 2],
        "HK",
        {"2": 2, "3": 3},
        3,
        3.0,
    ],
)
def test_invalid_input_result_names(result_names):
    """Passing a non-tuple ``result_names`` raises ``TypeError``."""
    # GH 44354
    left = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
    )
    right = pd.DataFrame(
        {
            "col1": ["c", "b", "c"],
            "col2": [1.0, 2.0, np.nan],
            "col3": [1.0, 2.0, np.nan],
        },
    )

    # The expected message names the type of the rejected argument.
    msg = (
        f"Passing 'result_names' as a {type(result_names)} is not "
        "supported. Provide 'result_names' as a tuple instead."
    )
    with pytest.raises(TypeError, match=msg):
        left.compare(right, result_names=result_names)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "val1,val2",
    [(4, pd.NA), (pd.NA, pd.NA), (pd.NA, 4)],
)
def test_compare_ea_and_np_dtype(val1, val2):
    """Compare a list-built column against a nullable ``Int64`` column."""
    # GH 48966
    arr = [4.0, val1]
    ser = pd.Series([1, val2], dtype="Int64")

    untouched = [1.0, 2]
    df1 = pd.DataFrame({"a": arr, "b": untouched})
    df2 = pd.DataFrame({"a": ser, "b": untouched})
    expected = pd.DataFrame(
        {
            ("a", "self"): arr,
            ("a", "other"): ser,
            ("b", "self"): np.nan,
            ("b", "other"): np.nan,
        }
    )
    both_missing = val1 is pd.NA and val2 is pd.NA
    if both_missing:
        # GH#18463 TODO: is this really the desired behavior?
        expected.loc[1, ("a", "self")] = np.nan

    if val1 is pd.NA and np_version_gte1p25:
        # can't compare with numpy array if it contains pd.NA
        with pytest.raises(TypeError, match="boolean value of NA is ambiguous"):
            df1.compare(df2, keep_shape=True)
        return

    result = df1.compare(df2, keep_shape=True)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "df1_val,df2_val,diff_self,diff_other",
    [
        (4, 3, 4, 3),
        (4, 4, pd.NA, pd.NA),
        (4, pd.NA, 4, pd.NA),
        (pd.NA, pd.NA, pd.NA, pd.NA),
    ],
)
def test_compare_nullable_int64_dtype(df1_val, df2_val, diff_self, diff_other):
    """``compare`` with ``keep_shape=True`` on a nullable ``Int64`` column."""
    # GH 48966
    left = pd.DataFrame(
        {"a": pd.Series([df1_val, pd.NA], dtype="Int64"), "b": [1.0, 2]}
    )
    right = left.copy()
    right.loc[0, "a"] = df2_val

    self_side = pd.Series([diff_self, pd.NA], dtype="Int64")
    other_side = pd.Series([diff_other, pd.NA], dtype="Int64")
    expected = pd.DataFrame(
        {
            ("a", "self"): self_side,
            ("a", "other"): other_side,
            ("b", "self"): np.nan,
            ("b", "other"): np.nan,
        }
    )
    result = left.compare(right, keep_shape=True)
    tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,202 @@
|
||||
import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestConvertDtypes:
    """Frame-level checks for ``DataFrame.convert_dtypes``."""

    @pytest.mark.parametrize(
        "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
    )
    def test_convert_dtypes(
        self, convert_integer, expected, string_storage, using_infer_string
    ):
        """Convert an int32/object frame with and without integer conversion."""
        # Specific types are tested in tests/series/test_dtypes.py
        # Just check that it works for DataFrame here
        if using_infer_string:
            string_storage = "pyarrow_numpy"
        df = pd.DataFrame(
            {
                "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
                "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
            }
        )
        with pd.option_context("string_storage", string_storage):
            # Keywords instead of four positional booleans so the call is
            # readable; the values passed are unchanged.
            result = df.convert_dtypes(
                infer_objects=True,
                convert_string=True,
                convert_integer=convert_integer,
                convert_boolean=False,
            )
        expected = pd.DataFrame(
            {
                "a": pd.Series([1, 2, 3], dtype=expected),
                "b": pd.Series(["x", "y", "z"], dtype=f"string[{string_storage}]"),
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_convert_empty(self):
        """An empty frame round-trips through convert_dtypes unchanged."""
        # Empty DataFrame can pass convert_dtypes, see GH#40393
        empty_df = pd.DataFrame()
        tm.assert_frame_equal(empty_df, empty_df.convert_dtypes())

    def test_convert_dtypes_retain_column_names(self):
        """The name of the columns Index survives the conversion."""
        # GH#41435
        df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
        df.columns.name = "cols"

        result = df.convert_dtypes()
        tm.assert_index_equal(result.columns, df.columns)
        assert result.columns.name == "cols"

    def test_pyarrow_dtype_backend(self):
        """NumPy-backed columns convert to Arrow arrays of the matching type."""
        pa = pytest.importorskip("pyarrow")
        df = pd.DataFrame(
            {
                "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
                "b": pd.Series(["x", "y", None], dtype=np.dtype("O")),
                "c": pd.Series([True, False, None], dtype=np.dtype("O")),
                "d": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
                "e": pd.Series(pd.date_range("2022", periods=3)),
                "f": pd.Series(pd.date_range("2022", periods=3, tz="UTC").as_unit("s")),
                "g": pd.Series(pd.timedelta_range("1D", periods=3)),
            }
        )
        result = df.convert_dtypes(dtype_backend="pyarrow")
        expected = pd.DataFrame(
            {
                "a": pd.arrays.ArrowExtensionArray(
                    pa.array([1, 2, 3], type=pa.int32())
                ),
                "b": pd.arrays.ArrowExtensionArray(pa.array(["x", "y", None])),
                "c": pd.arrays.ArrowExtensionArray(pa.array([True, False, None])),
                "d": pd.arrays.ArrowExtensionArray(pa.array([None, 100.5, 200.0])),
                "e": pd.arrays.ArrowExtensionArray(
                    pa.array(
                        [
                            datetime.datetime(2022, 1, 1),
                            datetime.datetime(2022, 1, 2),
                            datetime.datetime(2022, 1, 3),
                        ],
                        type=pa.timestamp(unit="ns"),
                    )
                ),
                "f": pd.arrays.ArrowExtensionArray(
                    pa.array(
                        [
                            datetime.datetime(2022, 1, 1),
                            datetime.datetime(2022, 1, 2),
                            datetime.datetime(2022, 1, 3),
                        ],
                        type=pa.timestamp(unit="s", tz="UTC"),
                    )
                ),
                "g": pd.arrays.ArrowExtensionArray(
                    pa.array(
                        [
                            datetime.timedelta(1),
                            datetime.timedelta(2),
                            datetime.timedelta(3),
                        ],
                        type=pa.duration("ns"),
                    )
                ),
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_pyarrow_dtype_backend_already_pyarrow(self):
        """A frame that is already Arrow-backed is returned equal to itself."""
        pytest.importorskip("pyarrow")
        expected = pd.DataFrame([1, 2, 3], dtype="int64[pyarrow]")
        result = expected.convert_dtypes(dtype_backend="pyarrow")
        tm.assert_frame_equal(result, expected)

    def test_pyarrow_dtype_backend_from_pandas_nullable(self):
        """Nullable (masked/string) columns convert to Arrow arrays."""
        pa = pytest.importorskip("pyarrow")
        df = pd.DataFrame(
            {
                "a": pd.Series([1, 2, None], dtype="Int32"),
                "b": pd.Series(["x", "y", None], dtype="string[python]"),
                "c": pd.Series([True, False, None], dtype="boolean"),
                "d": pd.Series([None, 100.5, 200], dtype="Float64"),
            }
        )
        result = df.convert_dtypes(dtype_backend="pyarrow")
        expected = pd.DataFrame(
            {
                "a": pd.arrays.ArrowExtensionArray(
                    pa.array([1, 2, None], type=pa.int32())
                ),
                "b": pd.arrays.ArrowExtensionArray(pa.array(["x", "y", None])),
                "c": pd.arrays.ArrowExtensionArray(pa.array([True, False, None])),
                "d": pd.arrays.ArrowExtensionArray(pa.array([None, 100.5, 200.0])),
            }
        )
        tm.assert_frame_equal(result, expected)

    def test_pyarrow_dtype_empty_object(self):
        """A frame with one column and no rows is returned equal to itself."""
        # GH 50970
        pytest.importorskip("pyarrow")
        expected = pd.DataFrame(columns=[0])
        result = expected.convert_dtypes(dtype_backend="pyarrow")
        tm.assert_frame_equal(result, expected)

    def test_pyarrow_engine_lines_false(self):
        """An unsupported ``dtype_backend`` value raises ValueError."""
        # GH 48893
        df = pd.DataFrame({"a": [1, 2, 3]})
        msg = (
            "dtype_backend numpy is invalid, only 'numpy_nullable' and "
            "'pyarrow' are allowed."
        )
        with pytest.raises(ValueError, match=msg):
            df.convert_dtypes(dtype_backend="numpy")

    def test_pyarrow_backend_no_conversion(self):
        """With every convert_* flag off, the frame comes back unchanged."""
        # GH#52872
        pytest.importorskip("pyarrow")
        df = pd.DataFrame({"a": [1, 2], "b": 1.5, "c": True, "d": "x"})
        expected = df.copy()
        result = df.convert_dtypes(
            convert_floating=False,
            convert_integer=False,
            convert_boolean=False,
            convert_string=False,
            dtype_backend="pyarrow",
        )
        tm.assert_frame_equal(result, expected)

    def test_convert_dtypes_pyarrow_to_np_nullable(self):
        """An int32[pyarrow] frame converts to the nullable Int32 dtype."""
        # GH 53648
        pytest.importorskip("pyarrow")
        # Named ``df`` because the object under test is a DataFrame.
        df = pd.DataFrame(range(2), dtype="int32[pyarrow]")
        result = df.convert_dtypes(dtype_backend="numpy_nullable")
        expected = pd.DataFrame(range(2), dtype="Int32")
        tm.assert_frame_equal(result, expected)

    def test_convert_dtypes_pyarrow_timestamp(self):
        """A timestamp[ms][pyarrow] Series is returned equal to itself."""
        # GH 54191
        pytest.importorskip("pyarrow")
        ser = pd.Series(pd.date_range("2020-01-01", "2020-01-02", freq="1min"))
        expected = ser.astype("timestamp[ms][pyarrow]")
        result = expected.convert_dtypes(dtype_backend="pyarrow")
        tm.assert_series_equal(result, expected)

    def test_convert_dtypes_avoid_block_splitting(self):
        """Unconverted integer columns stay together in a single block."""
        # GH#55341
        df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": "a"})
        result = df.convert_dtypes(convert_integer=False)
        expected = pd.DataFrame(
            {
                "a": [1, 2, 3],
                "b": [4, 5, 6],
                "c": pd.Series(["a"] * 3, dtype="string[python]"),
            }
        )
        tm.assert_frame_equal(result, expected)
        # One block for the two integer columns plus one for the string column.
        assert result._mgr.nblocks == 2

    def test_convert_dtypes_from_arrow(self):
        """Only the string column changes dtype; the time column is kept."""
        # GH#56581
        df = pd.DataFrame([["a", datetime.time(18, 12)]], columns=["a", "b"])
        result = df.convert_dtypes()
        expected = df.astype({"a": "string[python]"})
        tm.assert_frame_equal(result, expected)
|
@ -0,0 +1,64 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import DataFrame
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestCopy:
    """Checks for ``DataFrame.copy``."""

    @pytest.mark.parametrize("attr", ["index", "columns"])
    def test_copy_index_name_checking(self, float_frame, attr):
        """Renaming an axis on the copy must not rename it on the original."""
        # don't want to be able to modify the index stored elsewhere after
        # making a copy
        ind = getattr(float_frame, attr)
        ind.name = None
        cp = float_frame.copy()
        getattr(cp, attr).name = "foo"
        assert getattr(float_frame, attr).name is None

    @td.skip_copy_on_write_invalid_test
    def test_copy_cache(self):
        """Reads after an in-place write stay correct once a copy was made."""
        # GH#31784 _item_cache not cleared on copy causes incorrect reads after updates
        df = DataFrame({"a": [1]})

        df["x"] = [0]
        # Bare column access: performed for its caching side effect.
        df["a"]

        # The copy itself is discarded; only its effect on ``df`` matters.
        df.copy()

        df["a"].values[0] = -1

        tm.assert_frame_equal(df, DataFrame({"a": [-1], "x": [0]}))

        df["y"] = [0]

        assert df["a"].values[0] == -1
        tm.assert_frame_equal(df, DataFrame({"a": [-1], "x": [0], "y": [0]}))

    def test_copy(self, float_frame, float_string_frame):
        """A copy is independent of the frame it was made from."""
        cop = float_frame.copy()
        cop["E"] = cop["A"]
        assert "E" not in float_frame

        # copy objects
        copy = float_string_frame.copy()
        assert copy._mgr is not float_string_frame._mgr

    @td.skip_array_manager_invalid_test
    def test_copy_consolidates(self):
        """Copying a frame made of many blocks yields a single block."""
        # GH#42477
        df = DataFrame(
            {
                "a": np.random.default_rng(2).integers(0, 100, size=55),
                "b": np.random.default_rng(2).integers(0, 100, size=55),
            }
        )

        for i in range(10):
            df.loc[:, f"n_{i}"] = np.random.default_rng(2).integers(0, 100, size=55)

        # Initial block plus one block per column added in the loop.
        assert len(df._mgr.blocks) == 11
        result = df.copy()
        assert len(result._mgr.blocks) == 1
|
@ -0,0 +1,39 @@
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameCount:
|
||||
def test_count(self):
|
||||
# corner case
|
||||
frame = DataFrame()
|
||||
ct1 = frame.count(1)
|
||||
assert isinstance(ct1, Series)
|
||||
|
||||
ct2 = frame.count(0)
|
||||
assert isinstance(ct2, Series)
|
||||
|
||||
# GH#423
|
||||
df = DataFrame(index=range(10))
|
||||
result = df.count(1)
|
||||
expected = Series(0, index=df.index)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame(columns=range(10))
|
||||
result = df.count(0)
|
||||
expected = Series(0, index=df.columns)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
df = DataFrame()
|
||||
result = df.count()
|
||||
expected = Series(dtype="int64")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
def test_count_objects(self, float_string_frame):
|
||||
dm = DataFrame(float_string_frame._series)
|
||||
df = DataFrame(float_string_frame._series)
|
||||
|
||||
tm.assert_series_equal(dm.count(), df.count())
|
||||
tm.assert_series_equal(dm.count(1), df.count(1))
|
@ -0,0 +1,471 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
date_range,
|
||||
isna,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameCov:
    """Checks for ``DataFrame.cov``."""

    def test_cov(self, float_frame, float_string_frame):
        """Exercise min_periods, NaN handling, numeric_only and 1-column input."""
        # min_periods no NAs (corner case)
        expected = float_frame.cov()
        result = float_frame.cov(min_periods=len(float_frame))

        tm.assert_frame_equal(expected, result)

        result = float_frame.cov(min_periods=len(float_frame) + 1)
        assert isna(result.values).all()

        # with NAs
        frame = float_frame.copy()
        frame.iloc[:5, frame.columns.get_loc("A")] = np.nan
        frame.iloc[5:10, frame.columns.get_loc("B")] = np.nan
        result = frame.cov(min_periods=len(frame) - 8)
        expected = frame.cov()
        # A and B share only len(frame) - 10 complete rows, which is below
        # the requested min_periods, so that pair must come back as NaN.
        expected.loc["A", "B"] = np.nan
        expected.loc["B", "A"] = np.nan
        tm.assert_frame_equal(result, expected)

        # regular
        result = frame.cov()
        expected = frame["A"].cov(frame["C"])
        tm.assert_almost_equal(result["A"]["C"], expected)

        # fails on non-numeric types
        with pytest.raises(ValueError, match="could not convert string to float"):
            float_string_frame.cov()
        result = float_string_frame.cov(numeric_only=True)
        expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].cov()
        tm.assert_frame_equal(result, expected)

        # Single column frame
        df = DataFrame(np.linspace(0.0, 1.0, 10))
        result = df.cov()
        expected = DataFrame(
            np.cov(df.values.T).reshape((1, 1)), index=df.columns, columns=df.columns
        )
        tm.assert_frame_equal(result, expected)
        df.loc[0] = np.nan
        result = df.cov()
        expected = DataFrame(
            np.cov(df.values[1:].T).reshape((1, 1)),
            index=df.columns,
            columns=df.columns,
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("test_ddof", [None, 0, 1, 2, 3])
    def test_cov_ddof(self, test_ddof):
        """``ddof`` is forwarded so the result matches ``np.cov``."""
        # GH#34611
        # Draw both columns from ONE generator.  Two generators created with
        # the same seed produce identical arrays, which would make the
        # off-diagonal covariance equal the variance and hide a wrong result.
        rng = np.random.default_rng(2)
        np_array1 = rng.random(10)
        np_array2 = rng.random(10)
        df = DataFrame({0: np_array1, 1: np_array2})
        result = df.cov(ddof=test_ddof)
        expected_np = np.cov(np_array1, np_array2, ddof=test_ddof)
        expected = DataFrame(expected_np)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "other_column", [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0])]
    )
    def test_cov_nullable_integer(self, other_column):
        """A nullable integer column with a missing value is supported."""
        # https://github.com/pandas-dev/pandas/issues/33803
        data = DataFrame({"a": pd.array([1, 2, None]), "b": other_column})
        result = data.cov()
        arr = np.array([[0.5, 0.5], [0.5, 1.0]])
        expected = DataFrame(arr, columns=["a", "b"], index=["a", "b"])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("numeric_only", [True, False])
    def test_cov_numeric_only(self, numeric_only):
        """String columns are dropped with numeric_only, otherwise they raise."""
        # when dtypes of pandas series are different
        # then ndarray will have dtype=object,
        # so it need to be properly handled
        df = DataFrame({"a": [1, 0], "c": ["x", "y"]})
        expected = DataFrame(0.5, index=["a"], columns=["a"])
        if numeric_only:
            result = df.cov(numeric_only=numeric_only)
            tm.assert_frame_equal(result, expected)
        else:
            with pytest.raises(ValueError, match="could not convert string to float"):
                df.cov(numeric_only=numeric_only)
|
||||
|
||||
|
||||
class TestDataFrameCorr:
    """Checks for ``DataFrame.corr``."""

    # DataFrame.corr(), as opposed to DataFrame.corrwith

    @pytest.mark.parametrize("method", ["pearson", "kendall", "spearman"])
    def test_corr_scipy_method(self, float_frame, method):
        """The frame-level entry for (A, C) matches ``Series.corr``."""
        pytest.importorskip("scipy")
        float_frame.loc[float_frame.index[:5], "A"] = np.nan
        float_frame.loc[float_frame.index[5:10], "B"] = np.nan
        float_frame.loc[float_frame.index[:10], "A"] = float_frame["A"][10:20].copy()

        correls = float_frame.corr(method=method)
        expected = float_frame["A"].corr(float_frame["C"], method=method)
        tm.assert_almost_equal(correls["A"]["C"], expected)

    # ---------------------------------------------------------------------

    def test_corr_non_numeric(self, float_string_frame):
        """String columns raise unless ``numeric_only=True`` is passed."""
        with pytest.raises(ValueError, match="could not convert string to float"):
            float_string_frame.corr()
        result = float_string_frame.corr(numeric_only=True)
        expected = float_string_frame.loc[:, ["A", "B", "C", "D"]].corr()
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
    def test_corr_nooverlap(self, meth):
        """Columns with no rows in common correlate to NaN."""
        # nothing in common
        pytest.importorskip("scipy")
        df = DataFrame(
            {
                "A": [1, 1.5, 1, np.nan, np.nan, np.nan],
                "B": [np.nan, np.nan, np.nan, 1, 1.5, 1],
                "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
            }
        )
        rs = df.corr(meth)
        assert isna(rs.loc["A", "B"])
        assert isna(rs.loc["B", "A"])
        assert rs.loc["A", "A"] == 1
        assert rs.loc["B", "B"] == 1
        assert isna(rs.loc["C", "C"])

    @pytest.mark.parametrize("meth", ["pearson", "spearman"])
    def test_corr_constant(self, meth):
        """Constant columns give an all-NaN correlation matrix."""
        # constant --> all NA
        df = DataFrame(
            {
                "A": [1, 1, 1, np.nan, np.nan, np.nan],
                "B": [np.nan, np.nan, np.nan, 1, 1, 1],
            }
        )
        rs = df.corr(meth)
        assert isna(rs.values).all()

    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
    @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
    def test_corr_int_and_boolean(self, meth):
        """A bool column and an int column can be correlated together."""
        # when dtypes of pandas series are different
        # then ndarray will have dtype=object,
        # so it need to be properly handled
        pytest.importorskip("scipy")
        df = DataFrame({"a": [True, False], "b": [1, 0]})

        expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"])
        result = df.corr(meth)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("method", ["cov", "corr"])
    def test_corr_cov_independent_index_column(self, method):
        """Result index and columns are equal but distinct objects."""
        # GH#14617
        df = DataFrame(
            np.random.default_rng(2).standard_normal(4 * 10).reshape(10, 4),
            columns=list("abcd"),
        )
        result = getattr(df, method)()
        assert result.index is not result.columns
        assert result.index.equals(result.columns)

    def test_corr_invalid_method(self):
        """An unknown method name raises ValueError."""
        # GH#22298
        df = DataFrame(np.random.default_rng(2).normal(size=(10, 2)))
        msg = "method must be either 'pearson', 'spearman', 'kendall', or a callable, "
        with pytest.raises(ValueError, match=msg):
            df.corr(method="____")

    def test_corr_int(self):
        """cov and corr run without error on integer columns."""
        # dtypes other than float64 GH#1761
        df = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]})

        # Smoke test only: the results are not inspected.
        df.cov()
        df.corr()

    @pytest.mark.parametrize(
        "nullable_column", [pd.array([1, 2, 3]), pd.array([1, 2, None])]
    )
    @pytest.mark.parametrize(
        "other_column",
        [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, np.nan])],
    )
    @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
    def test_corr_nullable_integer(self, nullable_column, other_column, method):
        """Nullable integer columns, with or without NA, are supported."""
        # https://github.com/pandas-dev/pandas/issues/33803
        pytest.importorskip("scipy")
        data = DataFrame({"a": nullable_column, "b": other_column})
        result = data.corr(method=method)
        expected = DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"])
        tm.assert_frame_equal(result, expected)

    def test_corr_item_cache(self, using_copy_on_write, warn_copy_on_write):
        """Calling corr must not disturb a previously fetched column."""
        # Check that corr does not lead to incorrect entries in item_cache

        df = DataFrame({"A": range(10)})
        df["B"] = range(10)[::-1]

        ser = df["A"]  # populate item_cache
        assert len(df._mgr.arrays) == 2  # i.e. 2 blocks

        _ = df.corr(numeric_only=True)

        if using_copy_on_write:
            # Writing through the Series must leave the frame untouched.
            ser.iloc[0] = 99
            assert df.loc[0, "A"] == 0
        else:
            # Check that the corr didn't break link between ser and df
            ser.values[0] = 99
            assert df.loc[0, "A"] == 99
            if not warn_copy_on_write:
                assert df["A"] is ser
            assert df.values[0, 0] == 99

    @pytest.mark.parametrize("length", [2, 20, 200, 2000])
    def test_corr_for_constant_columns(self, length):
        """Constant columns give NaN for every tested frame length."""
        # GH: 37448
        df = DataFrame(length * [[0.4, 0.1]], columns=["A", "B"])
        result = df.corr()
        expected = DataFrame(
            {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"]
        )
        tm.assert_frame_equal(result, expected)

    def test_calc_corr_small_numbers(self):
        """Values of magnitude 1e-20 still correlate to exactly 1."""
        # GH: 37452
        df = DataFrame(
            {"A": [1.0e-20, 2.0e-20, 3.0e-20], "B": [1.0e-20, 2.0e-20, 3.0e-20]}
        )
        result = df.corr()
        expected = DataFrame({"A": [1.0, 1.0], "B": [1.0, 1.0]}, index=["A", "B"])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
    def test_corr_min_periods_greater_than_length(self, method):
        """min_periods larger than the number of rows gives all NaN."""
        pytest.importorskip("scipy")
        df = DataFrame({"A": [1, 2], "B": [1, 2]})
        result = df.corr(method=method, min_periods=3)
        expected = DataFrame(
            {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"]
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"])
    @pytest.mark.parametrize("numeric_only", [True, False])
    def test_corr_numeric_only(self, meth, numeric_only):
        """String columns are dropped with numeric_only, otherwise they raise."""
        # when dtypes of pandas series are different
        # then ndarray will have dtype=object,
        # so it need to be properly handled
        pytest.importorskip("scipy")
        df = DataFrame({"a": [1, 0], "b": [1, 0], "c": ["x", "y"]})
        expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"])
        if numeric_only:
            result = df.corr(meth, numeric_only=numeric_only)
            tm.assert_frame_equal(result, expected)
        else:
            with pytest.raises(ValueError, match="could not convert string to float"):
                df.corr(meth, numeric_only=numeric_only)
|
||||
|
||||
|
||||
class TestDataFrameCorrWith:
    """Checks for ``DataFrame.corrwith``."""

    @pytest.mark.parametrize(
        "dtype",
        [
            "float64",
            "Float64",
            pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")),
        ],
    )
    def test_corrwith(self, datetime_frame, dtype):
        """corrwith agrees with pairwise ``Series.corr`` along both axes."""
        datetime_frame = datetime_frame.astype(dtype)

        a = datetime_frame
        noise = Series(np.random.default_rng(2).standard_normal(len(a)), index=a.index)

        b = datetime_frame.add(noise, axis=0)

        # make sure order does not matter
        b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:])
        del b["B"]

        colcorr = a.corrwith(b, axis=0)
        tm.assert_almost_equal(colcorr["A"], a["A"].corr(b["A"]))

        rowcorr = a.corrwith(b, axis=1)
        tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0))

        dropped = a.corrwith(b, axis=0, drop=True)
        tm.assert_almost_equal(dropped["A"], a["A"].corr(b["A"]))
        assert "B" not in dropped

        dropped = a.corrwith(b, axis=1, drop=True)
        assert a.index[-1] not in dropped.index

        # non time-series data
        index = ["a", "b", "c", "d", "e"]
        columns = ["one", "two", "three", "four"]
        # Draw both frames from ONE generator.  Two generators created with
        # the same seed would make df2 a copy of df1's first four rows, so
        # every row correlation would be trivially 1.
        rng = np.random.default_rng(2)
        df1 = DataFrame(
            rng.standard_normal((5, 4)),
            index=index,
            columns=columns,
        )
        df2 = DataFrame(
            rng.standard_normal((4, 4)),
            index=index[:4],
            columns=columns,
        )
        correls = df1.corrwith(df2, axis=1)
        for row in index[:4]:
            tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row]))

    def test_corrwith_with_objects(self, using_infer_string):
        """String columns raise unless ``numeric_only=True`` is passed."""
        df1 = DataFrame(
            np.random.default_rng(2).standard_normal((10, 4)),
            columns=Index(list("ABCD"), dtype=object),
            index=date_range("2000-01-01", periods=10, freq="B"),
        )
        df2 = df1.copy()
        cols = ["A", "B", "C", "D"]

        df1["obj"] = "foo"
        df2["obj"] = "bar"

        if using_infer_string:
            import pyarrow as pa

            with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"):
                df1.corrwith(df2)
        else:
            with pytest.raises(TypeError, match="Could not convert"):
                df1.corrwith(df2)
        result = df1.corrwith(df2, numeric_only=True)
        expected = df1.loc[:, cols].corrwith(df2.loc[:, cols])
        tm.assert_series_equal(result, expected)

        with pytest.raises(TypeError, match="unsupported operand type"):
            df1.corrwith(df2, axis=1)
        result = df1.corrwith(df2, axis=1, numeric_only=True)
        expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1)
        tm.assert_series_equal(result, expected)

    def test_corrwith_series(self, datetime_frame):
        """corrwith a Series equals applying ``Series.corr`` per column."""
        result = datetime_frame.corrwith(datetime_frame["A"])
        expected = datetime_frame.apply(datetime_frame["A"].corr)

        tm.assert_series_equal(result, expected)

    def test_corrwith_matches_corrcoef(self):
        """The result agrees with ``np.corrcoef`` and stays below 1."""
        df1 = DataFrame(np.arange(10000), columns=["a"])
        df2 = DataFrame(np.arange(10000) ** 2, columns=["a"])
        c1 = df1.corrwith(df2)["a"]
        c2 = np.corrcoef(df1["a"], df2["a"])[0][1]

        tm.assert_almost_equal(c1, c2)
        assert c1 < 1

    @pytest.mark.parametrize("numeric_only", [True, False])
    def test_corrwith_mixed_dtypes(self, numeric_only):
        """String columns are dropped with numeric_only, otherwise they raise."""
        # GH#18570
        df = DataFrame(
            {"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]}
        )
        s = Series([0, 6, 7, 3])
        if numeric_only:
            result = df.corrwith(s, numeric_only=numeric_only)
            corrs = [df["a"].corr(s), df["b"].corr(s)]
            expected = Series(data=corrs, index=["a", "b"])
            tm.assert_series_equal(result, expected)
        else:
            with pytest.raises(
                ValueError,
                match="could not convert string to float",
            ):
                df.corrwith(s, numeric_only=numeric_only)

    def test_corrwith_index_intersection(self):
        """With ``drop=True`` the result holds only the shared columns."""
        df1 = DataFrame(
            np.random.default_rng(2).random(size=(10, 2)), columns=["a", "b"]
        )
        df2 = DataFrame(
            np.random.default_rng(2).random(size=(10, 3)), columns=["a", "b", "c"]
        )

        result = df1.corrwith(df2, drop=True).index.sort_values()
        expected = df1.columns.intersection(df2.columns).sort_values()
        tm.assert_index_equal(result, expected)

    def test_corrwith_index_union(self):
        """With ``drop=False`` the result holds the union of the columns."""
        df1 = DataFrame(
            np.random.default_rng(2).random(size=(10, 2)), columns=["a", "b"]
        )
        df2 = DataFrame(
            np.random.default_rng(2).random(size=(10, 3)), columns=["a", "b", "c"]
        )

        result = df1.corrwith(df2, drop=False).index.sort_values()
        expected = df1.columns.union(df2.columns).sort_values()
        tm.assert_index_equal(result, expected)

    def test_corrwith_dup_cols(self):
        """A duplicated column label in ``other`` is repeated in the result."""
        # GH#21925
        df1 = DataFrame(np.vstack([np.arange(10)] * 3).T)
        df2 = df1.copy()
        df2 = pd.concat((df2, df2[0]), axis=1)

        result = df1.corrwith(df2)
        expected = Series(np.ones(4), index=[0, 0, 1, 2])
        tm.assert_series_equal(result, expected)

    def test_corr_numerical_instabilities(self):
        """Perfectly anti-correlated columns give +1/-1 within 1e-17."""
        # GH#45640
        df = DataFrame([[0.2, 0.4], [0.4, 0.2]])
        result = df.corr()
        expected = DataFrame({0: [1.0, -1.0], 1: [-1.0, 1.0]})
        # Both sides are shifted by -1 before the atol=1e-17 comparison.
        tm.assert_frame_equal(result - 1, expected - 1, atol=1e-17)

    def test_corrwith_spearman(self):
        """Spearman correlation of a frame with its square is all ones."""
        # GH#21925
        pytest.importorskip("scipy")
        df = DataFrame(np.random.default_rng(2).random(size=(100, 3)))
        result = df.corrwith(df**2, method="spearman")
        expected = Series(np.ones(len(result)))
        tm.assert_series_equal(result, expected)

    def test_corrwith_kendall(self):
        """Kendall correlation of a frame with its square is all ones."""
        # GH#21925
        pytest.importorskip("scipy")
        df = DataFrame(np.random.default_rng(2).random(size=(100, 3)))
        result = df.corrwith(df**2, method="kendall")
        expected = Series(np.ones(len(result)))
        tm.assert_series_equal(result, expected)

    def test_corrwith_spearman_with_tied_data(self):
        """Tied and boolean data give the expected correlations."""
        # GH#48826
        pytest.importorskip("scipy")
        df1 = DataFrame(
            {
                "A": [1, np.nan, 7, 8],
                "B": [False, True, True, False],
                "C": [10, 4, 9, 3],
            }
        )
        df2 = df1[["B", "C"]]
        result = (df1 + 1).corrwith(df2.B, method="spearman")
        expected = Series([0.0, 1.0, 0.0], index=["A", "B", "C"])
        tm.assert_series_equal(result, expected)

        # Boolean frame against a boolean Series, using the default method.
        df_bool = DataFrame(
            {"A": [True, True, False, False], "B": [True, False, False, True]}
        )
        ser_bool = Series([True, True, False, True])
        result = df_bool.corrwith(ser_bool)
        expected = Series([0.57735, 0.57735], index=["A", "B"])
        tm.assert_series_equal(result, expected)
|
@ -0,0 +1,417 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Categorical,
|
||||
DataFrame,
|
||||
Series,
|
||||
Timestamp,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
class TestDataFrameDescribe:
|
||||
def test_describe_bool_in_mixed_frame(self):
|
||||
df = DataFrame(
|
||||
{
|
||||
"string_data": ["a", "b", "c", "d", "e"],
|
||||
"bool_data": [True, True, False, False, False],
|
||||
"int_data": [10, 20, 30, 40, 50],
|
||||
}
|
||||
)
|
||||
|
||||
# Integer data are included in .describe() output,
|
||||
# Boolean and string data are not.
|
||||
result = df.describe()
|
||||
expected = DataFrame(
|
||||
{"int_data": [5, 30, df.int_data.std(), 10, 20, 30, 40, 50]},
|
||||
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# Top value is a boolean value that is False
|
||||
result = df.describe(include=["bool"])
|
||||
|
||||
expected = DataFrame(
|
||||
{"bool_data": [5, 2, False, 3]}, index=["count", "unique", "top", "freq"]
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_describe_empty_object(self):
|
||||
# GH#27183
|
||||
df = DataFrame({"A": [None, None]}, dtype=object)
|
||||
result = df.describe()
|
||||
expected = DataFrame(
|
||||
{"A": [0, 0, np.nan, np.nan]},
|
||||
dtype=object,
|
||||
index=["count", "unique", "top", "freq"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = df.iloc[:0].describe()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_describe_bool_frame(self):
|
||||
# GH#13891
|
||||
df = DataFrame(
|
||||
{
|
||||
"bool_data_1": [False, False, True, True],
|
||||
"bool_data_2": [False, True, True, True],
|
||||
}
|
||||
)
|
||||
result = df.describe()
|
||||
expected = DataFrame(
|
||||
{"bool_data_1": [4, 2, False, 2], "bool_data_2": [4, 2, True, 3]},
|
||||
index=["count", "unique", "top", "freq"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame(
|
||||
{
|
||||
"bool_data": [False, False, True, True, False],
|
||||
"int_data": [0, 1, 2, 3, 4],
|
||||
}
|
||||
)
|
||||
result = df.describe()
|
||||
expected = DataFrame(
|
||||
{"int_data": [5, 2, df.int_data.std(), 0, 1, 2, 3, 4]},
|
||||
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
df = DataFrame(
|
||||
{"bool_data": [False, False, True, True], "str_data": ["a", "b", "c", "a"]}
|
||||
)
|
||||
result = df.describe()
|
||||
expected = DataFrame(
|
||||
{"bool_data": [4, 2, False, 2], "str_data": [4, 3, "a", 2]},
|
||||
index=["count", "unique", "top", "freq"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_describe_categorical(self):
|
||||
df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 100)})
|
||||
labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)]
|
||||
cat_labels = Categorical(labels, labels)
|
||||
|
||||
df = df.sort_values(by=["value"], ascending=True)
|
||||
df["value_group"] = pd.cut(
|
||||
df.value, range(0, 10500, 500), right=False, labels=cat_labels
|
||||
)
|
||||
cat = df
|
||||
|
||||
# Categoricals should not show up together with numerical columns
|
||||
result = cat.describe()
|
||||
assert len(result.columns) == 1
|
||||
|
||||
# In a frame, describe() for the cat should be the same as for string
|
||||
# arrays (count, unique, top, freq)
|
||||
|
||||
cat = Categorical(
|
||||
["a", "b", "b", "b"], categories=["a", "b", "c"], ordered=True
|
||||
)
|
||||
s = Series(cat)
|
||||
result = s.describe()
|
||||
expected = Series([4, 2, "b", 3], index=["count", "unique", "top", "freq"])
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
cat = Series(Categorical(["a", "b", "c", "c"]))
|
||||
df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]})
|
||||
result = df3.describe()
|
||||
tm.assert_numpy_array_equal(result["cat"].values, result["s"].values)
|
||||
|
||||
def test_describe_empty_categorical_column(self):
|
||||
# GH#26397
|
||||
# Ensure the index of an empty categorical DataFrame column
|
||||
# also contains (count, unique, top, freq)
|
||||
df = DataFrame({"empty_col": Categorical([])})
|
||||
result = df.describe()
|
||||
expected = DataFrame(
|
||||
{"empty_col": [0, 0, np.nan, np.nan]},
|
||||
index=["count", "unique", "top", "freq"],
|
||||
dtype="object",
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
# ensure NaN, not None
|
||||
assert np.isnan(result.iloc[2, 0])
|
||||
assert np.isnan(result.iloc[3, 0])
|
||||
|
||||
def test_describe_categorical_columns(self):
|
||||
# GH#11558
|
||||
columns = pd.CategoricalIndex(["int1", "int2", "obj"], ordered=True, name="XXX")
|
||||
df = DataFrame(
|
||||
{
|
||||
"int1": [10, 20, 30, 40, 50],
|
||||
"int2": [10, 20, 30, 40, 50],
|
||||
"obj": ["A", 0, None, "X", 1],
|
||||
},
|
||||
columns=columns,
|
||||
)
|
||||
result = df.describe()
|
||||
|
||||
exp_columns = pd.CategoricalIndex(
|
||||
["int1", "int2"],
|
||||
categories=["int1", "int2", "obj"],
|
||||
ordered=True,
|
||||
name="XXX",
|
||||
)
|
||||
expected = DataFrame(
|
||||
{
|
||||
"int1": [5, 30, df.int1.std(), 10, 20, 30, 40, 50],
|
||||
"int2": [5, 30, df.int2.std(), 10, 20, 30, 40, 50],
|
||||
},
|
||||
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
||||
columns=exp_columns,
|
||||
)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
tm.assert_categorical_equal(result.columns.values, expected.columns.values)
|
||||
|
||||
def test_describe_datetime_columns(self):
|
||||
columns = pd.DatetimeIndex(
|
||||
["2011-01-01", "2011-02-01", "2011-03-01"],
|
||||
freq="MS",
|
||||
tz="US/Eastern",
|
||||
name="XXX",
|
||||
)
|
||||
df = DataFrame(
|
||||
{
|
||||
0: [10, 20, 30, 40, 50],
|
||||
1: [10, 20, 30, 40, 50],
|
||||
2: ["A", 0, None, "X", 1],
|
||||
}
|
||||
)
|
||||
df.columns = columns
|
||||
result = df.describe()
|
||||
|
||||
exp_columns = pd.DatetimeIndex(
|
||||
["2011-01-01", "2011-02-01"], freq="MS", tz="US/Eastern", name="XXX"
|
||||
)
|
||||
expected = DataFrame(
|
||||
{
|
||||
0: [5, 30, df.iloc[:, 0].std(), 10, 20, 30, 40, 50],
|
||||
1: [5, 30, df.iloc[:, 1].std(), 10, 20, 30, 40, 50],
|
||||
},
|
||||
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
||||
)
|
||||
expected.columns = exp_columns
|
||||
tm.assert_frame_equal(result, expected)
|
||||
assert result.columns.freq == "MS"
|
||||
assert result.columns.tz == expected.columns.tz
|
||||
|
||||
def test_describe_timedelta_values(self):
|
||||
# GH#6145
|
||||
t1 = pd.timedelta_range("1 days", freq="D", periods=5)
|
||||
t2 = pd.timedelta_range("1 hours", freq="h", periods=5)
|
||||
df = DataFrame({"t1": t1, "t2": t2})
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"t1": [
|
||||
5,
|
||||
pd.Timedelta("3 days"),
|
||||
df.iloc[:, 0].std(),
|
||||
pd.Timedelta("1 days"),
|
||||
pd.Timedelta("2 days"),
|
||||
pd.Timedelta("3 days"),
|
||||
pd.Timedelta("4 days"),
|
||||
pd.Timedelta("5 days"),
|
||||
],
|
||||
"t2": [
|
||||
5,
|
||||
pd.Timedelta("3 hours"),
|
||||
df.iloc[:, 1].std(),
|
||||
pd.Timedelta("1 hours"),
|
||||
pd.Timedelta("2 hours"),
|
||||
pd.Timedelta("3 hours"),
|
||||
pd.Timedelta("4 hours"),
|
||||
pd.Timedelta("5 hours"),
|
||||
],
|
||||
},
|
||||
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
||||
)
|
||||
|
||||
result = df.describe()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
exp_repr = (
|
||||
" t1 t2\n"
|
||||
"count 5 5\n"
|
||||
"mean 3 days 00:00:00 0 days 03:00:00\n"
|
||||
"std 1 days 13:56:50.394919273 0 days 01:34:52.099788303\n"
|
||||
"min 1 days 00:00:00 0 days 01:00:00\n"
|
||||
"25% 2 days 00:00:00 0 days 02:00:00\n"
|
||||
"50% 3 days 00:00:00 0 days 03:00:00\n"
|
||||
"75% 4 days 00:00:00 0 days 04:00:00\n"
|
||||
"max 5 days 00:00:00 0 days 05:00:00"
|
||||
)
|
||||
assert repr(result) == exp_repr
|
||||
|
||||
def test_describe_tz_values(self, tz_naive_fixture):
|
||||
# GH#21332
|
||||
tz = tz_naive_fixture
|
||||
s1 = Series(range(5))
|
||||
start = Timestamp(2018, 1, 1)
|
||||
end = Timestamp(2018, 1, 5)
|
||||
s2 = Series(date_range(start, end, tz=tz))
|
||||
df = DataFrame({"s1": s1, "s2": s2})
|
||||
|
||||
expected = DataFrame(
|
||||
{
|
||||
"s1": [5, 2, 0, 1, 2, 3, 4, 1.581139],
|
||||
"s2": [
|
||||
5,
|
||||
Timestamp(2018, 1, 3).tz_localize(tz),
|
||||
start.tz_localize(tz),
|
||||
s2[1],
|
||||
s2[2],
|
||||
s2[3],
|
||||
end.tz_localize(tz),
|
||||
np.nan,
|
||||
],
|
||||
},
|
||||
index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
|
||||
)
|
||||
result = df.describe(include="all")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_datetime_is_numeric_includes_datetime(self):
|
||||
df = DataFrame({"a": date_range("2012", periods=3), "b": [1, 2, 3]})
|
||||
result = df.describe()
|
||||
expected = DataFrame(
|
||||
{
|
||||
"a": [
|
||||
3,
|
||||
Timestamp("2012-01-02"),
|
||||
Timestamp("2012-01-01"),
|
||||
Timestamp("2012-01-01T12:00:00"),
|
||||
Timestamp("2012-01-02"),
|
||||
Timestamp("2012-01-02T12:00:00"),
|
||||
Timestamp("2012-01-03"),
|
||||
np.nan,
|
||||
],
|
||||
"b": [3, 2, 1, 1.5, 2, 2.5, 3, 1],
|
||||
},
|
||||
index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_describe_tz_values2(self):
|
||||
tz = "CET"
|
||||
s1 = Series(range(5))
|
||||
start = Timestamp(2018, 1, 1)
|
||||
end = Timestamp(2018, 1, 5)
|
||||
s2 = Series(date_range(start, end, tz=tz))
|
||||
df = DataFrame({"s1": s1, "s2": s2})
|
||||
|
||||
s1_ = s1.describe()
|
||||
s2_ = s2.describe()
|
||||
idx = [
|
||||
"count",
|
||||
"mean",
|
||||
"min",
|
||||
"25%",
|
||||
"50%",
|
||||
"75%",
|
||||
"max",
|
||||
"std",
|
||||
]
|
||||
expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).reindex(
|
||||
idx, copy=False
|
||||
)
|
||||
|
||||
result = df.describe(include="all")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_describe_percentiles_integer_idx(self):
|
||||
# GH#26660
|
||||
df = DataFrame({"x": [1]})
|
||||
pct = np.linspace(0, 1, 10 + 1)
|
||||
result = df.describe(percentiles=pct)
|
||||
|
||||
expected = DataFrame(
|
||||
{"x": [1.0, 1.0, np.nan, 1.0, *(1.0 for _ in pct), 1.0]},
|
||||
index=[
|
||||
"count",
|
||||
"mean",
|
||||
"std",
|
||||
"min",
|
||||
"0%",
|
||||
"10%",
|
||||
"20%",
|
||||
"30%",
|
||||
"40%",
|
||||
"50%",
|
||||
"60%",
|
||||
"70%",
|
||||
"80%",
|
||||
"90%",
|
||||
"100%",
|
||||
"max",
|
||||
],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_describe_does_not_raise_error_for_dictlike_elements(self):
|
||||
# GH#32409
|
||||
df = DataFrame([{"test": {"a": "1"}}, {"test": {"a": "2"}}])
|
||||
expected = DataFrame(
|
||||
{"test": [2, 2, {"a": "1"}, 1]}, index=["count", "unique", "top", "freq"]
|
||||
)
|
||||
result = df.describe()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize("exclude", ["x", "y", ["x", "y"], ["x", "z"]])
def test_describe_when_include_all_exclude_not_allowed(self, exclude):
    """
    Passing any of the parametrized ``exclude`` values together with
    include='all' must raise ValueError.
    """
    frame = DataFrame({"x": [1], "y": [2], "z": [3]})
    with pytest.raises(
        ValueError, match="exclude must be None when include is 'all'"
    ):
        frame.describe(include="all", exclude=exclude)
|
||||
|
||||
def test_describe_with_duplicate_columns(self):
|
||||
df = DataFrame(
|
||||
[[1, 1, 1], [2, 2, 2], [3, 3, 3]],
|
||||
columns=["bar", "a", "a"],
|
||||
dtype="float64",
|
||||
)
|
||||
result = df.describe()
|
||||
ser = df.iloc[:, 0].describe()
|
||||
expected = pd.concat([ser, ser, ser], keys=df.columns, axis=1)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_ea_with_na(self, any_numeric_ea_dtype):
|
||||
# GH#48778
|
||||
|
||||
df = DataFrame({"a": [1, pd.NA, pd.NA], "b": pd.NA}, dtype=any_numeric_ea_dtype)
|
||||
result = df.describe()
|
||||
expected = DataFrame(
|
||||
{"a": [1.0, 1.0, pd.NA] + [1.0] * 5, "b": [0.0] + [pd.NA] * 7},
|
||||
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
|
||||
dtype="Float64",
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_describe_exclude_pa_dtype(self):
    # GH#52570
    # include/exclude given as ArrowDtype instances: only the int8 column
    # "a" is expected in the result.
    pa = pytest.importorskip("pyarrow")
    int8_dtype = pd.ArrowDtype(pa.int8())
    int16_dtype = pd.ArrowDtype(pa.int16())
    int32_dtype = pd.ArrowDtype(pa.int32())
    frame = DataFrame(
        {
            name: Series([1, 2, 3], dtype=dtype)
            for name, dtype in zip("abc", (int8_dtype, int16_dtype, int32_dtype))
        }
    )

    described = frame.describe(include=int8_dtype, exclude=int32_dtype)

    expected = DataFrame(
        {"a": [3, 2, 1, 1, 1.5, 2, 2.5, 3]},
        index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
        dtype=pd.ArrowDtype(pa.float64()),
    )
    tm.assert_frame_equal(described, expected)
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user