forked from Alsan/Post_finder
venv
This commit is contained in:
@ -0,0 +1,15 @@
|
||||
import numpy as np
|
||||
|
||||
import pandas as pd
|
||||
|
||||
# Pair of dtype names: plain "object" and the "string[pyarrow_numpy]" string dtype.
object_pyarrow_numpy = ("object", "string[pyarrow_numpy]")
|
||||
|
||||
|
||||
def _convert_na_value(ser, expected):
|
||||
if ser.dtype != object:
|
||||
if ser.dtype.storage == "pyarrow_numpy":
|
||||
expected = expected.fillna(np.nan)
|
||||
else:
|
||||
# GH#18463
|
||||
expected = expected.fillna(pd.NA)
|
||||
return expected
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,132 @@
|
||||
import pytest
|
||||
|
||||
from pandas import Series
|
||||
from pandas.core.strings.accessor import StringMethods
|
||||
|
||||
_any_string_method = [
|
||||
("cat", (), {"sep": ","}),
|
||||
("cat", (Series(list("zyx")),), {"sep": ",", "join": "left"}),
|
||||
("center", (10,), {}),
|
||||
("contains", ("a",), {}),
|
||||
("count", ("a",), {}),
|
||||
("decode", ("UTF-8",), {}),
|
||||
("encode", ("UTF-8",), {}),
|
||||
("endswith", ("a",), {}),
|
||||
("endswith", ((),), {}),
|
||||
("endswith", (("a",),), {}),
|
||||
("endswith", (("a", "b"),), {}),
|
||||
("endswith", (("a", "MISSING"),), {}),
|
||||
("endswith", ("a",), {"na": True}),
|
||||
("endswith", ("a",), {"na": False}),
|
||||
("extract", ("([a-z]*)",), {"expand": False}),
|
||||
("extract", ("([a-z]*)",), {"expand": True}),
|
||||
("extractall", ("([a-z]*)",), {}),
|
||||
("find", ("a",), {}),
|
||||
("findall", ("a",), {}),
|
||||
("get", (0,), {}),
|
||||
# because "index" (and "rindex") fail intentionally
|
||||
# if the string is not found, search only for empty string
|
||||
("index", ("",), {}),
|
||||
("join", (",",), {}),
|
||||
("ljust", (10,), {}),
|
||||
("match", ("a",), {}),
|
||||
("fullmatch", ("a",), {}),
|
||||
("normalize", ("NFC",), {}),
|
||||
("pad", (10,), {}),
|
||||
("partition", (" ",), {"expand": False}),
|
||||
("partition", (" ",), {"expand": True}),
|
||||
("repeat", (3,), {}),
|
||||
("replace", ("a", "z"), {}),
|
||||
("rfind", ("a",), {}),
|
||||
("rindex", ("",), {}),
|
||||
("rjust", (10,), {}),
|
||||
("rpartition", (" ",), {"expand": False}),
|
||||
("rpartition", (" ",), {"expand": True}),
|
||||
("slice", (0, 1), {}),
|
||||
("slice_replace", (0, 1, "z"), {}),
|
||||
("split", (" ",), {"expand": False}),
|
||||
("split", (" ",), {"expand": True}),
|
||||
("startswith", ("a",), {}),
|
||||
("startswith", (("a",),), {}),
|
||||
("startswith", (("a", "b"),), {}),
|
||||
("startswith", (("a", "MISSING"),), {}),
|
||||
("startswith", ((),), {}),
|
||||
("startswith", ("a",), {"na": True}),
|
||||
("startswith", ("a",), {"na": False}),
|
||||
("removeprefix", ("a",), {}),
|
||||
("removesuffix", ("a",), {}),
|
||||
# translating unicode points of "a" to "d"
|
||||
("translate", ({97: 100},), {}),
|
||||
("wrap", (2,), {}),
|
||||
("zfill", (10,), {}),
|
||||
] + list(
|
||||
zip(
|
||||
[
|
||||
# methods without positional arguments: zip with empty tuple and empty dict
|
||||
"capitalize",
|
||||
"cat",
|
||||
"get_dummies",
|
||||
"isalnum",
|
||||
"isalpha",
|
||||
"isdecimal",
|
||||
"isdigit",
|
||||
"islower",
|
||||
"isnumeric",
|
||||
"isspace",
|
||||
"istitle",
|
||||
"isupper",
|
||||
"len",
|
||||
"lower",
|
||||
"lstrip",
|
||||
"partition",
|
||||
"rpartition",
|
||||
"rsplit",
|
||||
"rstrip",
|
||||
"slice",
|
||||
"slice_replace",
|
||||
"split",
|
||||
"strip",
|
||||
"swapcase",
|
||||
"title",
|
||||
"upper",
|
||||
"casefold",
|
||||
],
|
||||
[()] * 100,
|
||||
[{}] * 100,
|
||||
)
|
||||
)
|
||||
# use method name as fixture-id
ids = tuple(method_name for method_name, _, _ in _any_string_method)
missing_methods = {
    attr
    for attr in dir(StringMethods)
    if not attr.startswith("_") and attr not in ids
}

# test that the above list captures all methods of StringMethods
assert not missing_methods
|
||||
|
||||
|
||||
@pytest.fixture(params=_any_string_method, ids=ids)
def any_string_method(request):
    """
    Fixture for all public methods of `StringMethods`

    This fixture returns a tuple of the method name and sample arguments
    necessary to call the method.  It is parametrized over the entries of
    ``_any_string_method`` and uses the method names as test ids.

    Returns
    -------
    method_name : str
        The name of the method in `StringMethods`
    args : tuple
        Sample values for the positional arguments
    kwargs : dict
        Sample values for the keyword arguments

    Examples
    --------
    >>> def test_something(any_string_method):
    ...     s = Series(['a', 'b', np.nan, 'd'])
    ...
    ...     method_name, args, kwargs = any_string_method
    ...     method = getattr(s.str, method_name)
    ...     # will not raise
    ...     method(*args, **kwargs)
    """
    # the parametrized entry is handed through unchanged
    return request.param
|
@ -0,0 +1,198 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
CategoricalDtype,
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
_testing as tm,
|
||||
option_context,
|
||||
)
|
||||
from pandas.core.strings.accessor import StringMethods
|
||||
|
||||
# subset of the full set from pandas/conftest.py
|
||||
_any_allowed_skipna_inferred_dtype = [
|
||||
("string", ["a", np.nan, "c"]),
|
||||
("bytes", [b"a", np.nan, b"c"]),
|
||||
("empty", [np.nan, np.nan, np.nan]),
|
||||
("empty", []),
|
||||
("mixed-integer", ["a", np.nan, 2]),
|
||||
]
|
||||
ids, _ = zip(*_any_allowed_skipna_inferred_dtype) # use inferred type as id
|
||||
|
||||
|
||||
@pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids)
def any_allowed_skipna_inferred_dtype(request):
    """
    Fixture for the (inferred) dtypes listed in
    ``_any_allowed_skipna_inferred_dtype``

    The covered (inferred) types are:

    * 'string'
    * 'bytes'
    * 'empty'
    * 'mixed-integer'

    Returns
    -------
    inferred_dtype : str
        The inferred dtype name of the parametrized entry
    values : np.ndarray
        The entry's sample values as an array of object dtype

    Examples
    --------
    >>> from pandas._libs import lib
    >>>
    >>> def test_something(any_allowed_skipna_inferred_dtype):
    ...     inferred_dtype, values = any_allowed_skipna_inferred_dtype
    ...     # will pass
    ...     assert lib.infer_dtype(values, skipna=True) == inferred_dtype
    ...
    ...     # constructor for .str-accessor will also pass
    ...     Series(values).str
    """
    dtype_name, raw_values = request.param

    # correctness of inference tested in tests/dtypes/test_inference.py
    # object dtype to avoid casting
    return dtype_name, np.array(raw_values, dtype=object)
|
||||
|
||||
|
||||
def test_api(any_string_dtype):
    """``.str`` is ``StringMethods`` on the class and yields one on an instance."""
    # GH 6106, GH 9322
    assert Series.str is StringMethods
    accessor = Series([""], dtype=any_string_dtype).str
    assert isinstance(accessor, StringMethods)
|
||||
|
||||
|
||||
def test_api_mi_raises():
    """Accessing ``.str`` on a MultiIndex raises and ``hasattr`` reports False."""
    # GH 23679
    multi = MultiIndex.from_arrays([["a", "b", "c"]])
    with pytest.raises(
        AttributeError,
        match="Can only use .str accessor with Index, not MultiIndex",
    ):
        multi.str
    assert not hasattr(multi, "str")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [object, "category"])
def test_api_per_dtype(index_or_series, dtype, any_skipna_inferred_dtype):
    """
    The ``.str`` accessor is available exactly for the string-like inferred
    dtypes and raises ``AttributeError`` for every other one.
    """
    # one instance of parametrized fixture
    inferred_dtype, values = any_skipna_inferred_dtype

    # explicit dtype to avoid casting
    obj = index_or_series(values, dtype=dtype)

    accessor_allowed = inferred_dtype in (
        "string",
        "unicode",
        "empty",
        "bytes",
        "mixed",
        "mixed-integer",
    )
    if not accessor_allowed:
        # GH 9184, GH 23011, GH 23163
        with pytest.raises(
            AttributeError, match="Can only use .str accessor with string values.*"
        ):
            obj.str
        assert not hasattr(obj, "str")
        return

    # GH 6106
    assert isinstance(obj.str, StringMethods)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [object, "category"])
def test_api_per_method(
    index_or_series,
    dtype,
    any_allowed_skipna_inferred_dtype,
    any_string_method,
    request,
):
    """
    Check which ``.str`` methods accept which inferred dtypes.

    This test does not check correctness of the different methods, just that
    the methods work on the specified (inferred) dtypes and raise on all others.
    """
    box = index_or_series

    # one instance of each parametrized fixture
    inferred_dtype, values = any_allowed_skipna_inferred_dtype
    method_name, args, kwargs = any_string_method

    # (exception type, reason) for combinations that are expected to fail
    xfail = None
    if box is Index:
        if values.size == 0:
            if method_name in ["partition", "rpartition"] and kwargs.get(
                "expand", True
            ):
                xfail = (TypeError, "Method cannot deal with empty Index")
            elif method_name == "split" and kwargs.get("expand", None):
                xfail = (TypeError, "Split fails on empty Series when expand=True")
            elif method_name == "get_dummies":
                xfail = (ValueError, "Need to fortify get_dummies corner cases")
        elif (
            inferred_dtype == "empty"
            and dtype == object
            and method_name == "get_dummies"
        ):
            xfail = (ValueError, "Need to fortify get_dummies corner cases")

    if xfail is not None:
        raises, reason = xfail
        request.applymarker(pytest.mark.xfail(raises=raises, reason=reason))

    # explicit dtype to avoid casting
    obj = box(values, dtype=dtype)
    method = getattr(obj.str, method_name)

    allowed_types = ["string", "unicode", "empty"]
    if method_name in ["decode", "get", "len", "slice"]:
        allowed_types.append("bytes")
    # as of v0.23.4, all methods except 'cat' are very lenient with the
    # allowed data types, just returning NaN for entries that error.
    # This could be changed with an 'errors'-kwarg to the `str`-accessor,
    # see discussion in GH 13877
    if method_name not in ["cat"]:
        allowed_types.extend(["mixed", "mixed-integer"])

    if inferred_dtype not in allowed_types:
        # GH 23011, GH 23163
        msg = (
            f"Cannot use .str.{method_name} with values of "
            f"inferred dtype {inferred_dtype!r}."
        )
        with pytest.raises(TypeError, match=msg):
            method(*args, **kwargs)
        return

    # xref GH 23555, GH 23556
    with option_context("future.no_silent_downcasting", True):
        method(*args, **kwargs)  # works!
|
||||
|
||||
|
||||
def test_api_for_categorical(any_string_method, any_string_dtype):
    """
    Calling a ``.str`` method on a categorical Series gives the same result as
    calling it on the equivalent object-dtype Series.
    """
    # https://github.com/pandas-dev/pandas/issues/10661
    method_name, args, kwargs = any_string_method

    strings = Series(list("aabb"), dtype=any_string_dtype)
    strings = strings + " " + strings
    categorical = strings.astype("category")
    categorical = categorical.astype(
        CategoricalDtype(categorical.dtype.categories.astype("object"))
    )
    assert isinstance(categorical.str, StringMethods)

    result = getattr(categorical.str, method_name)(*args, **kwargs)
    expected = getattr(strings.astype("object").str, method_name)(*args, **kwargs)

    if isinstance(result, DataFrame):
        compare = tm.assert_frame_equal
    elif isinstance(result, Series):
        compare = tm.assert_series_equal
    else:
        # str.cat(others=None) returns string, for example
        assert result == expected
        return
    compare(result, expected)
|
@ -0,0 +1,427 @@
|
||||
from datetime import datetime
|
||||
import operator
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
Series,
|
||||
_testing as tm,
|
||||
)
|
||||
|
||||
|
||||
def test_title(any_string_dtype):
    """``str.title`` title-cases each string and keeps missing values missing."""
    raw = ["FOO", "BAR", np.nan, "Blah", "blurg"]
    titled = ["Foo", "Bar", np.nan, "Blah", "Blurg"]

    result = Series(raw, dtype=any_string_dtype).str.title()
    tm.assert_series_equal(result, Series(titled, dtype=any_string_dtype))
|
||||
|
||||
|
||||
def test_title_mixed_object():
    """``str.title`` on mixed objects: strings are title-cased, the rest is NaN/None."""
    mixed = Series(
        ["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0]
    )
    titled = ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan]

    result = mixed.str.title()
    tm.assert_almost_equal(result, Series(titled, dtype=object))
|
||||
|
||||
|
||||
def test_lower_upper(any_string_dtype):
    """``str.upper`` upper-cases, and ``str.lower`` on that result restores the input."""
    lowered = Series(["om", np.nan, "nom", "nom"], dtype=any_string_dtype)
    uppered = Series(["OM", np.nan, "NOM", "NOM"], dtype=any_string_dtype)

    result = lowered.str.upper()
    tm.assert_series_equal(result, uppered)

    # round-trip back to the original input
    tm.assert_series_equal(result.str.lower(), lowered)
|
||||
|
||||
|
||||
def test_lower_upper_mixed_object():
    """``str.upper`` / ``str.lower`` on mixed objects only transform the strings."""
    mixed = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0])

    cases = (
        ("upper", ["A", np.nan, "B", np.nan, np.nan, "FOO", None, np.nan, np.nan]),
        ("lower", ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan]),
    )
    for method_name, values in cases:
        result = getattr(mixed.str, method_name)()
        tm.assert_series_equal(result, Series(values, dtype=object))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data, expected",
    [
        (
            ["FOO", "BAR", np.nan, "Blah", "blurg"],
            ["Foo", "Bar", np.nan, "Blah", "Blurg"],
        ),
        (["a", "b", "c"], ["A", "B", "C"]),
        (["a b", "a bc. de"], ["A b", "A bc. de"]),
    ],
)
def test_capitalize(data, expected, any_string_dtype):
    """``str.capitalize`` matches the expected values for each parametrized case."""
    capitalized = Series(data, dtype=any_string_dtype).str.capitalize()
    tm.assert_series_equal(capitalized, Series(expected, dtype=any_string_dtype))
|
||||
|
||||
|
||||
def test_capitalize_mixed_object():
    """``str.capitalize`` on mixed objects only transforms the strings."""
    mixed = Series(
        ["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0]
    )
    capitalized = ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan]

    result = mixed.str.capitalize()
    tm.assert_series_equal(result, Series(capitalized, dtype=object))
|
||||
|
||||
|
||||
def test_swapcase(any_string_dtype):
    """``str.swapcase`` flips the case of every character, keeping NaN as is."""
    raw = ["FOO", "BAR", np.nan, "Blah", "blurg"]
    swapped = ["foo", "bar", np.nan, "bLAH", "BLURG"]

    result = Series(raw, dtype=any_string_dtype).str.swapcase()
    tm.assert_series_equal(result, Series(swapped, dtype=any_string_dtype))
|
||||
|
||||
|
||||
def test_swapcase_mixed_object():
    """``str.swapcase`` on mixed objects only transforms the strings."""
    mixed = Series(
        ["FOO", np.nan, "bar", True, datetime.today(), "Blah", None, 1, 2.0]
    )
    swapped = ["foo", np.nan, "BAR", np.nan, np.nan, "bLAH", None, np.nan, np.nan]

    result = mixed.str.swapcase()
    tm.assert_series_equal(result, Series(swapped, dtype=object))
|
||||
|
||||
|
||||
def test_casefold():
    """``str.casefold`` folds "ß" to "ss" and keeps NaN as is."""
    # GH25405
    folded = Series(["ss", np.nan, "case", "ssd"])
    result = Series(["ß", np.nan, "case", "ßd"]).str.casefold()

    tm.assert_series_equal(result, folded)
|
||||
|
||||
|
||||
def test_casemethods(any_string_dtype):
    """Each case method of ``.str`` agrees element-wise with the builtin ``str`` method."""
    values = ["aaa", "bbb", "CCC", "Dddd", "eEEE"]
    ser = Series(values, dtype=any_string_dtype)

    for method_name in ("lower", "upper", "title", "capitalize", "swapcase"):
        via_accessor = getattr(ser.str, method_name)().tolist()
        via_builtin = [getattr(value, method_name)() for value in values]
        assert via_accessor == via_builtin
|
||||
|
||||
|
||||
def test_pad(any_string_dtype):
    """
    ``str.pad(5, side=...)`` pads short strings with spaces up to width 5.

    Strings already longer than the width ("eeeeee") are left unchanged and
    missing values stay missing.  The expected strings spell out the full
    padding so that every padded entry is exactly five characters wide.
    """
    s = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"], dtype=any_string_dtype)

    # padding goes in front of the text
    result = s.str.pad(5, side="left")
    expected = Series(
        ["    a", "    b", np.nan, "    c", np.nan, "eeeeee"], dtype=any_string_dtype
    )
    tm.assert_series_equal(result, expected)

    # padding goes after the text
    result = s.str.pad(5, side="right")
    expected = Series(
        ["a    ", "b    ", np.nan, "c    ", np.nan, "eeeeee"], dtype=any_string_dtype
    )
    tm.assert_series_equal(result, expected)

    # padding is split around the text
    result = s.str.pad(5, side="both")
    expected = Series(
        ["  a  ", "  b  ", np.nan, "  c  ", np.nan, "eeeeee"], dtype=any_string_dtype
    )
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_pad_mixed_object():
    """
    ``str.pad(5, side=...)`` on mixed objects pads only the strings.

    Non-string entries come back as NaN (``None`` stays ``None``).  The
    expected strings spell out the full padding so every padded entry is
    exactly five characters wide.
    """
    s = Series(["a", np.nan, "b", True, datetime.today(), "ee", None, 1, 2.0])

    result = s.str.pad(5, side="left")
    expected = Series(
        ["    a", np.nan, "    b", np.nan, np.nan, "   ee", None, np.nan, np.nan],
        dtype=object,
    )
    tm.assert_series_equal(result, expected)

    result = s.str.pad(5, side="right")
    expected = Series(
        ["a    ", np.nan, "b    ", np.nan, np.nan, "ee   ", None, np.nan, np.nan],
        dtype=object,
    )
    tm.assert_series_equal(result, expected)

    # for "ee" the odd leftover space goes to the left, matching "ee".center(5)
    result = s.str.pad(5, side="both")
    expected = Series(
        ["  a  ", np.nan, "  b  ", np.nan, np.nan, "  ee ", None, np.nan, np.nan],
        dtype=object,
    )
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_pad_fillchar(any_string_dtype):
    """``str.pad`` pads with the given ``fillchar`` on the left, right or both sides."""
    ser = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"], dtype=any_string_dtype)

    padded_by_side = {
        "left": ["XXXXa", "XXXXb", np.nan, "XXXXc", np.nan, "eeeeee"],
        "right": ["aXXXX", "bXXXX", np.nan, "cXXXX", np.nan, "eeeeee"],
        "both": ["XXaXX", "XXbXX", np.nan, "XXcXX", np.nan, "eeeeee"],
    }
    for side, padded in padded_by_side.items():
        result = ser.str.pad(5, side=side, fillchar="X")
        tm.assert_series_equal(result, Series(padded, dtype=any_string_dtype))
|
||||
|
||||
|
||||
def test_pad_fillchar_bad_arg_raises(any_string_dtype):
    """``str.pad`` raises ``TypeError`` for a multi-character or non-string fillchar."""
    ser = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"], dtype=any_string_dtype)

    bad_fillchars = (
        ("XY", "fillchar must be a character, not str"),
        (5, "fillchar must be a character, not int"),
    )
    for fillchar, msg in bad_fillchars:
        with pytest.raises(TypeError, match=msg):
            ser.str.pad(5, fillchar=fillchar)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method_name", ["center", "ljust", "rjust", "zfill", "pad"])
def test_pad_width_bad_arg_raises(method_name, any_string_dtype):
    """Every padding method rejects a non-integer ``width`` with ``TypeError``."""
    # see gh-13598
    ser = Series(["1", "22", "a", "bb"], dtype=any_string_dtype)

    with pytest.raises(TypeError, match="width must be of integer type, not str"):
        getattr(ser.str, method_name)("f")
|
||||
|
||||
|
||||
def test_center_ljust_rjust(any_string_dtype):
    """
    ``str.center`` / ``str.ljust`` / ``str.rjust`` pad with spaces up to width 5.

    Strings already longer than the width ("eeeeee") are left unchanged and
    missing values stay missing.  The expected strings spell out the full
    padding so that every padded entry is exactly five characters wide.
    """
    s = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"], dtype=any_string_dtype)

    result = s.str.center(5)
    expected = Series(
        ["  a  ", "  b  ", np.nan, "  c  ", np.nan, "eeeeee"], dtype=any_string_dtype
    )
    tm.assert_series_equal(result, expected)

    result = s.str.ljust(5)
    expected = Series(
        ["a    ", "b    ", np.nan, "c    ", np.nan, "eeeeee"], dtype=any_string_dtype
    )
    tm.assert_series_equal(result, expected)

    result = s.str.rjust(5)
    expected = Series(
        ["    a", "    b", np.nan, "    c", np.nan, "eeeeee"], dtype=any_string_dtype
    )
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_center_ljust_rjust_mixed_object():
    """
    ``str.center`` / ``str.ljust`` / ``str.rjust`` on mixed objects pad only
    the strings.

    Non-string entries come back as NaN (``None`` stays ``None``).  The
    expected strings spell out the full padding so every padded entry is
    exactly five characters wide.
    """
    s = Series(["a", np.nan, "b", True, datetime.today(), "c", "eee", None, 1, 2.0])

    result = s.str.center(5)
    expected = Series(
        [
            "  a  ",
            np.nan,
            "  b  ",
            np.nan,
            np.nan,
            "  c  ",
            " eee ",
            None,
            np.nan,
            np.nan,
        ],
        dtype=object,
    )
    tm.assert_series_equal(result, expected)

    result = s.str.ljust(5)
    expected = Series(
        [
            "a    ",
            np.nan,
            "b    ",
            np.nan,
            np.nan,
            "c    ",
            "eee  ",
            None,
            np.nan,
            np.nan,
        ],
        dtype=object,
    )
    tm.assert_series_equal(result, expected)

    result = s.str.rjust(5)
    expected = Series(
        [
            "    a",
            np.nan,
            "    b",
            np.nan,
            np.nan,
            "    c",
            "  eee",
            None,
            np.nan,
            np.nan,
        ],
        dtype=object,
    )
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_center_ljust_rjust_fillchar(any_string_dtype):
    """
    ``str.center`` / ``str.ljust`` / ``str.rjust`` with ``fillchar="X"`` match
    both the spelled-out expectations and the builtin ``str`` methods.
    """
    if any_string_dtype == "string[pyarrow_numpy]":
        pytest.skip(
            "Arrow logic is different, "
            "see https://github.com/pandas-dev/pandas/pull/54533/files#r1299808126",
        )
    ser = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype)

    filled_by_method = (
        ("center", ["XXaXX", "XXbbX", "Xcccc", "ddddd", "eeeeee"]),
        ("ljust", ["aXXXX", "bbXXX", "ccccX", "ddddd", "eeeeee"]),
        ("rjust", ["XXXXa", "XXXbb", "Xcccc", "ddddd", "eeeeee"]),
    )
    for method_name, filled in filled_by_method:
        result = getattr(ser.str, method_name)(5, fillchar="X")
        tm.assert_series_equal(result, Series(filled, dtype=any_string_dtype))

        # cross-check against the builtin str method of the same name
        builtin = np.array(
            [getattr(value, method_name)(5, "X") for value in np.array(ser)],
            dtype=np.object_,
        )
        tm.assert_numpy_array_equal(np.array(result, dtype=np.object_), builtin)
|
||||
|
||||
|
||||
def test_center_ljust_rjust_fillchar_bad_arg_raises(any_string_dtype):
    """``center`` / ``ljust`` / ``rjust`` raise ``TypeError`` for a bad fillchar."""
    ser = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype)

    # If fillchar is not a character, normal str raises TypeError
    # 'aaa'.ljust(5, 'XY')
    # TypeError: must be char, not str
    template = "fillchar must be a character, not {dtype}"

    for fillchar, type_name in (("XY", "str"), (1, "int")):
        msg = template.format(dtype=type_name)
        for method_name in ("center", "ljust", "rjust"):
            with pytest.raises(TypeError, match=msg):
                getattr(ser.str, method_name)(5, fillchar=fillchar)
|
||||
|
||||
|
||||
def test_zfill(any_string_dtype):
    """``str.zfill`` matches the spelled-out values and the builtin ``str.zfill``."""
    ser = Series(["1", "22", "aaa", "333", "45678"], dtype=any_string_dtype)

    filled_by_width = (
        (5, ["00001", "00022", "00aaa", "00333", "45678"]),
        (3, ["001", "022", "aaa", "333", "45678"]),
    )
    for width, filled in filled_by_width:
        result = ser.str.zfill(width)
        tm.assert_series_equal(result, Series(filled, dtype=any_string_dtype))

        # cross-check against the builtin str.zfill
        builtin = np.array(
            [value.zfill(width) for value in np.array(ser)], dtype=np.object_
        )
        tm.assert_numpy_array_equal(np.array(result, dtype=np.object_), builtin)

    # missing values stay missing
    with_na = Series(["1", np.nan, "aaa", np.nan, "45678"], dtype=any_string_dtype)
    expected = Series(
        ["00001", np.nan, "00aaa", np.nan, "45678"], dtype=any_string_dtype
    )
    tm.assert_series_equal(with_na.str.zfill(5), expected)
|
||||
|
||||
|
||||
def test_wrap(any_string_dtype):
    """``str.wrap(12, break_long_words=True)`` wraps each sample as expected."""
    # (input, wrapped) pairs; the inputs are: two words less than width, two
    # words equal to width, two words greater than width, one word less than
    # width, one word equal to width, one word greater than width, multiple
    # tokens with trailing whitespace equal to width
    cases = [
        ("hello world", "hello world"),
        ("hello world!", "hello world!"),
        ("hello world!!", "hello\nworld!!"),
        ("abcdefabcde", "abcdefabcde"),
        ("abcdefabcdef", "abcdefabcdef"),
        ("abcdefabcdefa", "abcdefabcdef\na"),
        ("ab ab ab ab ", "ab ab ab ab"),
        ("ab ab ab ab a", "ab ab ab ab\na"),
        ("\t", ""),
    ]
    raw = [text for text, _ in cases]
    wrapped = [text for _, text in cases]

    result = Series(raw, dtype=any_string_dtype).str.wrap(12, break_long_words=True)
    tm.assert_series_equal(result, Series(wrapped, dtype=any_string_dtype))
|
||||
|
||||
|
||||
def test_wrap_unicode(any_string_dtype):
    """``str.wrap(6)`` handles surrounding whitespace, NaN and non-ascii text."""
    # test with pre and post whitespace (non-unicode), NaN, and non-ascii Unicode
    raw = [" pre ", np.nan, "\xac\u20ac\U00008000 abadcafe"]
    wrapped = [" pre", np.nan, "\xac\u20ac\U00008000 ab\nadcafe"]

    result = Series(raw, dtype=any_string_dtype).str.wrap(6)
    tm.assert_series_equal(result, Series(wrapped, dtype=any_string_dtype))
|
@ -0,0 +1,427 @@
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
_testing as tm,
|
||||
concat,
|
||||
option_context,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("other", [None, Series, Index])
def test_str_cat_name(index_or_series, other):
    """``str.cat`` keeps the caller's name whatever container ``others`` is."""
    # GH 21053
    values = ["a", "b"]
    # wrap the values in the parametrized container, or pass the plain list
    others = other(values) if other else values

    caller = index_or_series(values, name="name")
    assert caller.str.cat(others, sep=",").name == "name"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
)
def test_str_cat(index_or_series, infer_string):
    """
    ``str.cat`` from a Series/Index: joining the caller alone, joining with an
    ndarray or list, and raising on ``others`` of the wrong length.
    """
    box = index_or_series
    with option_context("future.infer_string", infer_string):
        # test_cat above tests "str_cat" from ndarray;
        # here testing "str.cat" from Series/Index to ndarray/list
        caller = box(["a", "a", "b", "b", "c", np.nan])

        # single array
        single_cases = (
            ({}, "aabbc"),
            ({"na_rep": "-"}, "aabbc-"),
            ({"sep": "_", "na_rep": "NA"}, "a_a_b_b_c_NA"),
        )
        for cat_kwargs, joined in single_cases:
            assert caller.str.cat(**cat_kwargs) == joined

        arr = np.array(["a", np.nan, "b", "d", "foo", np.nan], dtype=object)
        expected = box(["aa", "a-", "bb", "bd", "cfoo", "--"])

        # Series/Index with array, then Series/Index with list
        for others in (arr, list(arr)):
            result = caller.str.cat(others, na_rep="-")
            tm.assert_equal(result, expected)

        # errors for incorrect lengths
        rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
        too_short = Series(["1", "2", "3"])

        with pytest.raises(ValueError, match=rgx):
            caller.str.cat(too_short.values)

        with pytest.raises(ValueError, match=rgx):
            caller.str.cat(list(too_short))
|
||||
|
||||
|
||||
def test_str_cat_raises_intuitive_error(index_or_series):
    """Passing a bare string as ``others`` raises a hint about the ``sep`` keyword."""
    # GH 11334
    caller = index_or_series(["a", "b", "c", "d"])
    message = "Did you mean to supply a `sep` keyword?"

    for separator in ("|", " "):
        with pytest.raises(ValueError, match=message):
            caller.str.cat(separator)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
)
@pytest.mark.parametrize("sep", ["", None])
@pytest.mark.parametrize("dtype_target", ["object", "category"])
@pytest.mark.parametrize("dtype_caller", ["object", "category"])
def test_str_cat_categorical(
    index_or_series, dtype_caller, dtype_target, sep, infer_string
):
    """
    ``str.cat`` between object/categorical callers and targets, with aligned
    and unaligned indexes.
    """
    box = index_or_series
    as_index = box == Index

    with option_context("future.infer_string", infer_string):
        caller = Index(["a", "a", "b", "a"], dtype=dtype_caller)
        if not as_index:
            caller = Series(caller, index=caller, dtype=caller.dtype)
        target = Index(["b", "a", "b", "c"], dtype=dtype_target)

        expected = Index(
            ["ab", "aa", "bb", "ac"], dtype=object if dtype_caller == "object" else None
        )
        if not as_index:
            expected = Series(
                expected, index=Index(caller, dtype=dtype_caller), dtype=expected.dtype
            )

        # Series/Index with unaligned Index -> t.values
        result = caller.str.cat(target.values, sep=sep)
        tm.assert_equal(result, expected)

        # Series/Index with Series having matching Index
        target = Series(target.values, index=Index(caller, dtype=dtype_caller))
        result = caller.str.cat(target, sep=sep)
        tm.assert_equal(result, expected)

        # Series/Index with Series.values
        result = caller.str.cat(target.values, sep=sep)
        tm.assert_equal(result, expected)

        # Series/Index with Series having different Index
        target = Series(target.values, index=target.values)
        expected = Index(
            ["aa", "aa", "bb", "bb", "aa"],
            dtype=object if dtype_caller == "object" else None,
        )
        if dtype_caller == "object":
            dtype = object
        else:
            dtype = caller.dtype.categories.dtype
        if not as_index:
            expected = Series(
                expected,
                index=Index(expected.str[:1], dtype=dtype),
                dtype=expected.dtype,
            )

        result = caller.str.cat(target, sep=sep)
        tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data",
    [[1, 2, 3], [0.1, 0.2, 0.3], [1, 2, "b"]],
    ids=["integers", "floats", "mixed"],
)
# without dtype=object, np.array would cast [1, 2, 'b'] to ['1', '2', 'b']
@pytest.mark.parametrize(
    "box",
    [Series, Index, list, lambda x: np.array(x, dtype=object)],
    ids=["Series", "Index", "list", "np.array"],
)
def test_str_cat_wrong_dtype_raises(box, data):
    """``str.cat`` raises ``TypeError`` when ``others`` holds non-string values."""
    # GH 22722
    caller = Series(["a", "b", "c"])
    others = box(data)

    with pytest.raises(
        TypeError,
        match="Concatenation requires list-likes containing only strings.*",
    ):
        # need to use outer and na_rep, as otherwise Index would not raise
        caller.str.cat(others, join="outer", na_rep="-")
|
||||
|
||||
|
||||
def test_str_cat_mixed_inputs(index_or_series):
    # Exercise str.cat with DataFrames, 2-D arrays and (mixed) lists as
    # ``others``: first aligned with the caller, then with re-ordered
    # indexes, and finally the error paths for wrong lengths / wrong types.
    box = index_or_series
    caller = Index(["a", "b", "c", "d"])
    if box != Index:
        caller = Series(caller, index=caller)

    def as_expected(raw):
        # an Index caller yields an Index; a Series caller is labelled by
        # the caller's own values
        return raw if box == Index else Series(raw.values, index=caller.values)

    upper = Series(["A", "B", "C", "D"], index=caller.values)
    frame = concat([upper, Series(caller, index=caller)], axis=1)

    expected = as_expected(Index(["aAa", "bBb", "cCc", "dDd"]))

    aligned_others = (
        frame,  # Series/Index with DataFrame
        frame.values,  # Series/Index with two-dimensional ndarray
        [upper, caller],  # Series/Index with list of Series
        [upper, caller.values],  # Series/Index with mixed list of Series/array
    )
    for others in aligned_others:
        tm.assert_equal(caller.str.cat(others), expected)

    # Series/Index with list of Series / mixed list; different indexes
    upper.index = ["b", "c", "d", "a"]
    expected = as_expected(box(["aDa", "bAb", "cBc", "dCd"]))
    for others in ([upper, caller], [upper, caller.values]):
        tm.assert_equal(caller.str.cat(others), expected)

    # Series/Index with DataFrame; different indexes
    frame.index = ["b", "c", "d", "a"]
    expected = as_expected(box(["aDd", "bAa", "cBb", "dCc"]))
    tm.assert_equal(caller.str.cat(frame), expected)

    # errors for incorrect lengths
    length_msg = r"If `others` contains arrays or lists \(or other list-likes.*"
    short = Series(["1", "2", "3"])
    short_frame = concat([short, short], axis=1)
    wrong_length = (
        short_frame.values,  # two-dimensional ndarray
        [short.values, caller.values],  # list of list-likes
        [short.values, caller],  # mixed list of Series/list-like
    )
    for others in wrong_length:
        with pytest.raises(ValueError, match=length_msg):
            caller.str.cat(others)

    # errors for incorrect arguments in list-like
    type_msg = "others must be Series, Index, DataFrame,.*"
    # make sure None/NaN do not crash checks in _get_series_list
    with_missing = Series(["a", np.nan, "c", None])
    unsupported = (
        [with_missing, "u"],  # mix of string and Series
        [with_missing, frame],  # DataFrame in list
        [with_missing, frame.values],  # 2-dim ndarray in list
        [with_missing, [with_missing, frame]],  # nested lists
        set(with_missing),  # forbidden input type: set (GH 23009)
        [with_missing, set(with_missing)],  # set in list (GH 23009)
        1,  # other forbidden input type, e.g. int
        iter([upper.values, list(caller)]),  # nested list-likes
    )
    for others in unsupported:
        with pytest.raises(TypeError, match=type_msg):
            caller.str.cat(others)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("join", ["left", "outer", "inner", "right"])
def test_str_cat_align_indexed(index_or_series, join):
    # https://github.com/pandas-dev/pandas/issues/18657
    # str.cat(..., join=join) must give the same result as aligning the two
    # inputs by hand first and concatenating afterwards.
    box = index_or_series

    s = Series(["a", "b", "c", "d"], index=["a", "b", "c", "d"])
    t = Series(["D", "A", "E", "B"], index=["d", "a", "e", "b"])
    sa, ta = s.align(t, join=join)
    # result after manual alignment of inputs
    expected = sa.str.cat(ta, na_rep="-")

    if box == Index:
        # only the caller and the expected value are re-boxed; the manually
        # aligned ``sa`` is not used past this point, so it is left alone
        s = Index(s)
        expected = Index(expected)

    result = s.str.cat(t, join=join, na_rep="-")
    tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("join", ["left", "outer", "inner", "right"])
def test_str_cat_align_mixed_inputs(join):
    # Alignment of str.cat when ``others`` is a list of Series, a DataFrame,
    # or a list mixing indexed and unindexed objects.
    caller = Series(["a", "b", "c", "d"])
    other = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1])
    frame = concat([other, other], axis=1)

    outer_result = Series(["aaa", "bbb", "c--", "ddd", "-ee"])
    expected = outer_result.loc[caller.index.join(other.index, how=join)]

    # list of Series, then DataFrame
    for others in ([other, other], frame):
        result = caller.str.cat(others, join=join, na_rep="-")
        tm.assert_series_equal(result, expected)

    # mixed list of indexed/unindexed
    unindexed = np.array(["A", "B", "C", "D"])
    outer_result = Series(["aaA", "bbB", "c-C", "ddD", "-e-"])
    # joint index of rhs [other, unindexed]; the unindexed array will be
    # forced to have the index of the caller
    if join == "inner":
        rhs_idx = other.index.intersection(caller.index)
    elif join == "outer":
        rhs_idx = other.index.union(caller.index)
    else:
        rhs_idx = other.index.append(caller.index.difference(other.index))

    expected = outer_result.loc[caller.index.join(rhs_idx, how=join)]
    result = caller.str.cat([other, unindexed], join=join, na_rep="-")
    tm.assert_series_equal(result, expected)

    with pytest.raises(TypeError, match="others must be Series,.*"):
        # nested lists are forbidden
        caller.str.cat([other, list(unindexed)], join=join)

    # errors for incorrect lengths
    length_msg = r"If `others` contains arrays or lists \(or other list-likes.*"
    too_short = Series(["1", "2", "3"]).values

    # unindexed object of wrong length
    with pytest.raises(ValueError, match=length_msg):
        caller.str.cat(too_short, join=join)

    # unindexed object of wrong length in list
    with pytest.raises(ValueError, match=length_msg):
        caller.str.cat([other, too_short], join=join)
|
||||
|
||||
|
||||
def test_str_cat_all_na(index_or_series, index_or_series2):
|
||||
# GH 24044
|
||||
box = index_or_series
|
||||
other = index_or_series2
|
||||
|
||||
# check that all NaNs in caller / target work
|
||||
s = Index(["a", "b", "c", "d"])
|
||||
s = s if box == Index else Series(s, index=s)
|
||||
t = other([np.nan] * 4, dtype=object)
|
||||
# add index of s for alignment
|
||||
t = t if other == Index else Series(t, index=s)
|
||||
|
||||
# all-NA target
|
||||
if box == Series:
|
||||
expected = Series([np.nan] * 4, index=s.index, dtype=s.dtype)
|
||||
else: # box == Index
|
||||
# TODO: Strimg option, this should return string dtype
|
||||
expected = Index([np.nan] * 4, dtype=object)
|
||||
result = s.str.cat(t, join="left")
|
||||
tm.assert_equal(result, expected)
|
||||
|
||||
# all-NA caller (only for Series)
|
||||
if other == Series:
|
||||
expected = Series([np.nan] * 4, dtype=object, index=t.index)
|
||||
result = t.str.cat(s, join="left")
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_str_cat_special_cases():
    # Two corner cases of str.cat: an iterator as ``others`` and a right
    # join against others that carry different indexes.
    caller = Series(["a", "b", "c", "d"])
    other = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1])

    # iterator of elements with different types
    mixed_iter = iter([other, caller.values])
    result = caller.str.cat(mixed_iter, join="outer", na_rep="-")
    tm.assert_series_equal(result, Series(["aaa", "bbb", "c-c", "ddd", "-e-"]))

    # right-align with different indexes in others
    pieces = [other.loc[[0]], other.loc[[3]]]
    result = caller.str.cat(pieces, join="right", na_rep="-")
    tm.assert_series_equal(result, Series(["aa-", "d-d"], index=[0, 3]))
|
||||
|
||||
|
||||
def test_cat_on_filtered_index():
    # str.cat on columns of a frame whose rows were filtered, so the
    # remaining index labels are no longer contiguous.
    year_month = MultiIndex.from_product(
        [[2011, 2012], [1, 2, 3]], names=["year", "month"]
    )
    frame = DataFrame(index=year_month).reset_index()
    frame = frame[frame.month > 1]

    years = frame.year.astype("str")
    months = frame.month.astype("str")

    joined = years.str.cat(months, sep=" ")
    assert joined.loc[1] == "2011 2"

    joined_twice = years.str.cat([months, months], sep=" ")
    assert joined_twice.loc[1] == "2011 2 2"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("klass", [tuple, list, np.array, Series, Index])
def test_cat_different_classes(klass):
    # https://github.com/pandas-dev/pandas/issues/33425
    # the container type of ``others`` must not change the result
    caller = Series(["a", "b", "c"])
    others = klass(["x", "y", "z"])
    tm.assert_series_equal(caller.str.cat(others), Series(["ax", "by", "cz"]))
|
||||
|
||||
|
||||
def test_cat_on_series_dot_str():
    # GH 28277
    # passing the ``.str`` accessor itself as ``others`` must raise
    ser = Series(["AbC", "de", "FGHI", "j", "kLLLm"])

    expected_text = (
        "others must be Series, Index, DataFrame, np.ndarray "
        "or list-like (either containing only strings or "
        "containing only objects of type Series/Index/"
        "np.ndarray[1-dim])"
    )
    with pytest.raises(TypeError, match=re.escape(expected_text)):
        ser.str.cat(others=ser.str)
|
@ -0,0 +1,724 @@
|
||||
from datetime import datetime
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.core.dtypes.dtypes import ArrowDtype
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
_testing as tm,
|
||||
)
|
||||
|
||||
|
||||
def test_extract_expand_kwarg_wrong_type_raises(any_string_dtype):
    # TODO: should this raise TypeError
    # a non-boolean ``expand`` (here None) is rejected with a ValueError
    ser = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype)
    msg = "expand must be True or False"
    with pytest.raises(ValueError, match=msg):
        ser.str.extract(".*(BAD[_]+).*(BAD)", expand=None)
|
||||
|
||||
|
||||
def test_extract_expand_kwarg(any_string_dtype):
|
||||
s = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype)
|
||||
expected = DataFrame(["BAD__", np.nan, np.nan], dtype=any_string_dtype)
|
||||
|
||||
result = s.str.extract(".*(BAD[_]+).*")
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = s.str.extract(".*(BAD[_]+).*", expand=True)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
expected = DataFrame(
|
||||
[["BAD__", "BAD"], [np.nan, np.nan], [np.nan, np.nan]], dtype=any_string_dtype
|
||||
)
|
||||
result = s.str.extract(".*(BAD[_]+).*(BAD)", expand=False)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_extract_expand_False_mixed_object():
    # extract(expand=False) on an object Series mixing strings with
    # NaN, bool, datetime, None, int and float entries.
    mixed = Series(
        ["aBAD_BAD", np.nan, "BAD_b_BAD", True, datetime.today(), "foo", None, 1, 2.0]
    )
    hit = ["BAD_", "BAD"]
    miss = [np.nan, np.nan]  # empty row

    # two groups
    two_groups = mixed.str.extract(".*(BAD[_]+).*(BAD)", expand=False)
    tm.assert_frame_equal(
        two_groups,
        DataFrame([hit, miss, hit, miss, miss, miss, miss, miss, miss], dtype=object),
    )

    # single group
    one_group = mixed.str.extract(".*(BAD[_]+).*BAD", expand=False)
    tm.assert_series_equal(
        one_group,
        Series(
            ["BAD_", np.nan, "BAD_", np.nan, np.nan, np.nan, None, np.nan, np.nan],
            dtype=object,
        ),
    )
|
||||
|
||||
|
||||
def test_extract_expand_index_raises():
    # GH9980
    # Index only works with one regex group since
    # multi-group would expand to a frame
    labels = Index(["A1", "A2", "A3", "A4", "B5"])
    with pytest.raises(
        ValueError, match="only one regex group is supported with Index"
    ):
        labels.str.extract("([AB])([123])", expand=False)
|
||||
|
||||
|
||||
def test_extract_expand_no_capture_groups_raises(index_or_series, any_string_dtype):
    # A pattern without any capturing group is rejected with expand=False.
    subject = index_or_series(["A1", "B2", "C3"], dtype=any_string_dtype)
    patterns = (
        "[ABC][123]",  # no groups
        "(?:[AB]).*",  # only non-capturing groups
    )
    for pat in patterns:
        with pytest.raises(ValueError, match="pattern contains no capture groups"):
            subject.str.extract(pat, expand=False)
|
||||
|
||||
|
||||
def test_extract_expand_single_capture_group(index_or_series, any_string_dtype):
|
||||
# single group renames series/index properly
|
||||
s_or_idx = index_or_series(["A1", "A2"], dtype=any_string_dtype)
|
||||
result = s_or_idx.str.extract(r"(?P<uno>A)\d", expand=False)
|
||||
|
||||
expected = index_or_series(["A", "A"], name="uno", dtype=any_string_dtype)
|
||||
if index_or_series == Series:
|
||||
tm.assert_series_equal(result, expected)
|
||||
else:
|
||||
tm.assert_index_equal(result, expected)
|
||||
|
||||
|
||||
def test_extract_expand_capture_groups(any_string_dtype):
|
||||
s = Series(["A1", "B2", "C3"], dtype=any_string_dtype)
|
||||
# one group, no matches
|
||||
result = s.str.extract("(_)", expand=False)
|
||||
expected = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
# two groups, no matches
|
||||
result = s.str.extract("(_)(_)", expand=False)
|
||||
expected = DataFrame(
|
||||
[[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], dtype=any_string_dtype
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# one group, some matches
|
||||
result = s.str.extract("([AB])[123]", expand=False)
|
||||
expected = Series(["A", "B", np.nan], dtype=any_string_dtype)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
# two groups, some matches
|
||||
result = s.str.extract("([AB])([123])", expand=False)
|
||||
expected = DataFrame(
|
||||
[["A", "1"], ["B", "2"], [np.nan, np.nan]], dtype=any_string_dtype
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# one named group
|
||||
result = s.str.extract("(?P<letter>[AB])", expand=False)
|
||||
expected = Series(["A", "B", np.nan], name="letter", dtype=any_string_dtype)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
# two named groups
|
||||
result = s.str.extract("(?P<letter>[AB])(?P<number>[123])", expand=False)
|
||||
expected = DataFrame(
|
||||
[["A", "1"], ["B", "2"], [np.nan, np.nan]],
|
||||
columns=["letter", "number"],
|
||||
dtype=any_string_dtype,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# mix named and unnamed groups
|
||||
result = s.str.extract("([AB])(?P<number>[123])", expand=False)
|
||||
expected = DataFrame(
|
||||
[["A", "1"], ["B", "2"], [np.nan, np.nan]],
|
||||
columns=[0, "number"],
|
||||
dtype=any_string_dtype,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# one normal group, one non-capturing group
|
||||
result = s.str.extract("([AB])(?:[123])", expand=False)
|
||||
expected = Series(["A", "B", np.nan], dtype=any_string_dtype)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
# two normal groups, one non-capturing group
|
||||
s = Series(["A11", "B22", "C33"], dtype=any_string_dtype)
|
||||
result = s.str.extract("([AB])([123])(?:[123])", expand=False)
|
||||
expected = DataFrame(
|
||||
[["A", "1"], ["B", "2"], [np.nan, np.nan]], dtype=any_string_dtype
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# one optional group followed by one normal group
|
||||
s = Series(["A1", "B2", "3"], dtype=any_string_dtype)
|
||||
result = s.str.extract("(?P<letter>[AB])?(?P<number>[123])", expand=False)
|
||||
expected = DataFrame(
|
||||
[["A", "1"], ["B", "2"], [np.nan, "3"]],
|
||||
columns=["letter", "number"],
|
||||
dtype=any_string_dtype,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# one normal group followed by one optional group
|
||||
s = Series(["A1", "B2", "C"], dtype=any_string_dtype)
|
||||
result = s.str.extract("(?P<letter>[ABC])(?P<number>[123])?", expand=False)
|
||||
expected = DataFrame(
|
||||
[["A", "1"], ["B", "2"], ["C", np.nan]],
|
||||
columns=["letter", "number"],
|
||||
dtype=any_string_dtype,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_extract_expand_capture_groups_index(index, any_string_dtype):
    # https://github.com/pandas-dev/pandas/issues/6348
    # not passing index to the extractor
    data = ["A1", "B2", "C"]
    needed = len(data)

    if len(index) == 0:
        pytest.skip("Test requires len(index) > 0")
    # grow short indexes by repetition until they can label every value
    while len(index) < needed:
        index = index.repeat(2)
    index = index[:needed]

    ser = Series(data, index=index, dtype=any_string_dtype)

    digits = ser.str.extract(r"(\d)", expand=False)
    tm.assert_series_equal(
        digits, Series(["1", "2", np.nan], index=index, dtype=any_string_dtype)
    )

    both = ser.str.extract(r"(?P<letter>\D)(?P<number>\d)?", expand=False)
    tm.assert_frame_equal(
        both,
        DataFrame(
            [["A", "1"], ["B", "2"], ["C", np.nan]],
            columns=["letter", "number"],
            index=index,
            dtype=any_string_dtype,
        ),
    )
|
||||
|
||||
|
||||
def test_extract_single_series_name_is_preserved(any_string_dtype):
|
||||
s = Series(["a3", "b3", "c2"], name="bob", dtype=any_string_dtype)
|
||||
result = s.str.extract(r"(?P<sue>[a-z])", expand=False)
|
||||
expected = Series(["a", "b", "c"], name="sue", dtype=any_string_dtype)
|
||||
tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_extract_expand_True(any_string_dtype):
|
||||
# Contains tests like those in test_match and some others.
|
||||
s = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype)
|
||||
|
||||
result = s.str.extract(".*(BAD[_]+).*(BAD)", expand=True)
|
||||
expected = DataFrame(
|
||||
[["BAD__", "BAD"], [np.nan, np.nan], [np.nan, np.nan]], dtype=any_string_dtype
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_extract_expand_True_mixed_object():
    # extract(expand=True) on an object Series mixing strings with
    # NaN, bool, datetime, None, int and float entries.
    hit = ["BAD_", "BAD"]
    miss = [np.nan, np.nan]  # empty row
    values = [
        "aBAD_BAD",
        np.nan,
        "BAD_b_BAD",
        True,
        datetime.today(),
        "foo",
        None,
        1,
        2.0,
    ]

    extracted = Series(values).str.extract(".*(BAD[_]+).*(BAD)", expand=True)
    tm.assert_frame_equal(
        extracted,
        DataFrame([hit, miss, hit, miss, miss, miss, miss, miss, miss], dtype=object),
    )
|
||||
|
||||
|
||||
def test_extract_expand_True_single_capture_group_raises(
    index_or_series, any_string_dtype
):
    # these should work for both Series and Index
    subject = index_or_series(["A1", "B2", "C3"], dtype=any_string_dtype)
    patterns = (
        "[ABC][123]",  # no groups
        "(?:[AB]).*",  # only non-capturing groups
    )
    for pat in patterns:
        with pytest.raises(ValueError, match="pattern contains no capture groups"):
            subject.str.extract(pat, expand=True)
|
||||
|
||||
|
||||
def test_extract_expand_True_single_capture_group(index_or_series, any_string_dtype):
|
||||
# single group renames series/index properly
|
||||
s_or_idx = index_or_series(["A1", "A2"], dtype=any_string_dtype)
|
||||
result = s_or_idx.str.extract(r"(?P<uno>A)\d", expand=True)
|
||||
expected = DataFrame({"uno": ["A", "A"]}, dtype=any_string_dtype)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("name", [None, "series_name"])
def test_extract_series(name, any_string_dtype):
    # extract should give the same result whether or not the series has a name.
    nan = np.nan
    ser = Series(["A1", "B2", "C3"], name=name, dtype=any_string_dtype)

    def frame(data, columns=None):
        return DataFrame(data, columns=columns, dtype=any_string_dtype)

    letter_digit = [["A", "1"], ["B", "2"], [nan, nan]]

    cases = [
        # one group, no matches
        ("(_)", frame([nan, nan, nan])),
        # two groups, no matches
        ("(_)(_)", frame([[nan, nan], [nan, nan], [nan, nan]])),
        # one group, some matches
        ("([AB])[123]", frame(["A", "B", nan])),
        # two groups, some matches
        ("([AB])([123])", frame(letter_digit)),
        # one named group
        ("(?P<letter>[AB])", frame({"letter": ["A", "B", nan]})),
        # two named groups
        (
            "(?P<letter>[AB])(?P<number>[123])",
            frame(letter_digit, columns=["letter", "number"]),
        ),
        # mix named and unnamed groups
        ("([AB])(?P<number>[123])", frame(letter_digit, columns=[0, "number"])),
        # one normal group, one non-capturing group
        ("([AB])(?:[123])", frame(["A", "B", nan])),
    ]

    for pat, expected in cases:
        tm.assert_frame_equal(ser.str.extract(pat, expand=True), expected)
|
||||
|
||||
|
||||
def test_extract_optional_groups(any_string_dtype):
|
||||
# two normal groups, one non-capturing group
|
||||
s = Series(["A11", "B22", "C33"], dtype=any_string_dtype)
|
||||
result = s.str.extract("([AB])([123])(?:[123])", expand=True)
|
||||
expected = DataFrame(
|
||||
[["A", "1"], ["B", "2"], [np.nan, np.nan]], dtype=any_string_dtype
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# one optional group followed by one normal group
|
||||
s = Series(["A1", "B2", "3"], dtype=any_string_dtype)
|
||||
result = s.str.extract("(?P<letter>[AB])?(?P<number>[123])", expand=True)
|
||||
expected = DataFrame(
|
||||
[["A", "1"], ["B", "2"], [np.nan, "3"]],
|
||||
columns=["letter", "number"],
|
||||
dtype=any_string_dtype,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# one normal group followed by one optional group
|
||||
s = Series(["A1", "B2", "C"], dtype=any_string_dtype)
|
||||
result = s.str.extract("(?P<letter>[ABC])(?P<number>[123])?", expand=True)
|
||||
expected = DataFrame(
|
||||
[["A", "1"], ["B", "2"], ["C", np.nan]],
|
||||
columns=["letter", "number"],
|
||||
dtype=any_string_dtype,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_extract_dataframe_capture_groups_index(index, any_string_dtype):
    # GH6348
    # not passing index to the extractor
    # extract(expand=True) must keep the subject's index on the result.

    data = ["A1", "B2", "C"]

    if len(index) < len(data):
        # an index of exactly len(data) labels is sufficient, so the skip
        # reason says "at least" rather than "more than"
        pytest.skip(f"Index needs at least {len(data)} values")

    index = index[: len(data)]
    s = Series(data, index=index, dtype=any_string_dtype)

    result = s.str.extract(r"(\d)", expand=True)
    expected = DataFrame(["1", "2", np.nan], index=index, dtype=any_string_dtype)
    tm.assert_frame_equal(result, expected)

    result = s.str.extract(r"(?P<letter>\D)(?P<number>\d)?", expand=True)
    expected = DataFrame(
        [["A", "1"], ["B", "2"], ["C", np.nan]],
        columns=["letter", "number"],
        index=index,
        dtype=any_string_dtype,
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_extract_single_group_returns_frame(any_string_dtype):
|
||||
# GH11386 extract should always return DataFrame, even when
|
||||
# there is only one group. Prior to v0.18.0, extract returned
|
||||
# Series when there was only one group in the regex.
|
||||
s = Series(["a3", "b3", "c2"], name="series_name", dtype=any_string_dtype)
|
||||
result = s.str.extract(r"(?P<letter>[a-z])", expand=True)
|
||||
expected = DataFrame({"letter": ["a", "b", "c"]}, dtype=any_string_dtype)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_extractall(any_string_dtype):
|
||||
data = [
|
||||
"dave@google.com",
|
||||
"tdhock5@gmail.com",
|
||||
"maudelaperriere@gmail.com",
|
||||
"rob@gmail.com some text steve@gmail.com",
|
||||
"a@b.com some text c@d.com and e@f.com",
|
||||
np.nan,
|
||||
"",
|
||||
]
|
||||
expected_tuples = [
|
||||
("dave", "google", "com"),
|
||||
("tdhock5", "gmail", "com"),
|
||||
("maudelaperriere", "gmail", "com"),
|
||||
("rob", "gmail", "com"),
|
||||
("steve", "gmail", "com"),
|
||||
("a", "b", "com"),
|
||||
("c", "d", "com"),
|
||||
("e", "f", "com"),
|
||||
]
|
||||
pat = r"""
|
||||
(?P<user>[a-z0-9]+)
|
||||
@
|
||||
(?P<domain>[a-z]+)
|
||||
\.
|
||||
(?P<tld>[a-z]{2,4})
|
||||
"""
|
||||
expected_columns = ["user", "domain", "tld"]
|
||||
s = Series(data, dtype=any_string_dtype)
|
||||
# extractall should return a DataFrame with one row for each match, indexed by the
|
||||
# subject from which the match came.
|
||||
expected_index = MultiIndex.from_tuples(
|
||||
[(0, 0), (1, 0), (2, 0), (3, 0), (3, 1), (4, 0), (4, 1), (4, 2)],
|
||||
names=(None, "match"),
|
||||
)
|
||||
expected = DataFrame(
|
||||
expected_tuples, expected_index, expected_columns, dtype=any_string_dtype
|
||||
)
|
||||
result = s.str.extractall(pat, flags=re.VERBOSE)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# The index of the input Series should be used to construct the index of the output
|
||||
# DataFrame:
|
||||
mi = MultiIndex.from_tuples(
|
||||
[
|
||||
("single", "Dave"),
|
||||
("single", "Toby"),
|
||||
("single", "Maude"),
|
||||
("multiple", "robAndSteve"),
|
||||
("multiple", "abcdef"),
|
||||
("none", "missing"),
|
||||
("none", "empty"),
|
||||
]
|
||||
)
|
||||
s = Series(data, index=mi, dtype=any_string_dtype)
|
||||
expected_index = MultiIndex.from_tuples(
|
||||
[
|
||||
("single", "Dave", 0),
|
||||
("single", "Toby", 0),
|
||||
("single", "Maude", 0),
|
||||
("multiple", "robAndSteve", 0),
|
||||
("multiple", "robAndSteve", 1),
|
||||
("multiple", "abcdef", 0),
|
||||
("multiple", "abcdef", 1),
|
||||
("multiple", "abcdef", 2),
|
||||
],
|
||||
names=(None, None, "match"),
|
||||
)
|
||||
expected = DataFrame(
|
||||
expected_tuples, expected_index, expected_columns, dtype=any_string_dtype
|
||||
)
|
||||
result = s.str.extractall(pat, flags=re.VERBOSE)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# MultiIndexed subject with names.
|
||||
s = Series(data, index=mi, dtype=any_string_dtype)
|
||||
s.index.names = ("matches", "description")
|
||||
expected_index.names = ("matches", "description", "match")
|
||||
expected = DataFrame(
|
||||
expected_tuples, expected_index, expected_columns, dtype=any_string_dtype
|
||||
)
|
||||
result = s.str.extractall(pat, flags=re.VERBOSE)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "pat,expected_names",
    [
        # optional groups.
        ("(?P<letter>[AB])?(?P<number>[123])", ["letter", "number"]),
        # only one of two groups has a name.
        ("([AB])?(?P<number>[123])", [0, "number"]),
    ],
)
def test_extractall_column_names(pat, expected_names, any_string_dtype):
    # Column labels of the extractall result: group names where given,
    # positional integers otherwise.
    subject = Series(["", "A1", "32"], dtype=any_string_dtype)
    match_index = MultiIndex.from_tuples(
        [(1, 0), (2, 0), (2, 1)], names=(None, "match")
    )

    extracted = subject.str.extractall(pat)
    tm.assert_frame_equal(
        extracted,
        DataFrame(
            [("A", "1"), (np.nan, "3"), (np.nan, "2")],
            index=match_index,
            columns=expected_names,
            dtype=any_string_dtype,
        ),
    )
|
||||
|
||||
|
||||
def test_extractall_single_group(any_string_dtype):
|
||||
s = Series(["a3", "b3", "d4c2"], name="series_name", dtype=any_string_dtype)
|
||||
expected_index = MultiIndex.from_tuples(
|
||||
[(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match")
|
||||
)
|
||||
|
||||
# extractall(one named group) returns DataFrame with one named column.
|
||||
result = s.str.extractall(r"(?P<letter>[a-z])")
|
||||
expected = DataFrame(
|
||||
{"letter": ["a", "b", "d", "c"]}, index=expected_index, dtype=any_string_dtype
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
# extractall(one un-named group) returns DataFrame with one un-named column.
|
||||
result = s.str.extractall(r"([a-z])")
|
||||
expected = DataFrame(
|
||||
["a", "b", "d", "c"], index=expected_index, dtype=any_string_dtype
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_extractall_single_group_with_quantifier(any_string_dtype):
|
||||
# GH#13382
|
||||
# extractall(one un-named group with quantifier) returns DataFrame with one un-named
|
||||
# column.
|
||||
s = Series(["ab3", "abc3", "d4cd2"], name="series_name", dtype=any_string_dtype)
|
||||
result = s.str.extractall(r"([a-z]+)")
|
||||
expected = DataFrame(
|
||||
["ab", "abc", "d", "cd"],
|
||||
index=MultiIndex.from_tuples(
|
||||
[(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match")
|
||||
),
|
||||
dtype=any_string_dtype,
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data, names",
    [
        ([], (None,)),
        ([], ("i1",)),
        ([], (None, "i2")),
        ([], ("i1", "i2")),
        (["a3", "b3", "d4c2"], (None,)),
        # single named level; this slot used to repeat the ("i1", "i2") case
        (["a3", "b3", "d4c2"], ("i1",)),
        (["a3", "b3", "d4c2"], (None, "i2")),
        (["a3", "b3", "d4c2"], ("i1", "i2")),
    ],
)
def test_extractall_no_matches(data, names, any_string_dtype):
    # GH19075 extractall with no matches should return a valid MultiIndex
    n = len(data)
    if len(names) == 1:
        index = Index(range(n), name=names[0])
    else:
        # one entry per index level, so the tuple width follows ``names``
        # instead of happening to match it through ``n - 1``
        tuples = (tuple([i] * len(names)) for i in range(n))
        index = MultiIndex.from_tuples(tuples, names=names)
    s = Series(data, name="series_name", index=index, dtype=any_string_dtype)
    expected_index = MultiIndex.from_tuples([], names=(names + ("match",)))

    # every pattern below looks for "z", which none of the inputs contain
    cases = [
        # one un-named group.
        ("(z)", [0]),
        # two un-named groups.
        ("(z)(z)", [0, 1]),
        # one named group.
        ("(?P<first>z)", ["first"]),
        # two named groups.
        ("(?P<first>z)(?P<second>z)", ["first", "second"]),
        # one named, one un-named.
        ("(z)(?P<second>z)", [0, "second"]),
    ]
    for pat, columns in cases:
        result = s.str.extractall(pat)
        expected = DataFrame(
            columns=columns, index=expected_index, dtype=any_string_dtype
        )
        tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_extractall_stringindex(any_string_dtype):
    """Check ``extractall`` on a Series, on an Index, and on a named-index Series."""
    pat = r"[ab](?P<digit>\d)"

    s = Series(["a1a2", "b1", "c1"], name="xxx", dtype=any_string_dtype)
    result = s.str.extractall(pat)
    expected = DataFrame(
        {"digit": ["1", "2", "1"]},
        index=MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0)], names=[None, "match"]),
        dtype=any_string_dtype,
    )
    tm.assert_frame_equal(result, expected)

    # An Index should give the same result as a Series with the default
    # index, and the Index's own name should not affect the result.
    if any_string_dtype == "object":
        for idx in [
            Index(["a1a2", "b1", "c1"], dtype=object),
            Index(["a1a2", "b1", "c1"], name="xxx", dtype=object),
        ]:
            result = idx.str.extractall(pat)
            tm.assert_frame_equal(result, expected)

    # A named subject index becomes the first level of the result index.
    s = Series(
        ["a1a2", "b1", "c1"],
        name="s_name",
        index=Index(["XX", "yy", "zz"], name="idx_name"),
        dtype=any_string_dtype,
    )
    result = s.str.extractall(pat)
    expected = DataFrame(
        {"digit": ["1", "2", "1"]},
        index=MultiIndex.from_tuples(
            [("XX", 0), ("XX", 1), ("yy", 0)], names=["idx_name", "match"]
        ),
        dtype=any_string_dtype,
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_extractall_no_capture_groups_raises(any_string_dtype):
    """Check ``extractall`` raises ValueError for a pattern with no capture groups."""
    # Does not make sense to use extractall with a regex that has no capture
    # groups (it returns a DataFrame with one column for each capture group).
    s = Series(["a3", "b3", "d4c2"], name="series_name", dtype=any_string_dtype)
    with pytest.raises(ValueError, match="no capture groups"):
        s.str.extractall(r"[a-z]")
|
||||
|
||||
|
||||
def test_extract_index_one_two_groups():
    """Check ``Index.str.extract(expand=True)`` returns a DataFrame for 1 and 2 groups."""
    s = Series(["a3", "b3", "d4c2"], index=["A3", "B3", "D4"], name="series_name")

    # One un-named group.
    r = s.index.str.extract(r"([A-Z])", expand=True)
    e = DataFrame(["A", "B", "D"])
    tm.assert_frame_equal(r, e)

    # Prior to v0.18.0, index.str.extract(regex with one group)
    # returned Index. With more than one group, extract raised an
    # error (GH9980). Now extract always returns DataFrame.
    r = s.index.str.extract(r"(?P<letter>[A-Z])(?P<digit>[0-9])", expand=True)
    e_list = [("A", "3"), ("B", "3"), ("D", "4")]
    e = DataFrame(e_list, columns=["letter", "digit"])
    tm.assert_frame_equal(r, e)
|
||||
|
||||
|
||||
def test_extractall_same_as_extract(any_string_dtype):
    """Check match 0 of ``extractall`` equals ``extract(expand=True)``."""
    s = Series(["a3", "b3", "c2"], name="series_name", dtype=any_string_dtype)

    patterns = [
        r"([a-z])([0-9])",  # two un-named groups
        r"(?P<letter>[a-z])(?P<digit>[0-9])",  # two named groups
        r"(?P<group_name>[a-z])",  # one named group
        r"([a-z])",  # one un-named group
    ]
    for pattern in patterns:
        extracted = s.str.extract(pattern, expand=True)
        has_multi_index = s.str.extractall(pattern)
        # Selecting match 0 drops the "match" level, leaving the subject index.
        no_multi_index = has_multi_index.xs(0, level="match")
        tm.assert_frame_equal(extracted, no_multi_index)
|
||||
|
||||
|
||||
def test_extractall_same_as_extract_subject_index(any_string_dtype):
    """Check match 0 of ``extractall`` equals ``extract`` when the subject has a MultiIndex."""
    # Same checks as for a default index, but s has a MultiIndex.
    mi = MultiIndex.from_tuples(
        [("A", "first"), ("B", "second"), ("C", "third")],
        names=("capital", "ordinal"),
    )
    s = Series(["a3", "b3", "c2"], index=mi, name="series_name", dtype=any_string_dtype)

    patterns = [
        r"([a-z])([0-9])",  # two un-named groups
        r"(?P<letter>[a-z])(?P<digit>[0-9])",  # two named groups
        r"(?P<group_name>[a-z])",  # one named group
        r"([a-z])",  # one un-named group
    ]
    for pattern in patterns:
        extracted = s.str.extract(pattern, expand=True)
        has_match_index = s.str.extractall(pattern)
        # Selecting match 0 drops the "match" level, leaving the subject index.
        no_match_index = has_match_index.xs(0, level="match")
        tm.assert_frame_equal(extracted, no_match_index)
|
||||
|
||||
|
||||
def test_extractall_preserves_dtype():
    """Check ``extractall`` keeps an Arrow string dtype on the result column."""
    # Ensure that when extractall is called on a series with specific dtypes
    # set, the dtype is preserved in the resulting DataFrame's column.
    pa = pytest.importorskip("pyarrow")

    result = Series(["abc", "ab"], dtype=ArrowDtype(pa.string())).str.extractall("(ab)")
    # The single un-named group yields a column labelled 0.
    assert result.dtypes[0] == "string[pyarrow]"
|
@ -0,0 +1,972 @@
|
||||
from datetime import datetime
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.errors import PerformanceWarning
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
Series,
|
||||
_testing as tm,
|
||||
)
|
||||
from pandas.tests.strings import (
|
||||
_convert_na_value,
|
||||
object_pyarrow_numpy,
|
||||
)
|
||||
|
||||
# --------------------------------------------------------------------------------------
|
||||
# str.contains
|
||||
# --------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
def using_pyarrow(dtype):
    """Return True if ``dtype`` is one of the pyarrow-backed string dtype names."""
    return dtype in ("string[pyarrow]", "string[pyarrow_numpy]")
|
||||
|
||||
|
||||
def test_contains(any_string_dtype):
    """Check ``str.contains`` for regex/literal patterns, case handling and ``na``."""
    uses_numpy_semantics = any_string_dtype in object_pyarrow_numpy
    # Result dtype when missing values propagate into the result ...
    dtype_with_na = "object" if uses_numpy_semantics else "boolean"
    # ... and when the result contains no missing values.
    dtype_no_na = np.bool_ if uses_numpy_semantics else "boolean"

    values = np.array(
        ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_
    )
    values = Series(values, dtype=any_string_dtype)
    pat = "mmm[_]+"

    # pattern interpreted as a regex
    result = values.str.contains(pat)
    expected = Series(
        np.array([False, np.nan, True, True, False], dtype=np.object_),
        dtype=dtype_with_na,
    )
    tm.assert_series_equal(result, expected)

    # pattern interpreted literally
    result = values.str.contains(pat, regex=False)
    expected = Series(
        np.array([False, np.nan, False, False, True], dtype=np.object_),
        dtype=dtype_with_na,
    )
    tm.assert_series_equal(result, expected)

    # no missing values in the input
    values = Series(
        np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object),
        dtype=any_string_dtype,
    )
    result = values.str.contains(pat)
    expected = Series(np.array([False, False, True, True]), dtype=dtype_no_na)
    tm.assert_series_equal(result, expected)

    # case insensitive using regex
    values = Series(
        np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object),
        dtype=any_string_dtype,
    )

    result = values.str.contains("FOO|mmm", case=False)
    expected = Series(np.array([True, False, True, True]), dtype=dtype_no_na)
    tm.assert_series_equal(result, expected)

    # case insensitive without regex
    result = values.str.contains("foo", regex=False, case=False)
    expected = Series(np.array([True, False, True, False]), dtype=dtype_no_na)
    tm.assert_series_equal(result, expected)

    # missing value present: default na propagates, na=False fills it
    values = Series(
        np.array(["foo", np.nan, "fooommm__foo", "mmm_"], dtype=np.object_),
        dtype=any_string_dtype,
    )
    pat = "mmm[_]+"

    result = values.str.contains(pat)
    expected = Series(
        np.array([False, np.nan, True, True], dtype=np.object_), dtype=dtype_with_na
    )
    tm.assert_series_equal(result, expected)

    result = values.str.contains(pat, na=False)
    expected = Series(np.array([False, False, True, True]), dtype=dtype_no_na)
    tm.assert_series_equal(result, expected)

    values = Series(
        np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=np.object_),
        dtype=any_string_dtype,
    )
    result = values.str.contains(pat)
    expected = Series(np.array([False, False, True, True]), dtype=dtype_no_na)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_contains_object_mixed():
    """Check ``str.contains`` on an object Series mixing strings and non-strings."""
    mixed = Series(
        np.array(
            ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
            dtype=object,
        )
    )
    result = mixed.str.contains("o")
    # Non-string entries give NaN; the None entry is expected to stay None.
    expected = Series(
        np.array(
            [False, np.nan, False, np.nan, np.nan, True, None, np.nan, np.nan],
            dtype=np.object_,
        )
    )
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_contains_na_kwarg_for_object_category():
    """Check the ``na`` argument of ``str.contains`` for category and object data."""
    # gh 22158
    data = ["a", "b", "c", "a", np.nan]
    subjects = [
        Series(data, dtype="category"),  # na for category
        Series(data),  # na for objects
    ]
    for values in subjects:
        for na in (True, False):
            result = values.str.contains("a", na=na)
            # The missing entry takes the value passed as ``na``.
            expected = Series([True, False, False, True, na])
            tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "na, expected",
    [
        (None, pd.NA),
        (True, True),
        (False, False),
        (0, False),
        (3, True),
        (np.nan, pd.NA),
    ],
)
@pytest.mark.parametrize("regex", [True, False])
def test_contains_na_kwarg_for_nullable_string_dtype(
    nullable_string_dtype, na, expected, regex
):
    """Check how ``na`` fills the missing entry of ``str.contains`` on nullable strings."""
    # https://github.com/pandas-dev/pandas/pull/41025#issuecomment-824062416

    values = Series(["a", "b", "c", "a", np.nan], dtype=nullable_string_dtype)
    result = values.str.contains("a", na=na, regex=regex)
    # ``expected`` is the parametrized value for the missing (last) entry.
    expected = Series([True, False, False, True, expected], dtype="boolean")
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_contains_moar(any_string_dtype):
    """Check ``str.contains`` for several literal patterns, with and without ``case``."""
    # PR #1179
    s = Series(
        ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"],
        dtype=any_string_dtype,
    )
    expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"

    # (pattern, extra keyword arguments, expected values)
    cases = [
        (
            "a",
            {},
            [False, False, False, True, True, False, np.nan, False, False, True],
        ),
        (
            "a",
            {"case": False},
            [True, False, False, True, True, False, np.nan, True, False, True],
        ),
        (
            "Aa",
            {},
            [False, False, False, True, False, False, np.nan, False, False, False],
        ),
        (
            "ba",
            {},
            [False, False, False, True, False, False, np.nan, False, False, False],
        ),
        (
            "ba",
            {"case": False},
            [False, False, False, True, True, False, np.nan, True, False, False],
        ),
    ]
    for pat, kwargs, expected_values in cases:
        result = s.str.contains(pat, **kwargs)
        expected = Series(expected_values, dtype=expected_dtype)
        tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_contains_nan(any_string_dtype):
    """Check ``str.contains`` on an all-missing Series for different ``na`` values."""
    # PR #14171
    s = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype)

    uses_numpy_semantics = any_string_dtype in object_pyarrow_numpy
    dtype_no_na = np.bool_ if uses_numpy_semantics else "boolean"
    dtype_with_na = "object" if uses_numpy_semantics else "boolean"

    result = s.str.contains("foo", na=False)
    expected = Series([False, False, False], dtype=dtype_no_na)
    tm.assert_series_equal(result, expected)

    result = s.str.contains("foo", na=True)
    expected = Series([True, True, True], dtype=dtype_no_na)
    tm.assert_series_equal(result, expected)

    # A non-boolean ``na``: expected verbatim for object dtype, True otherwise.
    result = s.str.contains("foo", na="foo")
    if any_string_dtype == "object":
        expected = Series(["foo", "foo", "foo"], dtype=np.object_)
    elif any_string_dtype == "string[pyarrow_numpy]":
        expected = Series([True, True, True], dtype=np.bool_)
    else:
        expected = Series([True, True, True], dtype="boolean")
    tm.assert_series_equal(result, expected)

    # Default ``na``: missing values propagate.
    result = s.str.contains("foo")
    expected = Series([np.nan, np.nan, np.nan], dtype=dtype_with_na)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------------------
|
||||
# str.startswith
|
||||
# --------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize("pat", ["foo", ("foo", "baz")])
@pytest.mark.parametrize("dtype", ["object", "category"])
@pytest.mark.parametrize("null_value", [None, np.nan, pd.NA])
@pytest.mark.parametrize("na", [True, False])
def test_startswith(pat, dtype, null_value, na):
    """Check ``str.startswith`` for str/tuple patterns, null kinds and ``na``."""
    # add category dtype parametrizations for GH-36241
    values = Series(
        ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"],
        dtype=dtype,
    )

    result = values.str.startswith(pat)
    exp = Series([False, np.nan, True, False, False, np.nan, True])
    if dtype == "object" and null_value is pd.NA:
        # GH#18463
        exp = exp.fillna(null_value)
    elif dtype == "object" and null_value is None:
        # For object dtype the None entries are expected to stay None.
        exp[exp.isna()] = None
    tm.assert_series_equal(result, exp)

    result = values.str.startswith(pat, na=na)
    exp = Series([False, na, True, False, False, na, True])
    tm.assert_series_equal(result, exp)

    # mixed
    mixed = np.array(
        ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
        dtype=np.object_,
    )
    rs = Series(mixed).str.startswith("f")
    xp = Series([False, np.nan, False, np.nan, np.nan, True, None, np.nan, np.nan])
    tm.assert_series_equal(rs, xp)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [None, True, False])
def test_startswith_nullable_string_dtype(nullable_string_dtype, na):
    """Check ``str.startswith`` on nullable string dtypes, including a literal ``.``."""
    values = Series(
        ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."],
        dtype=nullable_string_dtype,
    )
    result = values.str.startswith("foo", na=na)
    exp = Series(
        [False, na, True, False, False, na, True, False, False], dtype="boolean"
    )
    tm.assert_series_equal(result, exp)

    # "rege." is expected to match only the literal "rege.", not "regex".
    result = values.str.startswith("rege.", na=na)
    exp = Series(
        [False, na, False, False, False, na, False, False, True], dtype="boolean"
    )
    tm.assert_series_equal(result, exp)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------------------
|
||||
# str.endswith
|
||||
# --------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize("pat", ["foo", ("foo", "baz")])
@pytest.mark.parametrize("dtype", ["object", "category"])
@pytest.mark.parametrize("null_value", [None, np.nan, pd.NA])
@pytest.mark.parametrize("na", [True, False])
def test_endswith(pat, dtype, null_value, na):
    """Check ``str.endswith`` for str/tuple patterns, null kinds and ``na``."""
    # add category dtype parametrizations for GH-36241
    values = Series(
        ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"],
        dtype=dtype,
    )

    result = values.str.endswith(pat)
    exp = Series([False, np.nan, False, False, True, np.nan, True])
    if dtype == "object" and null_value is pd.NA:
        # GH#18463
        exp = exp.fillna(null_value)
    elif dtype == "object" and null_value is None:
        # For object dtype the None entries are expected to stay None.
        exp[exp.isna()] = None
    tm.assert_series_equal(result, exp)

    result = values.str.endswith(pat, na=na)
    exp = Series([False, na, False, False, True, na, True])
    tm.assert_series_equal(result, exp)

    # mixed
    mixed = np.array(
        ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
        dtype=object,
    )
    rs = Series(mixed).str.endswith("f")
    xp = Series([False, np.nan, False, np.nan, np.nan, False, None, np.nan, np.nan])
    tm.assert_series_equal(rs, xp)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("na", [None, True, False])
def test_endswith_nullable_string_dtype(nullable_string_dtype, na):
    """Check ``str.endswith`` on nullable string dtypes, including a literal ``.``."""
    values = Series(
        ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."],
        dtype=nullable_string_dtype,
    )
    result = values.str.endswith("foo", na=na)
    exp = Series(
        [False, na, False, False, True, na, True, False, False], dtype="boolean"
    )
    tm.assert_series_equal(result, exp)

    # "rege." is expected to match only the literal "rege.", not "regex".
    result = values.str.endswith("rege.", na=na)
    exp = Series(
        [False, na, False, False, False, na, False, False, True], dtype="boolean"
    )
    tm.assert_series_equal(result, exp)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------------------
|
||||
# str.replace
|
||||
# --------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_replace(any_string_dtype):
    """Check ``str.replace`` with a regex pattern replaces every match."""
    ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)

    result = ser.str.replace("BAD[_]*", "", regex=True)
    expected = Series(["foobar", np.nan], dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_replace_max_replacements(any_string_dtype):
    """Check ``n=1`` limits ``str.replace`` to the first match (regex and literal)."""
    ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)

    # regex pattern: only the first "BAD__" is removed
    expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype)
    result = ser.str.replace("BAD[_]*", "", n=1, regex=True)
    tm.assert_series_equal(result, expected)

    # literal pattern: only the first "BAD" is removed
    expected = Series(["foo__barBAD", np.nan], dtype=any_string_dtype)
    result = ser.str.replace("BAD", "", n=1, regex=False)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_replace_mixed_object():
    """Check regex ``str.replace`` on a Series mixing strings and non-strings."""
    ser = Series(
        ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0]
    )
    # ``ser`` is already a Series, so it is used directly.
    result = ser.str.replace("BAD[_]*", "", regex=True)
    # Non-string entries give NaN; the None entry is expected to stay None.
    expected = Series(
        ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object
    )
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_replace_unicode(any_string_dtype):
    """Check regex ``str.replace`` with ``re.UNICODE`` and a non-ASCII word character."""
    ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
    expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
    # A PerformanceWarning is tolerated only for the pyarrow-backed dtypes.
    with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
        result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("repl", [None, 3, {"a": "b"}])
@pytest.mark.parametrize("data", [["a", "b", None], ["a", "b", "c", "ad"]])
def test_replace_wrong_repl_type_raises(any_string_dtype, index_or_series, repl, data):
    """Check ``str.replace`` raises TypeError when ``repl`` is not a str or callable."""
    # https://github.com/pandas-dev/pandas/issues/13438
    msg = "repl must be a string or callable"
    obj = index_or_series(data, dtype=any_string_dtype)
    with pytest.raises(TypeError, match=msg):
        obj.str.replace("a", repl)
|
||||
|
||||
|
||||
def test_replace_callable(any_string_dtype):
    """Check ``str.replace`` accepts a callable replacement together with ``n``."""
    # GH 15055
    ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)

    # test with callable
    repl = lambda m: m.group(0).swapcase()
    # A PerformanceWarning is tolerated only for the pyarrow-backed dtypes.
    with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
        result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True)
    expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "repl", [lambda: None, lambda m, x: None, lambda m, x, y=None: None]
)
def test_replace_callable_raises(any_string_dtype, repl):
    """Check a replacement callable with the wrong arity raises TypeError."""
    # GH 15055
    values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)

    # test with wrong number of arguments, raising an error
    # The pattern covers both "takes ... positional argument(s)" and
    # "missing ... required positional argument(s)" wordings.
    msg = (
        r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
        r"(?(3)required )positional arguments?"
    )
    with pytest.raises(TypeError, match=msg):
        with tm.maybe_produces_warning(
            PerformanceWarning, using_pyarrow(any_string_dtype)
        ):
            values.str.replace("a", repl, regex=True)
|
||||
|
||||
|
||||
def test_replace_callable_named_groups(any_string_dtype):
    """Check a replacement callable can read named groups from the match."""
    # test regex named groups
    ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype)
    pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)"
    repl = lambda m: m.group("middle").swapcase()
    # A PerformanceWarning is tolerated only for the pyarrow-backed dtypes.
    with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
        result = ser.str.replace(pat, repl, regex=True)
    expected = Series(["bAR", np.nan], dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_replace_compiled_regex(any_string_dtype):
    """Check ``str.replace`` accepts a compiled regex, with and without ``n``."""
    # GH 15446
    ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)

    # test with compiled regex
    pat = re.compile(r"BAD_*")
    # A PerformanceWarning is tolerated only for the pyarrow-backed dtypes.
    with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
        result = ser.str.replace(pat, "", regex=True)
    expected = Series(["foobar", np.nan], dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)

    # limit to the first match
    with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
        result = ser.str.replace(pat, "", n=1, regex=True)
    expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_replace_compiled_regex_mixed_object():
    """Check compiled-regex ``str.replace`` on a Series mixing strings and non-strings."""
    pat = re.compile(r"BAD_*")
    ser = Series(
        ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0]
    )
    # ``ser`` is already a Series, so it is used directly.
    result = ser.str.replace(pat, "", regex=True)
    # Non-string entries give NaN; the None entry is expected to stay None.
    expected = Series(
        ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object
    )
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_replace_compiled_regex_unicode(any_string_dtype):
    """Check ``str.replace`` with a regex compiled using ``re.UNICODE``."""
    ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
    expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
    pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
    # A PerformanceWarning is tolerated only for the pyarrow-backed dtypes.
    with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
        result = ser.str.replace(pat, ", ", regex=True)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_replace_compiled_regex_raises(any_string_dtype):
    """Check ``case``/``flags`` are rejected when ``pat`` is a compiled regex."""
    # case and flags cannot be combined with a compiled pattern;
    # passing either one raises ValueError.
    ser = Series(["fooBAD__barBAD__bad", np.nan], dtype=any_string_dtype)
    pat = re.compile(r"BAD_*")

    msg = "case and flags cannot be set when pat is a compiled regex"

    for kwargs in ({"flags": re.IGNORECASE}, {"case": False}, {"case": True}):
        with pytest.raises(ValueError, match=msg):
            ser.str.replace(pat, "", regex=True, **kwargs)
|
||||
|
||||
|
||||
def test_replace_compiled_regex_callable(any_string_dtype):
    """Check ``str.replace`` with a compiled regex and a callable replacement."""
    # test with callable
    ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
    repl = lambda m: m.group(0).swapcase()
    pat = re.compile("[a-z][A-Z]{2}")
    # A PerformanceWarning is tolerated only for the pyarrow-backed dtypes.
    with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
        result = ser.str.replace(pat, repl, n=2, regex=True)
    expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "regex,expected", [(True, ["bao", "bao", np.nan]), (False, ["bao", "foo", np.nan])]
)
def test_replace_literal(regex, expected, any_string_dtype):
    """Check ``regex=False`` treats the pattern literally while ``regex=True`` does not."""
    # GH16808 literal replace (regex=False vs regex=True)
    ser = Series(["f.o", "foo", np.nan], dtype=any_string_dtype)
    expected = Series(expected, dtype=any_string_dtype)
    result = ser.str.replace("f.", "ba", regex=regex)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_replace_literal_callable_raises(any_string_dtype):
    """Check a callable replacement is rejected when ``regex=False``."""
    ser = Series([], dtype=any_string_dtype)
    repl = lambda m: m.group(0).swapcase()

    msg = "Cannot use a callable replacement when regex=False"
    with pytest.raises(ValueError, match=msg):
        ser.str.replace("abc", repl, regex=False)
|
||||
|
||||
|
||||
def test_replace_literal_compiled_raises(any_string_dtype):
    """Check a compiled regex pattern is rejected when ``regex=False``."""
    ser = Series([], dtype=any_string_dtype)
    pat = re.compile("[a-z][A-Z]{2}")

    msg = "Cannot use a compiled regex as replacement pattern with regex=False"
    with pytest.raises(ValueError, match=msg):
        ser.str.replace(pat, "", regex=False)
|
||||
|
||||
|
||||
def test_replace_moar(any_string_dtype):
    """Check ``str.replace`` for literal, case-insensitive and regex patterns."""
    # PR #1179
    ser = Series(
        ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"],
        dtype=any_string_dtype,
    )

    # default arguments
    result = ser.str.replace("A", "YYY")
    expected = Series(
        ["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"],
        dtype=any_string_dtype,
    )
    tm.assert_series_equal(result, expected)

    # case-insensitive; a PerformanceWarning is tolerated only for the
    # pyarrow-backed dtypes.
    with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
        result = ser.str.replace("A", "YYY", case=False)
    expected = Series(
        [
            "YYY",
            "B",
            "C",
            "YYYYYYbYYY",
            "BYYYcYYY",
            "",
            np.nan,
            "CYYYBYYY",
            "dog",
            "cYYYt",
        ],
        dtype=any_string_dtype,
    )
    tm.assert_series_equal(result, expected)

    # case-insensitive regex with alternation
    with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
        result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True)
    expected = Series(
        [
            "A",
            "B",
            "C",
            "XX-XX ba",
            "XX-XX ca",
            "",
            np.nan,
            "XX-XX BA",
            "XX-XX ",
            "XX-XX t",
        ],
        dtype=any_string_dtype,
    )
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_replace_not_case_sensitive_not_regex(any_string_dtype):
    """Check ``case=False`` combined with ``regex=False`` in ``str.replace``."""
    # https://github.com/pandas-dev/pandas/issues/41602
    ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype)

    # A PerformanceWarning is tolerated only for the pyarrow-backed dtypes.
    with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
        result = ser.str.replace("a", "c", case=False, regex=False)
    expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)

    # The "." is expected to match literally, so "Ab"/"ab" are unchanged.
    with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
        result = ser.str.replace("a.", "c.", case=False, regex=False)
    expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_replace_regex(any_string_dtype):
    """Check an anchored single-character regex replaces only one-character strings."""
    # https://github.com/pandas-dev/pandas/pull/24809
    s = Series(["a", "b", "ac", np.nan, ""], dtype=any_string_dtype)
    result = s.str.replace("^.$", "a", regex=True)
    expected = Series(["a", "a", "ac", np.nan, ""], dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("regex", [True, False])
def test_replace_regex_single_character(regex, any_string_dtype):
    """Check a single-character pattern ``"."`` honours the ``regex`` flag."""
    # https://github.com/pandas-dev/pandas/pull/24809, enforced in 2.0
    # GH 24804
    s = Series(["a.b", ".", "b", np.nan, ""], dtype=any_string_dtype)

    result = s.str.replace(".", "a", regex=regex)
    if regex:
        # "." matches every character
        expected = Series(["aaa", "a", "a", np.nan, ""], dtype=any_string_dtype)
    else:
        # "." matches only a literal dot
        expected = Series(["aab", "a", "b", np.nan, ""], dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------------------
|
||||
# str.match
|
||||
# --------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_match(any_string_dtype):
    """Check ``str.match`` returns booleans and anchors at the start of the string."""
    # New match behavior introduced in 0.13
    expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"

    # A pattern with capture groups still yields a boolean result.
    values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype)
    result = values.str.match(".*(BAD[_]+).*(BAD)")
    expected = Series([True, np.nan, False], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)

    values = Series(
        ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
    )
    result = values.str.match(".*BAD[_]+.*BAD")
    expected = Series([True, True, np.nan, False], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)

    # Without the leading ".*" only strings that start with "BAD" match.
    result = values.str.match("BAD[_]+.*BAD")
    expected = Series([False, True, np.nan, False], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)

    # "^" as an anchor versus an escaped, literal "^".
    values = Series(
        ["fooBAD__barBAD", "^BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
    )
    result = values.str.match("^BAD[_]+.*BAD")
    expected = Series([False, False, np.nan, False], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)

    result = values.str.match("\\^BAD[_]+.*BAD")
    expected = Series([False, True, np.nan, False], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_match_mixed_object():
    """Check ``str.match`` on a Series mixing strings with non-string values."""
    mixed = Series(
        [
            "aBAD_BAD",
            np.nan,
            "BAD_b_BAD",
            True,
            datetime.today(),
            "foo",
            None,
            1,
            2.0,
        ]
    )
    # ``mixed`` is already a Series, so it is used directly.
    result = mixed.str.match(".*(BAD[_]+).*(BAD)")
    expected = Series([True, np.nan, True, np.nan, np.nan, False, None, np.nan, np.nan])
    assert isinstance(result, Series)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_match_na_kwarg(any_string_dtype):
    """Check ``str.match`` with ``na=False`` and with ``na`` left at its default."""
    # GH #6609
    s = Series(["a", "b", np.nan], dtype=any_string_dtype)

    # With na=False the missing entry is expected to come back as False.
    result = s.str.match("a", na=False)
    expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean"
    expected = Series([True, False, False], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)

    # Without ``na`` the missing entry is expected to stay missing.
    result = s.str.match("a")
    expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"
    expected = Series([True, False, np.nan], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_match_case_kwarg(any_string_dtype):
    """Check that ``str.match(..., case=False)`` matches regardless of letter case."""
    values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
    result = values.str.match("ab", case=False)
    expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean"
    expected = Series([True, True, True, True], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------------------
|
||||
# str.fullmatch
|
||||
# --------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_fullmatch(any_string_dtype):
    """Check ``str.fullmatch`` on data containing a missing value."""
    # GH 32806
    ser = Series(
        ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
    )
    result = ser.str.fullmatch(".*BAD[_]+.*BAD")
    expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"
    expected = Series([True, False, np.nan, False], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_fullmatch_dollar_literal(any_string_dtype):
    """Check ``str.fullmatch`` with an escaped ``$`` in the pattern."""
    # GH 56652
    ser = Series(["foo", "foo$foo", np.nan, "foo$"], dtype=any_string_dtype)
    result = ser.str.fullmatch("foo\\$")
    expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"
    expected = Series([False, False, np.nan, True], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_fullmatch_na_kwarg(any_string_dtype):
    """Check ``str.fullmatch(..., na=False)``: the missing entry becomes False."""
    ser = Series(
        ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
    )
    result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False)
    expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean"
    expected = Series([True, False, False, False], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_fullmatch_case_kwarg(any_string_dtype):
    """Check ``str.fullmatch`` with ``case=True``, ``case=False`` and ``re.IGNORECASE``."""
    ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
    expected_dtype = np.bool_ if any_string_dtype in object_pyarrow_numpy else "boolean"

    expected = Series([True, False, False, False], dtype=expected_dtype)

    result = ser.str.fullmatch("ab", case=True)
    tm.assert_series_equal(result, expected)

    expected = Series([True, True, False, False], dtype=expected_dtype)

    result = ser.str.fullmatch("ab", case=False)
    tm.assert_series_equal(result, expected)

    # Passing ``flags`` may emit a PerformanceWarning when
    # ``using_pyarrow(any_string_dtype)`` is true; the expected values are
    # the same as for ``case=False``.
    with tm.maybe_produces_warning(PerformanceWarning, using_pyarrow(any_string_dtype)):
        result = ser.str.fullmatch("ab", flags=re.IGNORECASE)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------------------
|
||||
# str.findall
|
||||
# --------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_findall(any_string_dtype):
    """Check ``str.findall`` returns per-element lists of matches."""
    ser = Series(["fooBAD__barBAD", np.nan, "foo", "BAD"], dtype=any_string_dtype)
    result = ser.str.findall("BAD[_]*")
    expected = Series([["BAD__", "BAD"], np.nan, [], ["BAD"]])
    # Adjust the missing-value marker in ``expected`` to suit the dtype of ``ser``.
    expected = _convert_na_value(ser, expected)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_findall_mixed_object():
    """Check ``str.findall`` on a Series mixing strings with non-string values."""
    ser = Series(
        [
            "fooBAD__barBAD",
            np.nan,
            "foo",
            True,
            datetime.today(),
            "BAD",
            None,
            1,
            2.0,
        ]
    )

    result = ser.str.findall("BAD[_]*")
    expected = Series(
        [
            ["BAD__", "BAD"],
            np.nan,
            [],
            np.nan,
            np.nan,
            ["BAD"],
            None,
            np.nan,
            np.nan,
        ]
    )

    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------------------
|
||||
# str.find
|
||||
# --------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_find(any_string_dtype):
    """Check ``str.find`` / ``str.rfind`` with and without start/end bounds.

    Each result is compared both with a hard-coded expected Series and with
    the values produced by the built-in ``str.find`` / ``str.rfind`` applied
    element by element.
    """
    ser = Series(
        ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"], dtype=any_string_dtype
    )
    expected_dtype = np.int64 if any_string_dtype in object_pyarrow_numpy else "Int64"

    result = ser.str.find("EF")
    expected = Series([4, 3, 1, 0, -1], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)
    expected = np.array([v.find("EF") for v in np.array(ser)], dtype=np.int64)
    tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected)

    result = ser.str.rfind("EF")
    expected = Series([4, 5, 7, 4, -1], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)
    expected = np.array([v.rfind("EF") for v in np.array(ser)], dtype=np.int64)
    tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected)

    # With a start position.
    result = ser.str.find("EF", 3)
    expected = Series([4, 3, 7, 4, -1], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)
    expected = np.array([v.find("EF", 3) for v in np.array(ser)], dtype=np.int64)
    tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected)

    result = ser.str.rfind("EF", 3)
    expected = Series([4, 5, 7, 4, -1], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)
    expected = np.array([v.rfind("EF", 3) for v in np.array(ser)], dtype=np.int64)
    tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected)

    # With both start and end positions.
    result = ser.str.find("EF", 3, 6)
    expected = Series([4, 3, -1, 4, -1], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)
    expected = np.array([v.find("EF", 3, 6) for v in np.array(ser)], dtype=np.int64)
    tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected)

    result = ser.str.rfind("EF", 3, 6)
    expected = Series([4, 3, -1, 4, -1], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)
    expected = np.array([v.rfind("EF", 3, 6) for v in np.array(ser)], dtype=np.int64)
    tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected)
|
||||
|
||||
|
||||
def test_find_bad_arg_raises(any_string_dtype):
    """Check ``str.find`` / ``str.rfind`` raise ``TypeError`` for an int argument."""
    ser = Series([], dtype=any_string_dtype)
    with pytest.raises(TypeError, match="expected a string object, not int"):
        ser.str.find(0)

    with pytest.raises(TypeError, match="expected a string object, not int"):
        ser.str.rfind(0)
|
||||
|
||||
|
||||
def test_find_nan(any_string_dtype):
    """Check ``str.find`` / ``str.rfind`` keep missing values missing.

    The expected dtype is ``np.float64`` when ``any_string_dtype`` is listed
    in ``object_pyarrow_numpy`` and ``"Int64"`` otherwise.
    """
    ser = Series(
        ["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"], dtype=any_string_dtype
    )
    expected_dtype = np.float64 if any_string_dtype in object_pyarrow_numpy else "Int64"

    result = ser.str.find("EF")
    expected = Series([4, np.nan, 1, np.nan, -1], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)

    result = ser.str.rfind("EF")
    expected = Series([4, np.nan, 7, np.nan, -1], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)

    result = ser.str.find("EF", 3)
    expected = Series([4, np.nan, 7, np.nan, -1], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)

    result = ser.str.rfind("EF", 3)
    expected = Series([4, np.nan, 7, np.nan, -1], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)

    result = ser.str.find("EF", 3, 6)
    expected = Series([4, np.nan, -1, np.nan, -1], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)

    result = ser.str.rfind("EF", 3, 6)
    expected = Series([4, np.nan, -1, np.nan, -1], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------------------
|
||||
# str.translate
|
||||
# --------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
)
def test_translate(index_or_series, any_string_dtype, infer_string):
    """Check ``str.translate`` with a ``str.maketrans`` table.

    ``index_or_series`` is called as the constructor for both the input and
    the expected object.
    """
    # NOTE(review): ``infer_string`` is parametrized but never read in this
    # body - confirm whether an option context was intended.
    obj = index_or_series(
        ["abcdefg", "abcc", "cdddfg", "cdefggg"], dtype=any_string_dtype
    )
    table = str.maketrans("abc", "cde")
    result = obj.str.translate(table)
    expected = index_or_series(
        ["cdedefg", "cdee", "edddfg", "edefggg"], dtype=any_string_dtype
    )
    tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
def test_translate_mixed_object():
    """Check ``str.translate`` yields NaN for a non-string element."""
    # Series with non-string values
    s = Series(["a", "b", "c", 1.2])
    table = str.maketrans("abc", "cde")
    expected = Series(["c", "d", "e", np.nan], dtype=object)
    result = s.str.translate(table)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_flags_kwarg(any_string_dtype):
    """Check that ``flags=re.IGNORECASE`` is honoured by several ``.str`` methods.

    Covers ``extract``, ``match``, ``fullmatch``, ``findall``, ``count`` and
    ``contains`` using an upper-case-only e-mail pattern on lower-case data.
    """
    data = {
        "Dave": "dave@google.com",
        "Steve": "steve@gmail.com",
        "Rob": "rob@gmail.com",
        "Wes": np.nan,
    }
    data = Series(data, dtype=any_string_dtype)

    pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"

    use_pyarrow = using_pyarrow(any_string_dtype)

    result = data.str.extract(pat, flags=re.IGNORECASE, expand=True)
    assert result.iloc[0].tolist() == ["dave", "google", "com"]

    with tm.maybe_produces_warning(PerformanceWarning, use_pyarrow):
        result = data.str.match(pat, flags=re.IGNORECASE)
    assert result.iloc[0]

    with tm.maybe_produces_warning(PerformanceWarning, use_pyarrow):
        result = data.str.fullmatch(pat, flags=re.IGNORECASE)
    assert result.iloc[0]

    result = data.str.findall(pat, flags=re.IGNORECASE)
    assert result.iloc[0][0] == ("dave", "google", "com")

    result = data.str.count(pat, flags=re.IGNORECASE)
    assert result.iloc[0] == 1

    # ``contains`` with a grouped pattern is expected to emit a UserWarning;
    # extra warnings are tolerated only when ``use_pyarrow`` is true.
    msg = "has match groups"
    with tm.assert_produces_warning(
        UserWarning, match=msg, raise_on_extra_warnings=not use_pyarrow
    ):
        result = data.str.contains(pat, flags=re.IGNORECASE)
    assert result.iloc[0]
|
@ -0,0 +1,53 @@
|
||||
import numpy as np
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
_testing as tm,
|
||||
)
|
||||
|
||||
|
||||
def test_get_dummies(any_string_dtype):
    """Check ``Series.str.get_dummies`` with ``|`` and ``;`` separators."""
    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
    result = s.str.get_dummies("|")
    expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"))
    tm.assert_frame_equal(result, expected)

    # The integer 7 is expected to show up as the column label "7".
    s = Series(["a;b", "a", 7], dtype=any_string_dtype)
    result = s.str.get_dummies(";")
    expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]], columns=list("7ab"))
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_get_dummies_index():
    """Check ``Index.str.get_dummies`` returns a MultiIndex named by the dummies."""
    # GH9980, GH8028
    idx = Index(["a|b", "a|c", "b|c"])
    result = idx.str.get_dummies("|")

    expected = MultiIndex.from_tuples(
        [(1, 1, 0), (1, 0, 1), (0, 1, 1)], names=("a", "b", "c")
    )
    tm.assert_index_equal(result, expected)
|
||||
|
||||
|
||||
def test_get_dummies_with_name_dummy(any_string_dtype):
    """Check ``Series.str.get_dummies`` when one dummy is literally ``"name"``."""
    # GH 12180
    # Dummies named 'name' should work as expected
    s = Series(["a", "b,name", "b"], dtype=any_string_dtype)
    result = s.str.get_dummies(",")
    expected = DataFrame([[1, 0, 0], [0, 1, 1], [0, 1, 0]], columns=["a", "b", "name"])
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_get_dummies_with_name_dummy_index():
    """Check ``Index.str.get_dummies`` when one dummy is literally ``"name"``."""
    # GH 12180
    # Dummies named 'name' should work as expected
    idx = Index(["a|b", "name|c", "b|name"])
    result = idx.str.get_dummies("|")

    expected = MultiIndex.from_tuples(
        [(1, 1, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1)], names=("a", "b", "c", "name")
    )
    tm.assert_index_equal(result, expected)
|
@ -0,0 +1,734 @@
|
||||
from datetime import datetime
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
_testing as tm,
|
||||
)
|
||||
from pandas.tests.strings import (
|
||||
_convert_na_value,
|
||||
object_pyarrow_numpy,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["split", "rsplit"])
def test_split(any_string_dtype, method):
    """Check ``str.split`` / ``str.rsplit`` on a single-character separator."""
    values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)

    result = getattr(values.str, method)("_")
    exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
    # Adjust the missing-value marker in ``exp`` to suit the dtype of ``values``.
    exp = _convert_na_value(values, exp)
    tm.assert_series_equal(result, exp)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["split", "rsplit"])
def test_split_more_than_one_char(any_string_dtype, method):
    """Check ``str.split`` / ``str.rsplit`` on a two-character separator."""
    # more than one char
    values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"], dtype=any_string_dtype)
    result = getattr(values.str, method)("__")
    exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
    exp = _convert_na_value(values, exp)
    tm.assert_series_equal(result, exp)

    # An explicit expand=False is expected to give the same result.
    result = getattr(values.str, method)("__", expand=False)
    tm.assert_series_equal(result, exp)
|
||||
|
||||
|
||||
def test_split_more_regex_split(any_string_dtype):
    """Check ``str.split`` with a character-class pattern."""
    # regex split
    values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype)
    result = values.str.split("[,_]")
    exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
    exp = _convert_na_value(values, exp)
    tm.assert_series_equal(result, exp)
|
||||
|
||||
|
||||
def test_split_regex(any_string_dtype):
    """Check ``str.split`` with ``regex=True`` and an escaped-dot pattern."""
    # GH 43563
    # explicit regex = True split
    values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype)
    result = values.str.split(r"\.jpg", regex=True)
    exp = Series([["xxxjpgzzz", ""]])
    tm.assert_series_equal(result, exp)
|
||||
|
||||
|
||||
def test_split_regex_explicit(any_string_dtype):
    """Check ``str.split`` with compiled, literal and implicit-regex patterns."""
    # explicit regex = True split with compiled regex
    regex_pat = re.compile(r".jpg")
    values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype)
    result = values.str.split(regex_pat)
    exp = Series([["xx", "zzz", ""]])
    tm.assert_series_equal(result, exp)

    # explicit regex = False split
    result = values.str.split(r"\.jpg", regex=False)
    exp = Series([["xxxjpgzzz.jpg"]])
    tm.assert_series_equal(result, exp)

    # non explicit regex split, pattern length == 1
    result = values.str.split(r".")
    exp = Series([["xxxjpgzzz", "jpg"]])
    tm.assert_series_equal(result, exp)

    # non explicit regex split, pattern length != 1
    result = values.str.split(r".jpg")
    exp = Series([["xx", "zzz", ""]])
    tm.assert_series_equal(result, exp)

    # regex=False with pattern compiled regex raises error
    with pytest.raises(
        ValueError,
        match="Cannot use a compiled regex as replacement pattern with regex=False",
    ):
        values.str.split(regex_pat, regex=False)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("expand", [None, False])
@pytest.mark.parametrize("method", ["split", "rsplit"])
def test_split_object_mixed(expand, method):
    """Check ``str.split`` / ``str.rsplit`` on a Series mixing strings and non-strings."""
    mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0])
    result = getattr(mixed.str, method)("_", expand=expand)
    exp = Series(
        [
            ["a", "b", "c"],
            np.nan,
            ["d", "e", "f"],
            np.nan,
            np.nan,
            None,
            np.nan,
            np.nan,
        ]
    )
    assert isinstance(result, Series)
    tm.assert_almost_equal(result, exp)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["split", "rsplit"])
@pytest.mark.parametrize("n", [None, 0])
def test_split_n(any_string_dtype, method, n):
    """Check ``str.split`` / ``str.rsplit`` with ``n=None`` and ``n=0``."""
    s = Series(["a b", pd.NA, "b c"], dtype=any_string_dtype)
    expected = Series([["a", "b"], pd.NA, ["b", "c"]])
    result = getattr(s.str, method)(" ", n=n)
    expected = _convert_na_value(s, expected)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_rsplit(any_string_dtype):
    """Check ``str.rsplit`` treats a character-class pattern as a literal string."""
    # regex split is not supported by rsplit
    values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype)
    result = values.str.rsplit("[,_]")
    exp = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]])
    exp = _convert_na_value(values, exp)
    tm.assert_series_equal(result, exp)
|
||||
|
||||
|
||||
def test_rsplit_max_number(any_string_dtype):
    """Check ``str.rsplit(..., n=1)`` splits once, starting from the right."""
    # setting max number of splits, make sure it's from reverse
    values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
    result = values.str.rsplit("_", n=1)
    exp = Series([["a_b", "c"], ["c_d", "e"], np.nan, ["f_g", "h"]])
    exp = _convert_na_value(values, exp)
    tm.assert_series_equal(result, exp)
|
||||
|
||||
|
||||
def test_split_blank_string(any_string_dtype):
    """Check ``str.split(expand=True)`` on a Series holding one empty string."""
    # expand blank split GH 20067
    values = Series([""], name="test", dtype=any_string_dtype)
    result = values.str.split(expand=True)
    exp = DataFrame([[]], dtype=any_string_dtype)  # NOTE: this is NOT an empty df
    tm.assert_frame_equal(result, exp)
|
||||
|
||||
|
||||
def test_split_blank_string_with_non_empty(any_string_dtype):
    """Check ``str.split(expand=True)`` when blank and non-blank strings are mixed."""
    values = Series(["a b c", "a b", "", " "], name="test", dtype=any_string_dtype)
    result = values.str.split(expand=True)
    exp = DataFrame(
        [
            ["a", "b", "c"],
            ["a", "b", None],
            [None, None, None],
            [None, None, None],
        ],
        dtype=any_string_dtype,
    )
    tm.assert_frame_equal(result, exp)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["split", "rsplit"])
def test_split_noargs(any_string_dtype, method):
    """Check ``str.split`` / ``str.rsplit`` called with no arguments."""
    # #1859
    s = Series(["Wes McKinney", "Travis Oliphant"], dtype=any_string_dtype)
    result = getattr(s.str, method)()
    expected = ["Travis", "Oliphant"]
    assert result[1] == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data, pat",
    [
        (["bd asdf jfg", "kjasdflqw asdfnfk"], None),
        (["bd asdf jfg", "kjasdflqw asdfnfk"], "asdf"),
        (["bd_asdf_jfg", "kjasdflqw_asdfnfk"], "_"),
    ],
)
@pytest.mark.parametrize("n", [-1, 0])
def test_split_maxsplit(data, pat, any_string_dtype, n):
    """Check ``str.split`` with ``n=-1`` or ``n=0`` equals a split without ``n``."""
    # re.split 0, str.split -1
    s = Series(data, dtype=any_string_dtype)

    result = s.str.split(pat=pat, n=n)
    xp = s.str.split(pat=pat)
    tm.assert_series_equal(result, xp)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "data, pat, expected",
    [
        (
            ["split once", "split once too!"],
            None,
            Series({0: ["split", "once"], 1: ["split", "once too!"]}),
        ),
        (
            ["split_once", "split_once_too!"],
            "_",
            Series({0: ["split", "once"], 1: ["split", "once_too!"]}),
        ),
    ],
)
def test_split_no_pat_with_nonzero_n(data, pat, expected, any_string_dtype):
    """Check ``str.split(..., n=1)`` splits only once, with and without a pattern."""
    s = Series(data, dtype=any_string_dtype)
    result = s.str.split(pat=pat, n=1)
    # The index type is deliberately not compared.
    tm.assert_series_equal(expected, result, check_index_type=False)
|
||||
|
||||
|
||||
def test_split_to_dataframe_no_splits(any_string_dtype):
    """Check ``str.split(expand=True)`` gives one column when nothing splits."""
    s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)
    result = s.str.split("_", expand=True)
    exp = DataFrame({0: Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)})
    tm.assert_frame_equal(result, exp)
|
||||
|
||||
|
||||
def test_split_to_dataframe(any_string_dtype):
    """Check ``str.split(expand=True)`` when every row splits into three parts."""
    s = Series(["some_equal_splits", "with_no_nans"], dtype=any_string_dtype)
    result = s.str.split("_", expand=True)
    exp = DataFrame(
        {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]},
        dtype=any_string_dtype,
    )
    tm.assert_frame_equal(result, exp)
|
||||
|
||||
|
||||
def test_split_to_dataframe_unequal_splits(any_string_dtype):
    """Check ``str.split(expand=True)`` pads shorter rows with missing values."""
    s = Series(
        ["some_unequal_splits", "one_of_these_things_is_not"], dtype=any_string_dtype
    )
    result = s.str.split("_", expand=True)
    exp = DataFrame(
        {
            0: ["some", "one"],
            1: ["unequal", "of"],
            2: ["splits", "these"],
            3: [None, "things"],
            4: [None, "is"],
            5: [None, "not"],
        },
        dtype=any_string_dtype,
    )
    tm.assert_frame_equal(result, exp)
|
||||
|
||||
|
||||
def test_split_to_dataframe_with_index(any_string_dtype):
    """Check ``str.split(expand=True)`` keeps the index and validates ``expand``."""
    s = Series(
        ["some_splits", "with_index"], index=["preserve", "me"], dtype=any_string_dtype
    )
    result = s.str.split("_", expand=True)
    exp = DataFrame(
        {0: ["some", "with"], 1: ["splits", "index"]},
        index=["preserve", "me"],
        dtype=any_string_dtype,
    )
    tm.assert_frame_equal(result, exp)

    # A non-boolean ``expand`` is expected to raise.
    with pytest.raises(ValueError, match="expand must be"):
        s.str.split("_", expand="not_a_boolean")
|
||||
|
||||
|
||||
def test_split_to_multiindex_expand_no_splits():
    """Check ``Index.str.split(expand=True)`` stays single-level when nothing splits."""
    # https://github.com/pandas-dev/pandas/issues/23677

    idx = Index(["nosplit", "alsonosplit", np.nan])
    result = idx.str.split("_", expand=True)
    exp = idx
    tm.assert_index_equal(result, exp)
    assert result.nlevels == 1
|
||||
|
||||
|
||||
def test_split_to_multiindex_expand():
    """Check ``Index.str.split(expand=True)`` builds a three-level MultiIndex."""
    idx = Index(["some_equal_splits", "with_no_nans", np.nan, None])
    result = idx.str.split("_", expand=True)
    exp = MultiIndex.from_tuples(
        [
            ("some", "equal", "splits"),
            ("with", "no", "nans"),
            [np.nan, np.nan, np.nan],
            [None, None, None],
        ]
    )
    tm.assert_index_equal(result, exp)
    assert result.nlevels == 3
|
||||
|
||||
|
||||
def test_split_to_multiindex_expand_unequal_splits():
    """Check ``Index.str.split(expand=True)`` pads shorter rows and validates ``expand``."""
    idx = Index(["some_unequal_splits", "one_of_these_things_is_not", np.nan, None])
    result = idx.str.split("_", expand=True)
    exp = MultiIndex.from_tuples(
        [
            ("some", "unequal", "splits", np.nan, np.nan, np.nan),
            ("one", "of", "these", "things", "is", "not"),
            (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan),
            (None, None, None, None, None, None),
        ]
    )
    tm.assert_index_equal(result, exp)
    assert result.nlevels == 6

    # A non-boolean ``expand`` is expected to raise.
    with pytest.raises(ValueError, match="expand must be"):
        idx.str.split("_", expand="not_a_boolean")
|
||||
|
||||
|
||||
def test_rsplit_to_dataframe_expand_no_splits(any_string_dtype):
    """Check ``str.rsplit(expand=True)`` gives one column when nothing splits."""
    s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)
    result = s.str.rsplit("_", expand=True)
    exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}, dtype=any_string_dtype)
    tm.assert_frame_equal(result, exp)
|
||||
|
||||
|
||||
def test_rsplit_to_dataframe_expand(any_string_dtype):
    """Check ``str.rsplit(expand=True)`` with no limit, ``n=2`` and ``n=1``."""
    s = Series(["some_equal_splits", "with_no_nans"], dtype=any_string_dtype)
    result = s.str.rsplit("_", expand=True)
    exp = DataFrame(
        {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]},
        dtype=any_string_dtype,
    )
    tm.assert_frame_equal(result, exp)

    # n=2 covers both separators, so the expected frame is unchanged.
    result = s.str.rsplit("_", expand=True, n=2)
    exp = DataFrame(
        {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]},
        dtype=any_string_dtype,
    )
    tm.assert_frame_equal(result, exp)

    # n=1 splits only at the right-most separator.
    result = s.str.rsplit("_", expand=True, n=1)
    exp = DataFrame(
        {0: ["some_equal", "with_no"], 1: ["splits", "nans"]}, dtype=any_string_dtype
    )
    tm.assert_frame_equal(result, exp)
|
||||
|
||||
|
||||
def test_rsplit_to_dataframe_expand_with_index(any_string_dtype):
    """Check ``str.rsplit(expand=True)`` keeps the original index."""
    s = Series(
        ["some_splits", "with_index"], index=["preserve", "me"], dtype=any_string_dtype
    )
    result = s.str.rsplit("_", expand=True)
    exp = DataFrame(
        {0: ["some", "with"], 1: ["splits", "index"]},
        index=["preserve", "me"],
        dtype=any_string_dtype,
    )
    tm.assert_frame_equal(result, exp)
|
||||
|
||||
|
||||
def test_rsplit_to_multiindex_expand_no_split():
    """Check ``Index.str.rsplit(expand=True)`` stays single-level when nothing splits."""
    idx = Index(["nosplit", "alsonosplit"])
    result = idx.str.rsplit("_", expand=True)
    exp = idx
    tm.assert_index_equal(result, exp)
    assert result.nlevels == 1
|
||||
|
||||
|
||||
def test_rsplit_to_multiindex_expand():
    """Check ``Index.str.rsplit(expand=True)`` builds a three-level MultiIndex."""
    idx = Index(["some_equal_splits", "with_no_nans"])
    result = idx.str.rsplit("_", expand=True)
    exp = MultiIndex.from_tuples([("some", "equal", "splits"), ("with", "no", "nans")])
    tm.assert_index_equal(result, exp)
    assert result.nlevels == 3
|
||||
|
||||
|
||||
def test_rsplit_to_multiindex_expand_n():
    """Check ``Index.str.rsplit(expand=True, n=1)`` builds a two-level MultiIndex."""
    idx = Index(["some_equal_splits", "with_no_nans"])
    result = idx.str.rsplit("_", expand=True, n=1)
    exp = MultiIndex.from_tuples([("some_equal", "splits"), ("with_no", "nans")])
    tm.assert_index_equal(result, exp)
    assert result.nlevels == 2
|
||||
|
||||
|
||||
def test_split_nan_expand(any_string_dtype):
    """Check ``str.split(expand=True)`` fills a missing row with the right NA marker."""
    # gh-18450
    s = Series(["foo,bar,baz", np.nan], dtype=any_string_dtype)
    result = s.str.split(",", expand=True)
    exp = DataFrame(
        [["foo", "bar", "baz"], [np.nan, np.nan, np.nan]], dtype=any_string_dtype
    )
    tm.assert_frame_equal(result, exp)

    # check that these are actually np.nan/pd.NA and not None
    # TODO see GH 18463
    # tm.assert_frame_equal does not differentiate
    if any_string_dtype in object_pyarrow_numpy:
        assert all(np.isnan(x) for x in result.iloc[1])
    else:
        assert all(x is pd.NA for x in result.iloc[1])
|
||||
|
||||
|
||||
def test_split_with_name_series(any_string_dtype):
    """Check ``Series.str.split`` on a named Series, with and without ``expand``."""
    # GH 12617

    # should preserve name
    s = Series(["a,b", "c,d"], name="xxx", dtype=any_string_dtype)
    res = s.str.split(",")
    exp = Series([["a", "b"], ["c", "d"]], name="xxx")
    tm.assert_series_equal(res, exp)

    res = s.str.split(",", expand=True)
    exp = DataFrame([["a", "b"], ["c", "d"]], dtype=any_string_dtype)
    tm.assert_frame_equal(res, exp)
|
||||
|
||||
|
||||
def test_split_with_name_index():
    """Check ``Index.str.split`` on a named Index, with and without ``expand``."""
    # GH 12617
    idx = Index(["a,b", "c,d"], name="xxx")
    res = idx.str.split(",")
    exp = Index([["a", "b"], ["c", "d"]], name="xxx")
    assert res.nlevels == 1
    tm.assert_index_equal(res, exp)

    # With expand=True the expected MultiIndex carries no names.
    res = idx.str.split(",", expand=True)
    exp = MultiIndex.from_tuples([("a", "b"), ("c", "d")])
    assert res.nlevels == 2
    tm.assert_index_equal(res, exp)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method, exp",
    [
        [
            "partition",
            [
                ("a", "__", "b__c"),
                ("c", "__", "d__e"),
                np.nan,
                ("f", "__", "g__h"),
                None,
            ],
        ],
        [
            "rpartition",
            [
                ("a__b", "__", "c"),
                ("c__d", "__", "e"),
                np.nan,
                ("f__g", "__", "h"),
                None,
            ],
        ],
    ],
)
def test_partition_series_more_than_one_char(method, exp, any_string_dtype):
    """Check ``str.partition`` / ``str.rpartition`` on a two-character separator."""
    # https://github.com/pandas-dev/pandas/issues/23558
    # more than one char
    s = Series(["a__b__c", "c__d__e", np.nan, "f__g__h", None], dtype=any_string_dtype)
    result = getattr(s.str, method)("__", expand=False)
    expected = Series(exp)
    # Adjust the missing-value marker in ``expected`` to suit the dtype of ``s``.
    expected = _convert_na_value(s, expected)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method, exp",
    [
        [
            "partition",
            [("a", " ", "b c"), ("c", " ", "d e"), np.nan, ("f", " ", "g h"), None],
        ],
        [
            "rpartition",
            [("a b", " ", "c"), ("c d", " ", "e"), np.nan, ("f g", " ", "h"), None],
        ],
    ],
)
def test_partition_series_none(any_string_dtype, method, exp):
    """Check ``str.partition`` / ``str.rpartition`` called without a separator."""
    # https://github.com/pandas-dev/pandas/issues/23558
    # None
    s = Series(["a b c", "c d e", np.nan, "f g h", None], dtype=any_string_dtype)
    result = getattr(s.str, method)(expand=False)
    expected = Series(exp)
    expected = _convert_na_value(s, expected)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method, exp",
    [
        [
            "partition",
            [("abc", "", ""), ("cde", "", ""), np.nan, ("fgh", "", ""), None],
        ],
        [
            "rpartition",
            [("", "", "abc"), ("", "", "cde"), np.nan, ("", "", "fgh"), None],
        ],
    ],
)
def test_partition_series_not_split(any_string_dtype, method, exp):
    """Check ``str.partition`` / ``str.rpartition`` when the separator is absent."""
    # https://github.com/pandas-dev/pandas/issues/23558
    # Not split
    s = Series(["abc", "cde", np.nan, "fgh", None], dtype=any_string_dtype)
    result = getattr(s.str, method)("_", expand=False)
    expected = Series(exp)
    expected = _convert_na_value(s, expected)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method, exp",
    [
        [
            "partition",
            [("a", "_", "b_c"), ("c", "_", "d_e"), np.nan, ("f", "_", "g_h")],
        ],
        [
            "rpartition",
            [("a_b", "_", "c"), ("c_d", "_", "e"), np.nan, ("f_g", "_", "h")],
        ],
    ],
)
def test_partition_series_unicode(any_string_dtype, method, exp):
    """Check ``str.partition`` / ``str.rpartition`` with ``expand=False`` on ``_``."""
    # https://github.com/pandas-dev/pandas/issues/23558
    # unicode
    s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)

    result = getattr(s.str, method)("_", expand=False)
    expected = Series(exp)
    expected = _convert_na_value(s, expected)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["partition", "rpartition"])
def test_partition_series_stdlib(any_string_dtype, method):
    """Check ``str.partition`` / ``str.rpartition`` agree with the built-in ``str`` methods."""
    # https://github.com/pandas-dev/pandas/issues/23558
    # compare to standard lib
    s = Series(["A_B_C", "B_C_D", "E_F_G", "EFGHEF"], dtype=any_string_dtype)
    result = getattr(s.str, method)("_", expand=False).tolist()
    assert result == [getattr(v, method)("_") for v in s]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method, expand, exp, exp_levels",
    [
        (
            "partition",
            False,
            np.array(
                [("a", "_", "b_c"), ("c", "_", "d_e"), ("f", "_", "g_h"), np.nan, None],
                dtype=object,
            ),
            1,
        ),
        (
            "rpartition",
            False,
            np.array(
                [("a_b", "_", "c"), ("c_d", "_", "e"), ("f_g", "_", "h"), np.nan, None],
                dtype=object,
            ),
            1,
        ),
    ],
)
def test_partition_index(method, expand, exp, exp_levels):
    """Check (r)partition on an Index, including the number of levels returned."""
    # https://github.com/pandas-dev/pandas/issues/23558

    idx = Index(["a_b_c", "c_d_e", "f_g_h", np.nan, None])

    partition = getattr(idx.str, method)
    result = partition("_", expand=expand)
    expected = Index(exp)
    tm.assert_index_equal(result, expected)
    assert result.nlevels == exp_levels
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method, exp",
    [
        (
            "partition",
            {
                0: ["a", "c", np.nan, "f", None],
                1: ["_", "_", np.nan, "_", None],
                2: ["b_c", "d_e", np.nan, "g_h", None],
            },
        ),
        (
            "rpartition",
            {
                0: ["a_b", "c_d", np.nan, "f_g", None],
                1: ["_", "_", np.nan, "_", None],
                2: ["c", "e", np.nan, "h", None],
            },
        ),
    ],
)
def test_partition_to_dataframe(any_string_dtype, method, exp):
    """Check that (r)partition without ``expand`` returns a three-column frame."""
    # https://github.com/pandas-dev/pandas/issues/23558

    ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None], dtype=any_string_dtype)
    partition = getattr(ser.str, method)
    result = partition("_")
    expected = DataFrame(exp, dtype=any_string_dtype)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method, exp",
    [
        (
            "partition",
            {
                0: ["a", "c", np.nan, "f", None],
                1: ["_", "_", np.nan, "_", None],
                2: ["b_c", "d_e", np.nan, "g_h", None],
            },
        ),
        (
            "rpartition",
            {
                0: ["a_b", "c_d", np.nan, "f_g", None],
                1: ["_", "_", np.nan, "_", None],
                2: ["c", "e", np.nan, "h", None],
            },
        ),
    ],
)
def test_partition_to_dataframe_from_series(any_string_dtype, method, exp):
    """Check that (r)partition with ``expand=True`` returns a three-column frame."""
    # https://github.com/pandas-dev/pandas/issues/23558
    ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None], dtype=any_string_dtype)
    partition = getattr(ser.str, method)
    result = partition("_", expand=True)
    expected = DataFrame(exp, dtype=any_string_dtype)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_partition_with_name(any_string_dtype):
    """Partition a named Series into a frame with columns 0, 1 and 2."""
    # GH 12617

    named = Series(["a,b", "c,d"], name="xxx", dtype=any_string_dtype)
    result = named.str.partition(",")
    columns = {0: ["a", "c"], 1: [",", ","], 2: ["b", "d"]}
    expected = DataFrame(columns, dtype=any_string_dtype)
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_partition_with_name_expand(any_string_dtype):
    """Partition a named Series with ``expand=False``; the name must survive."""
    # GH 12617
    # should preserve name
    named = Series(["a,b", "c,d"], name="xxx", dtype=any_string_dtype)
    result = named.str.partition(",", expand=False)
    tuples = [("a", ",", "b"), ("c", ",", "d")]
    expected = Series(tuples, name="xxx")
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_partition_index_with_name():
    """Partition a named Index; the result is compared to an unnamed 3-level MultiIndex."""
    named_idx = Index(["a,b", "c,d"], name="xxx")
    result = named_idx.str.partition(",")
    tuples = [("a", ",", "b"), ("c", ",", "d")]
    expected = MultiIndex.from_tuples(tuples)
    assert result.nlevels == 3
    tm.assert_index_equal(result, expected)
|
||||
|
||||
|
||||
def test_partition_index_with_name_expand_false():
    """Partition a named Index with ``expand=False``; one level, name kept."""
    named_idx = Index(["a,b", "c,d"], name="xxx")
    # should preserve name
    result = named_idx.str.partition(",", expand=False)
    tuples = np.array([("a", ",", "b"), ("c", ",", "d")])
    expected = Index(tuples, name="xxx")
    assert result.nlevels == 1
    tm.assert_index_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["partition", "rpartition"])
def test_partition_sep_kwarg(any_string_dtype, method):
    """Passing the separator as ``sep=`` must match passing it positionally."""
    # GH 22676; depr kwarg "pat" in favor of "sep"
    ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)

    partition = getattr(ser.str, method)
    expected = partition(sep="_")
    result = partition("_")
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_get():
    """``str.get(1)`` picks the second piece of each split string."""
    ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
    pieces = ser.str.split("_")
    result = pieces.str.get(1)
    expected = Series(["b", "d", np.nan, "g"], dtype=object)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_get_mixed_object():
    """``str.get`` on split results of a mixed-type object Series.

    Only the string entries are expected to produce a value; every other
    entry is expected to be missing.
    """
    mixed = ["a_b_c", np.nan, "c_d_e", True, datetime.today(), None, 1, 2.0]
    ser = Series(mixed)
    pieces = ser.str.split("_")
    result = pieces.str.get(1)
    expected = Series(
        ["b", np.nan, "d", np.nan, np.nan, None, np.nan, np.nan], dtype=object
    )
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("idx", [2, -3])
def test_get_bounds(idx):
    """A position outside a split list (third row) yields NaN rather than raising."""
    ser = Series(["1_2_3_4_5", "6_7_8_9_10", "11_12"])
    pieces = ser.str.split("_")
    result = pieces.str.get(idx)
    expected = Series(["3", "8", np.nan], dtype=object)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "idx, exp", [(2, [3, 3, np.nan, "b"]), (-1, [3, 3, np.nan, np.nan])]
)
def test_get_complex(idx, exp):
    """``str.get`` on a Series holding a tuple, a list, a set and a dict."""
    # GH 20671, getting value not in dict raising `KeyError`
    containers = [(1, 2, 3), [1, 2, 3], {1, 2, 3}, {1: "a", 2: "b", 3: "c"}]
    ser = Series(containers)

    result = ser.str.get(idx)
    expected = Series(exp)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("to_type", [tuple, list, np.array])
def test_get_complex_nested(to_type):
    """``str.get`` on a nested container: position 0 exists, position 1 does not."""
    ser = Series([to_type([to_type([1, 2])])])

    # Position 0 returns the inner container.
    first = ser.str.get(0)
    tm.assert_series_equal(first, Series([to_type([1, 2])]))

    # Position 1 is out of range and is expected to be NaN.
    second = ser.str.get(1)
    tm.assert_series_equal(second, Series([np.nan]))
|
||||
|
||||
|
||||
def test_get_strings(any_string_dtype):
    """``str.get(2)`` returns the third character, or a missing value for shorter strings."""
    ser = Series(["a", "ab", np.nan, "abc"], dtype=any_string_dtype)
    result = ser.str.get(2)
    third_chars = [np.nan, np.nan, np.nan, "c"]
    expected = Series(third_chars, dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)
|
@ -0,0 +1,112 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs import lib
|
||||
|
||||
from pandas import (
|
||||
NA,
|
||||
DataFrame,
|
||||
Series,
|
||||
_testing as tm,
|
||||
option_context,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore:Falling back")
def test_string_array(nullable_string_dtype, any_string_method):
    """Run one string method on object data and on nullable-string data and compare.

    The object-dtype result is the reference. The nullable-string result is
    first checked for its expected nullable dtype, then cast back so the two
    can be compared with ``tm.assert_equal``.
    """
    method_name, args, kwargs = any_string_method

    values = ["a", "bb", np.nan, "ccc"]
    object_ser = Series(values, dtype=object)
    string_ser = Series(values, dtype=nullable_string_dtype)

    def invoke(ser):
        # Apply the parametrized accessor method to ``ser``.
        return getattr(ser.str, method_name)(*args, **kwargs)

    if method_name == "decode":
        # decode is expected to reject str input outright.
        with pytest.raises(TypeError, match="a bytes-like object is required"):
            invoke(string_ser)
        return

    expected = invoke(object_ser)
    result = invoke(string_ser)

    if isinstance(expected, Series):
        if expected.dtype == "object" and lib.is_string_array(
            expected.dropna().values,
        ):
            assert result.dtype == nullable_string_dtype
            result = result.astype(object)

        elif expected.dtype == "object" and lib.is_bool_array(
            expected.values, skipna=True
        ):
            assert result.dtype == "boolean"
            result = result.astype(object)

        elif expected.dtype == "bool":
            assert result.dtype == "boolean"
            result = result.astype("bool")

        elif expected.dtype == "float" and expected.isna().any():
            assert result.dtype == "Int64"
            result = result.astype("float")

        if expected.dtype == object:
            # GH#18463
            expected[expected.isna()] = NA

    elif isinstance(expected, DataFrame):
        object_cols = expected.select_dtypes(include="object").columns
        assert all(result[object_cols].dtypes == nullable_string_dtype)
        result[object_cols] = result[object_cols].astype(object)
        with option_context("future.no_silent_downcasting", True):
            expected[object_cols] = expected[object_cols].fillna(NA)  # GH#18463

    tm.assert_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method,expected",
    [
        ("count", [2, None]),
        ("find", [0, None]),
        ("index", [0, None]),
        ("rindex", [2, None]),
    ],
)
def test_string_array_numeric_integer_array(nullable_string_dtype, method, expected):
    """Integer-returning methods on nullable strings are expected to give ``Int64``."""
    ser = Series(["aba", None], dtype=nullable_string_dtype)
    str_method = getattr(ser.str, method)
    result = str_method("a")
    expected = Series(expected, dtype="Int64")
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method,expected",
    [
        ("isdigit", [False, None, True]),
        ("isalpha", [True, None, False]),
        ("isalnum", [True, None, True]),
        ("isnumeric", [False, None, True]),
    ],
)
def test_string_array_boolean_array(nullable_string_dtype, method, expected):
    """``is*`` methods on nullable strings are expected to give ``boolean`` dtype."""
    ser = Series(["a", None, "1"], dtype=nullable_string_dtype)
    predicate = getattr(ser.str, method)
    result = predicate()
    expected = Series(expected, dtype="boolean")
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_string_array_extract(nullable_string_dtype):
    """``extract`` with several groups and ``expand=False`` on nullable strings."""
    # https://github.com/pandas-dev/pandas/issues/30969
    # Only expand=False & multiple groups was failing

    values = ["a1", "b2", "cc"]
    string_ser = Series(values, dtype=nullable_string_dtype)
    object_ser = Series(values, dtype="object")
    pat = r"(\w)(\d)"

    result = string_ser.str.extract(pat, expand=False)
    expected = object_ser.str.extract(pat, expand=False).fillna(NA)  # GH#18463
    assert all(result.dtypes == nullable_string_dtype)

    # Cast back to object so the frames can be compared directly.
    result = result.astype(object)
    tm.assert_equal(result, expected)
|
@ -0,0 +1,720 @@
|
||||
from datetime import (
|
||||
datetime,
|
||||
timedelta,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
from pandas.core.strings.accessor import StringMethods
|
||||
from pandas.tests.strings import object_pyarrow_numpy
|
||||
|
||||
|
||||
@pytest.mark.parametrize("pattern", [0, True, Series(["foo", "bar"])])
def test_startswith_endswith_non_str_patterns(pattern):
    """``startswith``/``endswith`` must raise ``TypeError`` for non-str patterns."""
    # GH3485
    ser = Series(["foo", "bar"])
    msg = f"expected a string or tuple, not {type(pattern).__name__}"
    for checker in (ser.str.startswith, ser.str.endswith):
        with pytest.raises(TypeError, match=msg):
            checker(pattern)
|
||||
|
||||
|
||||
def test_iter_raises():
    """Iterating over the ``.str`` accessor must raise ``TypeError``."""
    # GH 54173
    accessor = Series(["foo", "bar"]).str
    with pytest.raises(TypeError, match="'StringMethods' object is not iterable"):
        iter(accessor)
|
||||
|
||||
|
||||
# test integer/float dtypes (inferred by constructor) and mixed
|
||||
|
||||
|
||||
def test_count(any_string_dtype):
    """``str.count`` with a regex; the result dtype depends on the input dtype."""
    ser = Series(["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=any_string_dtype)
    result = ser.str.count("f[o]+")
    if any_string_dtype in object_pyarrow_numpy:
        expected_dtype = np.float64
    else:
        expected_dtype = "Int64"
    expected = Series([1, 2, np.nan, 4], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_count_mixed_object():
    """``str.count`` on mixed objects: non-string entries are expected to give NaN."""
    mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]
    ser = Series(mixed, dtype=object)
    result = ser.str.count("a")
    counts = [1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan]
    expected = Series(counts)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_repeat(any_string_dtype):
    """``str.repeat`` with a scalar count and with one count per element."""
    ser = Series(["a", "b", np.nan, "c", np.nan, "d"], dtype=any_string_dtype)

    # Scalar repeat count.
    scalar_result = ser.str.repeat(3)
    scalar_expected = Series(
        ["aaa", "bbb", np.nan, "ccc", np.nan, "ddd"], dtype=any_string_dtype
    )
    tm.assert_series_equal(scalar_result, scalar_expected)

    # One repeat count per element.
    sequence_result = ser.str.repeat([1, 2, 3, 4, 5, 6])
    sequence_expected = Series(
        ["a", "bb", np.nan, "cccc", np.nan, "dddddd"], dtype=any_string_dtype
    )
    tm.assert_series_equal(sequence_result, sequence_expected)
|
||||
|
||||
|
||||
def test_repeat_mixed_object():
    """``str.repeat`` on mixed objects: only string entries are repeated."""
    mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]
    ser = Series(mixed)
    result = ser.str.repeat(3)
    repeated = ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", None, np.nan, np.nan]
    expected = Series(repeated, dtype=object)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("arg, repeat", [(None, 4), ("b", None)])
def test_repeat_with_null(any_string_dtype, arg, repeat):
    """A missing value or a missing repeat count is expected to give a missing result."""
    # GH: 31632
    ser = Series(["a", arg], dtype=any_string_dtype)
    counts = [3, repeat]
    result = ser.str.repeat(counts)
    expected = Series(["aaa", None], dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_empty_str_methods(any_string_dtype):
    """Call the string methods on an empty Series and check the empty results.

    Each call is compared with an empty Series or DataFrame of the dtype
    that method is expected to return.
    """
    empty_str = empty = Series(dtype=any_string_dtype)
    object_like = any_string_dtype in object_pyarrow_numpy
    empty_int = Series(dtype="int64" if object_like else "Int64")
    empty_bool = Series(dtype=bool if object_like else "boolean")
    empty_object = Series(dtype=object)
    empty_bytes = Series(dtype=object)
    empty_df = DataFrame()

    # GH7241
    # (extract) on empty series

    tm.assert_series_equal(empty_str, empty.str.cat(empty))
    assert "" == empty.str.cat()
    tm.assert_series_equal(empty_str, empty.str.title())
    tm.assert_series_equal(empty_int, empty.str.count("a"))
    tm.assert_series_equal(empty_bool, empty.str.contains("a"))
    for name in ("startswith", "endswith"):
        tm.assert_series_equal(empty_bool, getattr(empty.str, name)("a"))
    for name in ("lower", "upper"):
        tm.assert_series_equal(empty_str, getattr(empty.str, name)())
    tm.assert_series_equal(empty_str, empty.str.replace("a", "b"))
    tm.assert_series_equal(empty_str, empty.str.repeat(3))
    tm.assert_series_equal(empty_bool, empty.str.match("^a"))

    one_group = DataFrame(columns=[0], dtype=any_string_dtype)
    two_groups = DataFrame(columns=[0, 1], dtype=any_string_dtype)
    tm.assert_frame_equal(one_group, empty.str.extract("()", expand=True))
    tm.assert_frame_equal(two_groups, empty.str.extract("()()", expand=True))
    tm.assert_series_equal(empty_str, empty.str.extract("()", expand=False))
    tm.assert_frame_equal(two_groups, empty.str.extract("()()", expand=False))

    tm.assert_frame_equal(empty_df.set_axis([], axis=1), empty.str.get_dummies())
    tm.assert_series_equal(empty_str, empty_str.str.join(""))
    tm.assert_series_equal(empty_int, empty.str.len())
    tm.assert_series_equal(empty_object, empty_str.str.findall("a"))
    for name in ("find", "rfind"):
        tm.assert_series_equal(empty_int, getattr(empty.str, name)("a"))
    for name in ("pad", "center"):
        tm.assert_series_equal(empty_str, getattr(empty.str, name)(42))
    for name in ("split", "rsplit"):
        tm.assert_series_equal(empty_object, getattr(empty.str, name)("a"))
    for name in ("partition", "rpartition"):
        splitter = getattr(empty.str, name)
        tm.assert_series_equal(empty_object, splitter("a", expand=False))
        tm.assert_frame_equal(empty_df, splitter("a"))
    tm.assert_series_equal(empty_str, empty.str.slice(stop=1))
    tm.assert_series_equal(empty_str, empty.str.slice(step=1))
    for name in ("strip", "lstrip", "rstrip"):
        tm.assert_series_equal(empty_str, getattr(empty.str, name)())
    tm.assert_series_equal(empty_str, empty.str.wrap(42))
    tm.assert_series_equal(empty_str, empty.str.get(0))
    tm.assert_series_equal(empty_object, empty_bytes.str.decode("ascii"))
    tm.assert_series_equal(empty_bytes, empty.str.encode("ascii"))

    # ismethods should always return boolean (GH 29624)
    predicates = (
        "isalnum",
        "isalpha",
        "isdigit",
        "isspace",
        "islower",
        "isupper",
        "istitle",
        "isnumeric",
        "isdecimal",
    )
    for name in predicates:
        tm.assert_series_equal(empty_bool, getattr(empty.str, name)())

    for name in ("capitalize", "swapcase"):
        tm.assert_series_equal(empty_str, getattr(empty.str, name)())
    tm.assert_series_equal(empty_str, empty.str.normalize("NFC"))

    table = str.maketrans("a", "b")
    tm.assert_series_equal(empty_str, empty.str.translate(table))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method, expected",
    [
        ["isalnum", [True, True, True, True, True, False, True, True, False, False]],
        ["isalpha", [True, True, True, False, False, False, True, False, False, False]],
        [
            "isdigit",
            [False, False, False, True, False, False, False, True, False, False],
        ],
        [
            "isnumeric",
            [False, False, False, True, False, False, False, True, False, False],
        ],
        [
            "isspace",
            [False, False, False, False, False, False, False, False, False, True],
        ],
        [
            "islower",
            [False, True, False, False, False, False, False, False, False, False],
        ],
        [
            "isupper",
            [True, False, False, False, True, False, True, False, False, False],
        ],
        [
            "istitle",
            [True, False, True, False, True, False, False, False, False, False],
        ],
    ],
)
def test_ismethods(method, expected, any_string_dtype):
    """Check each ``str.is*`` method against fixed answers and against ``str``."""
    values = ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "]
    ser = Series(values, dtype=any_string_dtype)
    if any_string_dtype in object_pyarrow_numpy:
        expected_dtype = "bool"
    else:
        expected_dtype = "boolean"
    expected = Series(expected, dtype=expected_dtype)
    result = getattr(ser.str, method)()
    tm.assert_series_equal(result, expected)

    # compare with standard library
    stdlib_answers = [getattr(item, method)() for item in ser]
    assert list(result) == stdlib_answers
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method, expected",
    [
        ("isnumeric", [False, True, True, False, True, True, False]),
        ("isdecimal", [False, True, False, False, False, True, False]),
    ],
)
def test_isnumeric_unicode(method, expected, any_string_dtype):
    """Check ``isnumeric``/``isdecimal`` on non-ASCII numeric characters.

    The result is compared with fixed answers and with the ``str`` method
    of the same name applied element by element.
    """
    # 0x00bc: VULGAR FRACTION ONE QUARTER
    # 0x2605: BLACK STAR, not a number
    # 0x1378: ETHIOPIC NUMBER SEVENTY
    # 0xFF13: FULLWIDTH DIGIT THREE
    # Escapes are used so the characters cannot be silently normalized to
    # their ASCII look-alikes; the sixth entry previously was a plain "3",
    # so the fullwidth digit documented above was never exercised.
    ser = Series(
        ["A", "3", "\u00bc", "\u2605", "\u1378", "\uff13", "four"],
        dtype=any_string_dtype,
    )
    expected_dtype = "bool" if any_string_dtype in object_pyarrow_numpy else "boolean"
    expected = Series(expected, dtype=expected_dtype)
    result = getattr(ser.str, method)()
    tm.assert_series_equal(result, expected)

    # compare with standard library
    expected = [getattr(item, method)() for item in ser]
    assert list(result) == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method, expected",
    [
        ("isnumeric", [False, np.nan, True, False, np.nan, True, False]),
        ("isdecimal", [False, np.nan, False, False, np.nan, True, False]),
    ],
)
def test_isnumeric_unicode_missing(method, expected, any_string_dtype):
    """Check ``isnumeric``/``isdecimal`` on non-ASCII input with missing values."""
    # "\u00bc" is VULGAR FRACTION ONE QUARTER, "\u2605" is BLACK STAR and
    # "\uff13" is FULLWIDTH DIGIT THREE.  Escapes keep the fullwidth digit
    # from being normalized to an ASCII "3", which would leave the
    # non-ASCII decimal case untested.
    values = ["A", np.nan, "\u00bc", "\u2605", np.nan, "\uff13", "four"]
    ser = Series(values, dtype=any_string_dtype)
    expected_dtype = "object" if any_string_dtype in object_pyarrow_numpy else "boolean"
    expected = Series(expected, dtype=expected_dtype)
    result = getattr(ser.str, method)()
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_spilt_join_roundtrip(any_string_dtype):
    """Splitting on ``"_"`` and joining with ``"_"`` must give back the input values."""
    ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
    pieces = ser.str.split("_")
    result = pieces.str.join("_")
    # The round trip is compared against the object-dtype view of the input.
    expected = ser.astype(object)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_spilt_join_roundtrip_mixed_object():
    """Split/join round trip on mixed objects: non-strings are expected to be missing."""
    mixed = [
        "a_b", np.nan, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0
    ]
    ser = Series(mixed)
    pieces = ser.str.split("_")
    result = pieces.str.join("_")
    expected = Series(
        ["a_b", np.nan, "asdf_cas_asdf", np.nan, np.nan, "foo", None, np.nan, np.nan],
        dtype=object,
    )
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_len(any_string_dtype):
    """``str.len`` counts characters; the result dtype depends on the input dtype."""
    values = ["foo", "fooo", "fooooo", np.nan, "fooooooo", "foo\n", "あ"]
    ser = Series(values, dtype=any_string_dtype)
    result = ser.str.len()
    if any_string_dtype in object_pyarrow_numpy:
        expected_dtype = "float64"
    else:
        expected_dtype = "Int64"
    expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_len_mixed():
    """``str.len`` on mixed objects: non-string entries are expected to give NaN."""
    mixed = [
        "a_b", np.nan, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0
    ]
    ser = Series(mixed)
    result = ser.str.len()
    lengths = [3, np.nan, 13, np.nan, np.nan, 3, np.nan, np.nan, np.nan]
    expected = Series(lengths)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method,sub,start,end,expected",
    [
        ["index", "EF", None, None, [4, 3, 1, 0]],
        ["rindex", "EF", None, None, [4, 5, 7, 4]],
        ["index", "EF", 3, None, [4, 3, 7, 4]],
        ["rindex", "EF", 3, None, [4, 5, 7, 4]],
        ["index", "E", 4, 8, [4, 5, 7, 4]],
        ["rindex", "E", 0, 5, [4, 3, 1, 4]],
    ],
)
def test_index(method, sub, start, end, index_or_series, any_string_dtype, expected):
    """Check ``str.index``/``str.rindex`` on Index and Series, and against ``str``."""
    values = ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"]
    obj = index_or_series(values, dtype=any_string_dtype)
    if any_string_dtype in object_pyarrow_numpy:
        expected_dtype = np.int64
    else:
        expected_dtype = "Int64"
    expected = index_or_series(expected, dtype=expected_dtype)

    result = getattr(obj.str, method)(sub, start, end)

    # Use the comparison helper that matches the container under test.
    if index_or_series is not Series:
        tm.assert_index_equal(result, expected)
    else:
        tm.assert_series_equal(result, expected)

    # compare with standard library
    stdlib_positions = [getattr(item, method)(sub, start, end) for item in obj]
    assert list(result) == stdlib_positions
|
||||
|
||||
|
||||
def test_index_not_found_raises(index_or_series, any_string_dtype):
    """``str.index`` must raise ``ValueError`` when the substring is absent from an element."""
    values = ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"]
    obj = index_or_series(values, dtype=any_string_dtype)
    with pytest.raises(ValueError, match="substring not found"):
        obj.str.index("DE")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("method", ["index", "rindex"])
def test_index_wrong_type_raises(index_or_series, any_string_dtype, method):
    """``str.index``/``str.rindex`` must reject an integer substring."""
    obj = index_or_series([], dtype=any_string_dtype)
    msg = "expected a string object, not int"

    finder = getattr(obj.str, method)
    with pytest.raises(TypeError, match=msg):
        finder(0)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method, exp",
    [
        ("index", [1, 1, 0]),
        ("rindex", [3, 1, 2]),
    ],
)
def test_index_missing(any_string_dtype, method, exp):
    """``str.index``/``str.rindex`` keep a missing entry missing."""
    ser = Series(["abcb", "ab", "bcbe", np.nan], dtype=any_string_dtype)
    if any_string_dtype in object_pyarrow_numpy:
        expected_dtype = np.float64
    else:
        expected_dtype = "Int64"

    result = getattr(ser.str, method)("b")
    # The trailing NaN corresponds to the missing last element of ``ser``.
    expected = Series(exp + [np.nan], dtype=expected_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_pipe_failures(any_string_dtype):
    """A literal ``"|"`` works as separator for ``split`` and as target for ``replace``."""
    # #2119
    ser = Series(["A|B|C"], dtype=any_string_dtype)

    split_result = ser.str.split("|")
    split_expected = Series([["A", "B", "C"]], dtype=object)
    tm.assert_series_equal(split_result, split_expected)

    replace_result = ser.str.replace("|", " ", regex=False)
    replace_expected = Series(["A B C"], dtype=any_string_dtype)
    tm.assert_series_equal(replace_result, replace_expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "start, stop, step, expected",
    [
        [2, 5, None, ["foo", "bar", np.nan, "baz"]],
        [0, 3, -1, ["", "", np.nan, ""]],
        [None, None, -1, ["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"]],
        [3, 10, 2, ["oto", "ato", np.nan, "aqx"]],
        [3, 0, -1, ["ofa", "aba", np.nan, "aba"]],
    ],
)
def test_slice(start, stop, step, expected, any_string_dtype):
    """Check ``str.slice`` for several start/stop/step combinations."""
    values = ["aafootwo", "aabartwo", np.nan, "aabazqux"]
    ser = Series(values, dtype=any_string_dtype)
    result = ser.str.slice(start, stop, step)
    expected = Series(expected, dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "start, stop, step, expected",
    [
        [2, 5, None, ["foo", np.nan, "bar", np.nan, np.nan, None, np.nan, np.nan]],
        [4, 1, -1, ["oof", np.nan, "rab", np.nan, np.nan, None, np.nan, np.nan]],
    ],
)
def test_slice_mixed_object(start, stop, step, expected):
    """``str.slice`` on mixed objects: only string entries are sliced."""
    mixed = ["aafootwo", np.nan, "aabartwo", True, datetime.today(), None, 1, 2.0]
    ser = Series(mixed)
    result = ser.str.slice(start, stop, step)
    expected = Series(expected, dtype=object)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "start,stop,repl,expected",
    [
        [2, 3, None, ["shrt", "a it longer", "evnlongerthanthat", "", np.nan]],
        [2, 3, "z", ["shzrt", "a zit longer", "evznlongerthanthat", "z", np.nan]],
        [2, 2, "z", ["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]],
        [2, 1, "z", ["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]],
        [-1, None, "z", ["shorz", "a bit longez", "evenlongerthanthaz", "z", np.nan]],
        [None, -2, "z", ["zrt", "zer", "zat", "z", np.nan]],
        [6, 8, "z", ["shortz", "a bit znger", "evenlozerthanthat", "z", np.nan]],
        [-10, 3, "z", ["zrt", "a zit longer", "evenlongzerthanthat", "z", np.nan]],
    ],
)
def test_slice_replace(start, stop, repl, expected, any_string_dtype):
    """Check ``str.slice_replace`` for several start/stop/repl combinations."""
    values = ["short", "a bit longer", "evenlongerthanthat", "", np.nan]
    ser = Series(values, dtype=any_string_dtype)
    expected = Series(expected, dtype=any_string_dtype)
    result = ser.str.slice_replace(start, stop, repl)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method, exp",
    [
        ("strip", ["aa", "bb", np.nan, "cc"]),
        ("lstrip", ["aa ", "bb \n", np.nan, "cc "]),
        ("rstrip", [" aa", " bb", np.nan, "cc"]),
    ],
)
def test_strip_lstrip_rstrip(any_string_dtype, method, exp):
    """Check whitespace stripping with no argument given."""
    ser = Series([" aa ", " bb \n", np.nan, "cc "], dtype=any_string_dtype)

    stripper = getattr(ser.str, method)
    result = stripper()
    expected = Series(exp, dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method, exp",
    [
        ("strip", ["aa", np.nan, "bb"]),
        ("lstrip", ["aa ", np.nan, "bb \t\n"]),
        ("rstrip", [" aa", np.nan, " bb"]),
    ],
)
def test_strip_lstrip_rstrip_mixed_object(method, exp):
    """Whitespace stripping on mixed objects: only string entries are stripped."""
    mixed = [" aa ", np.nan, " bb \t\n", True, datetime.today(), None, 1, 2.0]
    ser = Series(mixed)

    stripper = getattr(ser.str, method)
    result = stripper()
    # The five trailing entries correspond to the non-string inputs.
    non_string_tail = [np.nan, np.nan, None, np.nan, np.nan]
    expected = Series(exp + non_string_tail, dtype=object)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "method, exp",
    [
        ("strip", ["ABC", " BNSD", "LDFJH "]),
        ("lstrip", ["ABCxx", " BNSD", "LDFJH xx"]),
        ("rstrip", ["xxABC", "xx BNSD", "LDFJH "]),
    ],
)
def test_strip_lstrip_rstrip_args(any_string_dtype, method, exp):
    """Check stripping of an explicit character (``"x"``)."""
    ser = Series(["xxABCxx", "xx BNSD", "LDFJH xx"], dtype=any_string_dtype)

    stripper = getattr(ser.str, method)
    result = stripper("x")
    expected = Series(exp, dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "prefix, expected", [["a", ["b", " b c", "bc"]], ["ab", ["", "a b c", "bc"]]]
)
def test_removeprefix(any_string_dtype, prefix, expected):
    """``str.removeprefix`` drops the prefix only where a string starts with it."""
    ser = Series(["ab", "a b c", "bc"], dtype=any_string_dtype)
    result = ser.str.removeprefix(prefix)
    tm.assert_series_equal(result, Series(expected, dtype=any_string_dtype))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "suffix, expected", [["c", ["ab", "a b ", "b"]], ["bc", ["ab", "a b c", ""]]]
)
def test_removesuffix(any_string_dtype, suffix, expected):
    """``str.removesuffix`` drops the suffix only where a string ends with it."""
    ser = Series(["ab", "a b c", "bc"], dtype=any_string_dtype)
    result = ser.str.removesuffix(suffix)
    tm.assert_series_equal(result, Series(expected, dtype=any_string_dtype))
|
||||
|
||||
|
||||
def test_string_slice_get_syntax(any_string_dtype):
    """Indexing ``.str[...]`` must match the equivalent ``get``/``slice`` call."""
    values = [
        "YYY", "B", "C", "YYYYYYbYYY", "BYYYcYYY", np.nan, "CYYYBYYY", "dog", "cYYYt"
    ]
    ser = Series(values, dtype=any_string_dtype)

    # Integer key <-> str.get
    tm.assert_series_equal(ser.str[0], ser.str.get(0))

    # Slice with a stop <-> str.slice(stop=...)
    tm.assert_series_equal(ser.str[:3], ser.str.slice(stop=3))

    # Slice with start and negative step <-> str.slice(start=..., step=...)
    tm.assert_series_equal(ser.str[2::-1], ser.str.slice(start=2, step=-1))
|
||||
|
||||
|
||||
def test_string_slice_out_of_bounds_nested():
    """``.str[1]`` on tuples: a tuple that is too short is expected to give NaN."""
    tuples = [(1, 2), (1,), (3, 4, 5)]
    ser = Series(tuples)
    result = ser.str[1]
    expected = Series([2, np.nan, 4])
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_string_slice_out_of_bounds(any_string_dtype):
    """``.str[1]`` on strings: a string that is too short is expected to give a missing value."""
    ser = Series(["foo", "b", "ba"], dtype=any_string_dtype)
    result = ser.str[1]
    second_chars = ["o", np.nan, "a"]
    expected = Series(second_chars, dtype=any_string_dtype)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_encode_decode(any_string_dtype):
    """``str.decode`` of UTF-8 encoded strings must match element-wise ``bytes.decode``."""
    text = Series(["a", "b", "a\xe4"], dtype=any_string_dtype)
    ser = text.str.encode("utf-8")
    result = ser.str.decode("utf-8")
    decoded = ser.map(lambda raw: raw.decode("utf-8"))
    expected = decoded.astype(object)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_encode_errors_kwarg(any_string_dtype):
    """``str.encode`` raises by default and honours an explicit ``errors`` argument."""
    ser = Series(["a", "b", "a\x9d"], dtype=any_string_dtype)

    msg = (
        r"'charmap' codec can't encode character '\\x9d' in position 1: "
        "character maps to <undefined>"
    )
    # Without an errors argument the unencodable character must raise.
    with pytest.raises(UnicodeEncodeError, match=msg):
        ser.str.encode("cp1252")

    # With "ignore" the result must match element-wise ``str.encode``.
    result = ser.str.encode("cp1252", "ignore")
    expected = ser.map(lambda text: text.encode("cp1252", "ignore"))
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_decode_errors_kwarg():
    """Check the ``errors`` argument of ``str.decode``.

    With the default error handling, decoding the byte ``0x9d`` as cp1252
    is expected to raise ``UnicodeDecodeError``; with ``"ignore"`` the
    result is compared against an element-wise
    ``bytes.decode("cp1252", "ignore")`` cast to object dtype.
    """
    ser = Series([b"a", b"b", b"a\x9d"])

    msg = (
        "'charmap' codec can't decode byte 0x9d in position 1: "
        "character maps to <undefined>"
    )
    with pytest.raises(UnicodeDecodeError, match=msg):
        ser.str.decode("cp1252")

    result = ser.str.decode("cp1252", "ignore")
    expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype(object)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "form, expected",
    [
        # NFKC folds full-width / half-width compatibility characters into
        # their canonical counterparts (ASCII letters/digits, regular katakana).
        ("NFKC", ["ABC", "ABC", "123", np.nan, "\u30a2\u30a4\u30a8"]),
        # NFC has no compatibility mapping, so the input comes back unchanged.
        (
            "NFC",
            [
                "ABC",
                "\uff21\uff22\uff23",
                "\uff11\uff12\uff13",
                np.nan,
                "\uff71\uff72\uff74",
            ],
        ),
    ],
)
def test_normalize(form, expected, any_string_dtype):
    """Check ``str.normalize`` for the NFKC and NFC forms.

    The input mixes plain ASCII, full-width Latin letters (U+FF21..U+FF23),
    full-width digits (U+FF11..U+FF13), a missing value and half-width
    katakana (U+FF71, U+FF72, U+FF74). The characters are spelled as
    escapes so they cannot be silently folded to their ASCII look-alikes.

    Parameters
    ----------
    form : str
        Unicode normalization form passed to ``str.normalize``.
    expected : list
        Values expected after normalization.
    any_string_dtype
        Fixture supplying the dtype used for both input and expected Series.
    """
    ser = Series(
        [
            "ABC",
            "\uff21\uff22\uff23",
            "\uff11\uff12\uff13",
            np.nan,
            "\uff71\uff72\uff74",
        ],
        index=["a", "b", "c", "d", "e"],
        dtype=any_string_dtype,
    )
    expected = Series(expected, index=["a", "b", "c", "d", "e"], dtype=any_string_dtype)
    result = ser.str.normalize(form)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_normalize_bad_arg_raises(any_string_dtype):
    """Check that ``str.normalize`` rejects an unknown normalization form.

    The input mixes ASCII, full-width and half-width characters (written as
    escapes) plus a missing value; passing ``"xxx"`` as the form is expected
    to raise ``ValueError``.

    Parameters
    ----------
    any_string_dtype
        Fixture supplying the dtype used to build the input Series.
    """
    ser = Series(
        [
            "ABC",
            "\uff21\uff22\uff23",
            "\uff11\uff12\uff13",
            np.nan,
            "\uff71\uff72\uff74",
        ],
        index=["a", "b", "c", "d", "e"],
        dtype=any_string_dtype,
    )
    with pytest.raises(ValueError, match="invalid normalization form"):
        ser.str.normalize("xxx")
|
||||
|
||||
|
||||
def test_normalize_index():
    """Check ``Index.str.normalize("NFKC")`` on compatibility characters.

    The input holds full-width Latin letters, full-width digits and
    half-width katakana (written as escapes so they cannot be silently
    folded); NFKC is expected to map them to ASCII letters/digits and
    regular katakana.
    """
    idx = Index(["\uff21\uff22\uff23", "\uff11\uff12\uff13", "\uff71\uff72\uff74"])
    expected = Index(["ABC", "123", "\u30a2\u30a4\u30a8"])
    result = idx.str.normalize("NFKC")
    tm.assert_index_equal(result, expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values,inferred_type",
    [
        (["a", "b"], "string"),
        (["a", "b", 1], "mixed-integer"),
        (["a", "b", 1.3], "mixed"),
        (["a", "b", 1.3, 1], "mixed-integer"),
        (["aa", datetime(2011, 1, 1)], "mixed"),
    ],
)
def test_index_str_accessor_visibility(values, inferred_type, index_or_series):
    """Check that ``.str`` is available when the values contain strings.

    Parameters
    ----------
    values : list
        Data used to build the Index or Series; each case contains strings.
    inferred_type : str
        Expected ``inferred_type``; only asserted for the Index case.
    index_or_series
        Fixture supplying the constructor (compared against ``Index`` below).
    """
    obj = index_or_series(values)
    if index_or_series is Index:
        assert obj.inferred_type == inferred_type

    assert isinstance(obj.str, StringMethods)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "values,inferred_type",
    [
        ([1, np.nan], "floating"),
        ([datetime(2011, 1, 1)], "datetime64"),
        ([timedelta(1)], "timedelta64"),
    ],
)
def test_index_str_accessor_non_string_values_raises(
    values, inferred_type, index_or_series
):
    """Check that ``.str`` raises when the values contain no strings.

    Parameters
    ----------
    values : list
        Non-string data used to build the Index or Series.
    inferred_type : str
        Expected ``inferred_type``; only asserted for the Index case.
    index_or_series
        Fixture supplying the constructor (compared against ``Index`` below).
    """
    obj = index_or_series(values)
    if index_or_series is Index:
        assert obj.inferred_type == inferred_type

    msg = "Can only use .str accessor with string values"
    with pytest.raises(AttributeError, match=msg):
        # Merely accessing the attribute is expected to raise.
        obj.str
|
||||
|
||||
|
||||
def test_index_str_accessor_multiindex_raises():
    """Check that ``.str`` raises ``AttributeError`` on a MultiIndex."""
    # A MultiIndex reports a "mixed" inferred type, but the accessor is
    # still not allowed on it.
    idx = MultiIndex.from_tuples([("a", "b"), ("a", "b")])
    assert idx.inferred_type == "mixed"

    msg = "Can only use .str accessor with Index, not MultiIndex"
    with pytest.raises(AttributeError, match=msg):
        # Merely accessing the attribute is expected to raise.
        idx.str
|
||||
|
||||
|
||||
def test_str_accessor_no_new_attributes(any_string_dtype):
    """Check that setting an unknown attribute on ``.str`` raises.

    Parameters
    ----------
    any_string_dtype
        Fixture supplying the dtype used to build the input Series.
    """
    # https://github.com/pandas-dev/pandas/issues/10673
    ser = Series(list("aabbcde"), dtype=any_string_dtype)
    with pytest.raises(AttributeError, match="You cannot add any new attribute"):
        ser.str.xlabel = "a"
|
||||
|
||||
|
||||
def test_cat_on_bytes_raises():
    """Check that ``str.cat`` raises ``TypeError`` for bytes values.

    Both operands are object-dtype Series built from ``"S1"`` NumPy arrays,
    i.e. their elements are ``bytes`` rather than ``str``.
    """
    lhs = Series(np.array(list("abc"), "S1").astype(object))
    rhs = Series(np.array(list("def"), "S1").astype(object))
    msg = "Cannot use .str.cat with values of inferred dtype 'bytes'"
    with pytest.raises(TypeError, match=msg):
        lhs.str.cat(rhs)
|
||||
|
||||
|
||||
def test_str_accessor_in_apply_func():
    """Check that ``.str`` works on the rows passed to ``DataFrame.apply``.

    Each row is upper-cased through the accessor and joined with ``"/"``.
    """
    # https://github.com/pandas-dev/pandas/issues/38979
    df = DataFrame(zip("abc", "def"))
    expected = Series(["A/D", "B/E", "C/F"])
    result = df.apply(lambda f: "/".join(f.str.upper()), axis=1)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_zfill():
    """Check ``str.zfill`` padding, including signs and non-string entries.

    In the first case the zeros are expected after a leading ``-``, values
    already at least as wide as the target are unchanged, and the
    non-string entries (``10`` and NaN) map to NaN. The second case checks
    both ``-`` and ``+`` prefixes.
    """
    # https://github.com/pandas-dev/pandas/issues/20868
    value = Series(["-1", "1", "1000", 10, np.nan])
    expected = Series(["-01", "001", "1000", np.nan, np.nan], dtype=object)
    tm.assert_series_equal(value.str.zfill(3), expected)

    value = Series(["-2", "+5"])
    expected = Series(["-0002", "+0005"])
    tm.assert_series_equal(value.str.zfill(5), expected)
|
||||
|
||||
|
||||
def test_zfill_with_non_integer_argument():
    """Check that ``str.zfill`` raises ``TypeError`` for a non-integer width.

    The expected message names the type of the offending argument.
    """
    value = Series(["-2", "+5"])
    wid = "a"
    msg = f"width must be of integer type, not {type(wid).__name__}"
    with pytest.raises(TypeError, match=msg):
        value.str.zfill(wid)
|
||||
|
||||
|
||||
def test_zfill_with_leading_sign():
    """Check that ``str.zfill`` inserts zeros after a leading ``+`` or ``-``.

    This holds whether the remainder of the string is numeric or not.
    """
    value = Series(["-cat", "-1", "+dog"])
    expected = Series(["-0cat", "-0001", "+0dog"])
    tm.assert_series_equal(value.str.zfill(5), expected)
|
||||
|
||||
|
||||
def test_get_with_dict_label():
    """Check ``str.get`` with a dictionary key on a Series of dicts.

    A key missing from one of the dicts is expected to yield ``None`` for
    that element; a key present in every dict yields all the values.
    """
    # GH47911
    s = Series(
        [
            {"name": "Hello", "value": "World"},
            {"name": "Goodbye", "value": "Planet"},
            {"value": "Sea"},
        ]
    )

    # "name" is absent from the last dict.
    result = s.str.get("name")
    expected = Series(["Hello", "Goodbye", None], dtype=object)
    tm.assert_series_equal(result, expected)

    # "value" is present in every dict.
    result = s.str.get("value")
    expected = Series(["World", "Planet", "Sea"], dtype=object)
    tm.assert_series_equal(result, expected)
|
||||
|
||||
|
||||
def test_series_str_decode():
    """Check ``str.decode`` called with ``encoding``/``errors`` as keywords.

    The decoded result is expected to be an object-dtype Series of strings.
    """
    # GH 22613
    result = Series([b"x", b"y"]).str.decode(encoding="UTF-8", errors="strict")
    expected = Series(["x", "y"], dtype="object")
    tm.assert_series_equal(result, expected)
|
Reference in New Issue
Block a user