forked from Alsan/Post_finder
venv
10 binary files changed (contents not shown).
@@ -0,0 +1,9 @@
import pytest


@pytest.fixture(params=["split", "records", "index", "columns", "values"])
def orient(request):
    """
    Fixture for orients excluding the table format.
    """
    return request.param
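
The hunk above is a pytest conftest fixture; the hunks that follow add what appear to be the pandas `tests/io/json` modules vendored with the virtualenv. For context, a test that requests the parametrized `orient` fixture runs once per parameter value. A minimal sketch of such a consumer (the test body and frame are illustrative, not part of the commit):

# --- illustrative sketch, not part of this commit ---
from io import StringIO

import pandas as pd

def test_roundtrip(orient):  # pytest injects each fixture param in turn
    df = pd.DataFrame({"a": [1, 2]}, index=["x", "y"])
    dumped = df.to_json(orient=orient)
    result = pd.read_json(StringIO(dumped), orient=orient)
    # "values" and "records" drop the index labels, so only shape is stable here
    assert result.shape == df.shape
# --- end sketch ---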
@@ -0,0 +1,130 @@
from io import (
    BytesIO,
    StringIO,
)

import pytest

import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm


def test_compression_roundtrip(compression):
    df = pd.DataFrame(
        [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
        index=["A", "B"],
        columns=["X", "Y", "Z"],
    )

    with tm.ensure_clean() as path:
        df.to_json(path, compression=compression)
        tm.assert_frame_equal(df, pd.read_json(path, compression=compression))

        # explicitly ensure file was compressed.
        with tm.decompress_file(path, compression) as fh:
            result = fh.read().decode("utf8")
            data = StringIO(result)
        tm.assert_frame_equal(df, pd.read_json(data))


def test_read_zipped_json(datapath):
    uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json")
    uncompressed_df = pd.read_json(uncompressed_path)

    compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip")
    compressed_df = pd.read_json(compressed_path, compression="zip")

    tm.assert_frame_equal(uncompressed_df, compressed_df)


@td.skip_if_not_us_locale
@pytest.mark.single_cpu
def test_with_s3_url(compression, s3_public_bucket, s3so):
    # Bucket created in tests/io/conftest.py
    df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))

    with tm.ensure_clean() as path:
        df.to_json(path, compression=compression)
        with open(path, "rb") as f:
            s3_public_bucket.put_object(Key="test-1", Body=f)

    roundtripped_df = pd.read_json(
        f"s3://{s3_public_bucket.name}/test-1",
        compression=compression,
        storage_options=s3so,
    )
    tm.assert_frame_equal(df, roundtripped_df)


def test_lines_with_compression(compression):
    with tm.ensure_clean() as path:
        df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
        df.to_json(path, orient="records", lines=True, compression=compression)
        roundtripped_df = pd.read_json(path, lines=True, compression=compression)
        tm.assert_frame_equal(df, roundtripped_df)


def test_chunksize_with_compression(compression):
    with tm.ensure_clean() as path:
        df = pd.read_json(StringIO('{"a": ["foo", "bar", "baz"], "b": [4, 5, 6]}'))
        df.to_json(path, orient="records", lines=True, compression=compression)

        with pd.read_json(
            path, lines=True, chunksize=1, compression=compression
        ) as res:
            roundtripped_df = pd.concat(res)
        tm.assert_frame_equal(df, roundtripped_df)


def test_write_unsupported_compression_type():
    df = pd.read_json(StringIO('{"a": [1, 2, 3], "b": [4, 5, 6]}'))
    with tm.ensure_clean() as path:
        msg = "Unrecognized compression type: unsupported"
        with pytest.raises(ValueError, match=msg):
            df.to_json(path, compression="unsupported")


def test_read_unsupported_compression_type():
    with tm.ensure_clean() as path:
        msg = "Unrecognized compression type: unsupported"
        with pytest.raises(ValueError, match=msg):
            pd.read_json(path, compression="unsupported")


@pytest.mark.parametrize(
    "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
)
@pytest.mark.parametrize("to_infer", [True, False])
@pytest.mark.parametrize("read_infer", [True, False])
def test_to_json_compression(
    compression_only, read_infer, to_infer, compression_to_extension, infer_string
):
    with pd.option_context("future.infer_string", infer_string):
        # see gh-15008
        compression = compression_only

        # We'll complete file extension subsequently.
        filename = "test."
        filename += compression_to_extension[compression]

        df = pd.DataFrame({"A": [1]})

        to_compression = "infer" if to_infer else compression
        read_compression = "infer" if read_infer else compression

        with tm.ensure_clean(filename) as path:
            df.to_json(path, compression=to_compression)
            result = pd.read_json(path, compression=read_compression)
            tm.assert_frame_equal(result, df)


def test_to_json_compression_mode(compression):
    # GH 39985 (read_json does not support user-provided binary files)
    expected = pd.DataFrame({"A": [1]})

    with BytesIO() as buffer:
        expected.to_json(buffer, compression=compression)
        # df = pd.read_json(buffer, compression=compression)
        # tm.assert_frame_equal(expected, df)
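
Stripped of the test harness, the behavior this hunk pins down is plain `DataFrame.to_json` / `pandas.read_json` with a `compression` argument. A minimal sketch (the file name is invented; gzip is just one of the accepted codecs, and "infer" picks the codec from the file extension):

# --- illustrative sketch, not part of this commit ---
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
df.to_json("frame.json.gz", compression="gzip")  # write gzip-compressed JSON
roundtrip = pd.read_json("frame.json.gz", compression="infer")  # codec from ".gz"
assert roundtrip.equals(df)
# --- end sketch ---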
@@ -0,0 +1,21 @@
"""
Tests for the deprecated keyword arguments for `read_json`.
"""
from io import StringIO

import pandas as pd
import pandas._testing as tm

from pandas.io.json import read_json


def test_good_kwargs():
    df = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2])

    with tm.assert_produces_warning(None):
        data1 = StringIO(df.to_json(orient="split"))
        tm.assert_frame_equal(df, read_json(data1, orient="split"))
        data2 = StringIO(df.to_json(orient="columns"))
        tm.assert_frame_equal(df, read_json(data2, orient="columns"))
        data3 = StringIO(df.to_json(orient="index"))
        tm.assert_frame_equal(df, read_json(data3, orient="index"))
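
Each `orient` exercised above corresponds to a different JSON layout; `orient="split"`, for instance, emits separate columns/index/data arrays. A small sketch with arbitrary values:

# --- illustrative sketch, not part of this commit ---
import pandas as pd

df = pd.DataFrame({"A": [2, 4], "B": [3, 6]}, index=[0, 1])
print(df.to_json(orient="split"))
# {"columns":["A","B"],"index":[0,1],"data":[[2,3],[4,6]]}
# --- end sketch ---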
@@ -0,0 +1,873 @@
"""Tests for Table Schema integration."""
from collections import OrderedDict
from io import StringIO
import json

import numpy as np
import pytest

from pandas.core.dtypes.dtypes import (
    CategoricalDtype,
    DatetimeTZDtype,
    PeriodDtype,
)

import pandas as pd
from pandas import DataFrame
import pandas._testing as tm

from pandas.io.json._table_schema import (
    as_json_table_type,
    build_table_schema,
    convert_json_field_to_pandas_type,
    convert_pandas_type_to_json_field,
    set_default_names,
)


@pytest.fixture
def df_schema():
    return DataFrame(
        {
            "A": [1, 2, 3, 4],
            "B": ["a", "b", "c", "c"],
            "C": pd.date_range("2016-01-01", freq="d", periods=4),
            "D": pd.timedelta_range("1h", periods=4, freq="min"),
        },
        index=pd.Index(range(4), name="idx"),
    )


@pytest.fixture
def df_table():
    return DataFrame(
        {
            "A": [1, 2, 3, 4],
            "B": ["a", "b", "c", "c"],
            "C": pd.date_range("2016-01-01", freq="d", periods=4),
            "D": pd.timedelta_range("1h", periods=4, freq="min"),
            "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])),
            "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)),
            "G": [1.0, 2.0, 3, 4.0],
            "H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"),
        },
        index=pd.Index(range(4), name="idx"),
    )


class TestBuildSchema:
    def test_build_table_schema(self, df_schema, using_infer_string):
        result = build_table_schema(df_schema, version=False)
        expected = {
            "fields": [
                {"name": "idx", "type": "integer"},
                {"name": "A", "type": "integer"},
                {"name": "B", "type": "string"},
                {"name": "C", "type": "datetime"},
                {"name": "D", "type": "duration"},
            ],
            "primaryKey": ["idx"],
        }
        if using_infer_string:
            expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "string"}
        assert result == expected
        result = build_table_schema(df_schema)
        assert "pandas_version" in result

    def test_series(self):
        s = pd.Series([1, 2, 3], name="foo")
        result = build_table_schema(s, version=False)
        expected = {
            "fields": [
                {"name": "index", "type": "integer"},
                {"name": "foo", "type": "integer"},
            ],
            "primaryKey": ["index"],
        }
        assert result == expected
        result = build_table_schema(s)
        assert "pandas_version" in result

    def test_series_unnamed(self):
        result = build_table_schema(pd.Series([1, 2, 3]), version=False)
        expected = {
            "fields": [
                {"name": "index", "type": "integer"},
                {"name": "values", "type": "integer"},
            ],
            "primaryKey": ["index"],
        }
        assert result == expected

    def test_multiindex(self, df_schema, using_infer_string):
        df = df_schema
        idx = pd.MultiIndex.from_product([("a", "b"), (1, 2)])
        df.index = idx

        result = build_table_schema(df, version=False)
        expected = {
            "fields": [
                {"name": "level_0", "type": "string"},
                {"name": "level_1", "type": "integer"},
                {"name": "A", "type": "integer"},
                {"name": "B", "type": "string"},
                {"name": "C", "type": "datetime"},
                {"name": "D", "type": "duration"},
            ],
            "primaryKey": ["level_0", "level_1"],
        }
        if using_infer_string:
            expected["fields"][0] = {
                "name": "level_0",
                "type": "any",
                "extDtype": "string",
            }
            expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "string"}
        assert result == expected

        df.index.names = ["idx0", None]
        expected["fields"][0]["name"] = "idx0"
        expected["primaryKey"] = ["idx0", "level_1"]
        result = build_table_schema(df, version=False)
        assert result == expected


class TestTableSchemaType:
    @pytest.mark.parametrize("int_type", [int, np.int16, np.int32, np.int64])
    def test_as_json_table_type_int_data(self, int_type):
        int_data = [1, 2, 3]
        assert as_json_table_type(np.array(int_data, dtype=int_type).dtype) == "integer"

    @pytest.mark.parametrize("float_type", [float, np.float16, np.float32, np.float64])
    def test_as_json_table_type_float_data(self, float_type):
        float_data = [1.0, 2.0, 3.0]
        assert (
            as_json_table_type(np.array(float_data, dtype=float_type).dtype) == "number"
        )

    @pytest.mark.parametrize("bool_type", [bool, np.bool_])
    def test_as_json_table_type_bool_data(self, bool_type):
        bool_data = [True, False]
        assert (
            as_json_table_type(np.array(bool_data, dtype=bool_type).dtype) == "boolean"
        )

    @pytest.mark.parametrize(
        "date_data",
        [
            pd.to_datetime(["2016"]),
            pd.to_datetime(["2016"], utc=True),
            pd.Series(pd.to_datetime(["2016"])),
            pd.Series(pd.to_datetime(["2016"], utc=True)),
            pd.period_range("2016", freq="Y", periods=3),
        ],
    )
    def test_as_json_table_type_date_data(self, date_data):
        assert as_json_table_type(date_data.dtype) == "datetime"

    @pytest.mark.parametrize(
        "str_data",
        [pd.Series(["a", "b"], dtype=object), pd.Index(["a", "b"], dtype=object)],
    )
    def test_as_json_table_type_string_data(self, str_data):
        assert as_json_table_type(str_data.dtype) == "string"

    @pytest.mark.parametrize(
        "cat_data",
        [
            pd.Categorical(["a"]),
            pd.Categorical([1]),
            pd.Series(pd.Categorical([1])),
            pd.CategoricalIndex([1]),
            pd.Categorical([1]),
        ],
    )
    def test_as_json_table_type_categorical_data(self, cat_data):
        assert as_json_table_type(cat_data.dtype) == "any"

    # ------
    # dtypes
    # ------
    @pytest.mark.parametrize("int_dtype", [int, np.int16, np.int32, np.int64])
    def test_as_json_table_type_int_dtypes(self, int_dtype):
        assert as_json_table_type(int_dtype) == "integer"

    @pytest.mark.parametrize("float_dtype", [float, np.float16, np.float32, np.float64])
    def test_as_json_table_type_float_dtypes(self, float_dtype):
        assert as_json_table_type(float_dtype) == "number"

    @pytest.mark.parametrize("bool_dtype", [bool, np.bool_])
    def test_as_json_table_type_bool_dtypes(self, bool_dtype):
        assert as_json_table_type(bool_dtype) == "boolean"

    @pytest.mark.parametrize(
        "date_dtype",
        [
            np.dtype("<M8[ns]"),
            PeriodDtype("D"),
            DatetimeTZDtype("ns", "US/Central"),
        ],
    )
    def test_as_json_table_type_date_dtypes(self, date_dtype):
        # TODO: datedate.date? datetime.time?
        assert as_json_table_type(date_dtype) == "datetime"

    @pytest.mark.parametrize("td_dtype", [np.dtype("<m8[ns]")])
    def test_as_json_table_type_timedelta_dtypes(self, td_dtype):
        assert as_json_table_type(td_dtype) == "duration"

    @pytest.mark.parametrize("str_dtype", [object])  # TODO(GH#14904) flesh out dtypes?
    def test_as_json_table_type_string_dtypes(self, str_dtype):
        assert as_json_table_type(str_dtype) == "string"

    def test_as_json_table_type_categorical_dtypes(self):
        assert as_json_table_type(pd.Categorical(["a"]).dtype) == "any"
        assert as_json_table_type(CategoricalDtype()) == "any"


class TestTableOrient:
    def test_build_series(self):
        s = pd.Series([1, 2], name="a")
        s.index.name = "id"
        result = s.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)

        assert "pandas_version" in result["schema"]
        result["schema"].pop("pandas_version")

        fields = [{"name": "id", "type": "integer"}, {"name": "a", "type": "integer"}]

        schema = {"fields": fields, "primaryKey": ["id"]}

        expected = OrderedDict(
            [
                ("schema", schema),
                (
                    "data",
                    [
                        OrderedDict([("id", 0), ("a", 1)]),
                        OrderedDict([("id", 1), ("a", 2)]),
                    ],
                ),
            ]
        )

        assert result == expected

    def test_read_json_from_to_json_results(self):
        # GH32383
        df = DataFrame(
            {
                "_id": {"row_0": 0},
                "category": {"row_0": "Goods"},
                "recommender_id": {"row_0": 3},
                "recommender_name_jp": {"row_0": "浦田"},
                "recommender_name_en": {"row_0": "Urata"},
                "name_jp": {"row_0": "博多人形(松尾吉将まつお よしまさ)"},
                "name_en": {"row_0": "Hakata Dolls Matsuo"},
            }
        )

        result1 = pd.read_json(StringIO(df.to_json()))
        result2 = DataFrame.from_dict(json.loads(df.to_json()))
        tm.assert_frame_equal(result1, df)
        tm.assert_frame_equal(result2, df)

    def test_to_json(self, df_table, using_infer_string):
        df = df_table
        df.index.name = "idx"
        result = df.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)

        assert "pandas_version" in result["schema"]
        result["schema"].pop("pandas_version")

        fields = [
            {"name": "idx", "type": "integer"},
            {"name": "A", "type": "integer"},
            {"name": "B", "type": "string"},
            {"name": "C", "type": "datetime"},
            {"name": "D", "type": "duration"},
            {
                "constraints": {"enum": ["a", "b", "c"]},
                "name": "E",
                "ordered": False,
                "type": "any",
            },
            {
                "constraints": {"enum": ["a", "b", "c"]},
                "name": "F",
                "ordered": True,
                "type": "any",
            },
            {"name": "G", "type": "number"},
            {"name": "H", "type": "datetime", "tz": "US/Central"},
        ]

        if using_infer_string:
            fields[2] = {"name": "B", "type": "any", "extDtype": "string"}

        schema = {"fields": fields, "primaryKey": ["idx"]}
        data = [
            OrderedDict(
                [
                    ("idx", 0),
                    ("A", 1),
                    ("B", "a"),
                    ("C", "2016-01-01T00:00:00.000"),
                    ("D", "P0DT1H0M0S"),
                    ("E", "a"),
                    ("F", "a"),
                    ("G", 1.0),
                    ("H", "2016-01-01T06:00:00.000Z"),
                ]
            ),
            OrderedDict(
                [
                    ("idx", 1),
                    ("A", 2),
                    ("B", "b"),
                    ("C", "2016-01-02T00:00:00.000"),
                    ("D", "P0DT1H1M0S"),
                    ("E", "b"),
                    ("F", "b"),
                    ("G", 2.0),
                    ("H", "2016-01-02T06:00:00.000Z"),
                ]
            ),
            OrderedDict(
                [
                    ("idx", 2),
                    ("A", 3),
                    ("B", "c"),
                    ("C", "2016-01-03T00:00:00.000"),
                    ("D", "P0DT1H2M0S"),
                    ("E", "c"),
                    ("F", "c"),
                    ("G", 3.0),
                    ("H", "2016-01-03T06:00:00.000Z"),
                ]
            ),
            OrderedDict(
                [
                    ("idx", 3),
                    ("A", 4),
                    ("B", "c"),
                    ("C", "2016-01-04T00:00:00.000"),
                    ("D", "P0DT1H3M0S"),
                    ("E", "c"),
                    ("F", "c"),
                    ("G", 4.0),
                    ("H", "2016-01-04T06:00:00.000Z"),
                ]
            ),
        ]
        expected = OrderedDict([("schema", schema), ("data", data)])

        assert result == expected

    def test_to_json_float_index(self):
        data = pd.Series(1, index=[1.0, 2.0])
        result = data.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result["schema"].pop("pandas_version")

        expected = OrderedDict(
            [
                (
                    "schema",
                    {
                        "fields": [
                            {"name": "index", "type": "number"},
                            {"name": "values", "type": "integer"},
                        ],
                        "primaryKey": ["index"],
                    },
                ),
                (
                    "data",
                    [
                        OrderedDict([("index", 1.0), ("values", 1)]),
                        OrderedDict([("index", 2.0), ("values", 1)]),
                    ],
                ),
            ]
        )

        assert result == expected

    def test_to_json_period_index(self):
        idx = pd.period_range("2016", freq="Q-JAN", periods=2)
        data = pd.Series(1, idx)
        result = data.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result["schema"].pop("pandas_version")

        fields = [
            {"freq": "QE-JAN", "name": "index", "type": "datetime"},
            {"name": "values", "type": "integer"},
        ]

        schema = {"fields": fields, "primaryKey": ["index"]}
        data = [
            OrderedDict([("index", "2015-11-01T00:00:00.000"), ("values", 1)]),
            OrderedDict([("index", "2016-02-01T00:00:00.000"), ("values", 1)]),
        ]
        expected = OrderedDict([("schema", schema), ("data", data)])

        assert result == expected

    def test_to_json_categorical_index(self):
        data = pd.Series(1, pd.CategoricalIndex(["a", "b"]))
        result = data.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result["schema"].pop("pandas_version")

        expected = OrderedDict(
            [
                (
                    "schema",
                    {
                        "fields": [
                            {
                                "name": "index",
                                "type": "any",
                                "constraints": {"enum": ["a", "b"]},
                                "ordered": False,
                            },
                            {"name": "values", "type": "integer"},
                        ],
                        "primaryKey": ["index"],
                    },
                ),
                (
                    "data",
                    [
                        OrderedDict([("index", "a"), ("values", 1)]),
                        OrderedDict([("index", "b"), ("values", 1)]),
                    ],
                ),
            ]
        )

        assert result == expected

    def test_date_format_raises(self, df_table):
        msg = (
            "Trying to write with `orient='table'` and `date_format='epoch'`. Table "
            "Schema requires dates to be formatted with `date_format='iso'`"
        )
        with pytest.raises(ValueError, match=msg):
            df_table.to_json(orient="table", date_format="epoch")

        # others work
        df_table.to_json(orient="table", date_format="iso")
        df_table.to_json(orient="table")

    def test_convert_pandas_type_to_json_field_int(self, index_or_series):
        kind = index_or_series
        data = [1, 2, 3]
        result = convert_pandas_type_to_json_field(kind(data, name="name"))
        expected = {"name": "name", "type": "integer"}
        assert result == expected

    def test_convert_pandas_type_to_json_field_float(self, index_or_series):
        kind = index_or_series
        data = [1.0, 2.0, 3.0]
        result = convert_pandas_type_to_json_field(kind(data, name="name"))
        expected = {"name": "name", "type": "number"}
        assert result == expected

    @pytest.mark.parametrize(
        "dt_args,extra_exp", [({}, {}), ({"utc": True}, {"tz": "UTC"})]
    )
    @pytest.mark.parametrize("wrapper", [None, pd.Series])
    def test_convert_pandas_type_to_json_field_datetime(
        self, dt_args, extra_exp, wrapper
    ):
        data = [1.0, 2.0, 3.0]
        data = pd.to_datetime(data, **dt_args)
        if wrapper is pd.Series:
            data = pd.Series(data, name="values")
        result = convert_pandas_type_to_json_field(data)
        expected = {"name": "values", "type": "datetime"}
        expected.update(extra_exp)
        assert result == expected

    def test_convert_pandas_type_to_json_period_range(self):
        arr = pd.period_range("2016", freq="Y-DEC", periods=4)
        result = convert_pandas_type_to_json_field(arr)
        expected = {"name": "values", "type": "datetime", "freq": "YE-DEC"}
        assert result == expected

    @pytest.mark.parametrize("kind", [pd.Categorical, pd.CategoricalIndex])
    @pytest.mark.parametrize("ordered", [True, False])
    def test_convert_pandas_type_to_json_field_categorical(self, kind, ordered):
        data = ["a", "b", "c"]
        if kind is pd.Categorical:
            arr = pd.Series(kind(data, ordered=ordered), name="cats")
        elif kind is pd.CategoricalIndex:
            arr = kind(data, ordered=ordered, name="cats")

        result = convert_pandas_type_to_json_field(arr)
        expected = {
            "name": "cats",
            "type": "any",
            "constraints": {"enum": data},
            "ordered": ordered,
        }
        assert result == expected

    @pytest.mark.parametrize(
        "inp,exp",
        [
            ({"type": "integer"}, "int64"),
            ({"type": "number"}, "float64"),
            ({"type": "boolean"}, "bool"),
            ({"type": "duration"}, "timedelta64"),
            ({"type": "datetime"}, "datetime64[ns]"),
            ({"type": "datetime", "tz": "US/Hawaii"}, "datetime64[ns, US/Hawaii]"),
            ({"type": "any"}, "object"),
            (
                {
                    "type": "any",
                    "constraints": {"enum": ["a", "b", "c"]},
                    "ordered": False,
                },
                CategoricalDtype(categories=["a", "b", "c"], ordered=False),
            ),
            (
                {
                    "type": "any",
                    "constraints": {"enum": ["a", "b", "c"]},
                    "ordered": True,
                },
                CategoricalDtype(categories=["a", "b", "c"], ordered=True),
            ),
            ({"type": "string"}, "object"),
        ],
    )
    def test_convert_json_field_to_pandas_type(self, inp, exp):
        field = {"name": "foo"}
        field.update(inp)
        assert convert_json_field_to_pandas_type(field) == exp

    @pytest.mark.parametrize("inp", ["geopoint", "geojson", "fake_type"])
    def test_convert_json_field_to_pandas_type_raises(self, inp):
        field = {"type": inp}
        with pytest.raises(
            ValueError, match=f"Unsupported or invalid field type: {inp}"
        ):
            convert_json_field_to_pandas_type(field)

    def test_categorical(self):
        s = pd.Series(pd.Categorical(["a", "b", "a"]))
        s.index.name = "idx"
        result = s.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)
        result["schema"].pop("pandas_version")

        fields = [
            {"name": "idx", "type": "integer"},
            {
                "constraints": {"enum": ["a", "b"]},
                "name": "values",
                "ordered": False,
                "type": "any",
            },
        ]

        expected = OrderedDict(
            [
                ("schema", {"fields": fields, "primaryKey": ["idx"]}),
                (
                    "data",
                    [
                        OrderedDict([("idx", 0), ("values", "a")]),
                        OrderedDict([("idx", 1), ("values", "b")]),
                        OrderedDict([("idx", 2), ("values", "a")]),
                    ],
                ),
            ]
        )

        assert result == expected

    @pytest.mark.parametrize(
        "idx,nm,prop",
        [
            (pd.Index([1]), "index", "name"),
            (pd.Index([1], name="myname"), "myname", "name"),
            (
                pd.MultiIndex.from_product([("a", "b"), ("c", "d")]),
                ["level_0", "level_1"],
                "names",
            ),
            (
                pd.MultiIndex.from_product(
                    [("a", "b"), ("c", "d")], names=["n1", "n2"]
                ),
                ["n1", "n2"],
                "names",
            ),
            (
                pd.MultiIndex.from_product(
                    [("a", "b"), ("c", "d")], names=["n1", None]
                ),
                ["n1", "level_1"],
                "names",
            ),
        ],
    )
    def test_set_names_unset(self, idx, nm, prop):
        data = pd.Series(1, idx)
        result = set_default_names(data)
        assert getattr(result.index, prop) == nm

    @pytest.mark.parametrize(
        "idx",
        [
            pd.Index([], name="index"),
            pd.MultiIndex.from_arrays([["foo"], ["bar"]], names=("level_0", "level_1")),
            pd.MultiIndex.from_arrays([["foo"], ["bar"]], names=("foo", "level_1")),
        ],
    )
    def test_warns_non_roundtrippable_names(self, idx):
        # GH 19130
        df = DataFrame(index=idx)
        df.index.name = "index"
        with tm.assert_produces_warning():
            set_default_names(df)

    def test_timestamp_in_columns(self):
        df = DataFrame(
            [[1, 2]], columns=[pd.Timestamp("2016"), pd.Timedelta(10, unit="s")]
        )
        result = df.to_json(orient="table")
        js = json.loads(result)
        assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000"
        assert js["schema"]["fields"][2]["name"] == "P0DT0H0M10S"

    @pytest.mark.parametrize(
        "case",
        [
            pd.Series([1], index=pd.Index([1], name="a"), name="a"),
            DataFrame({"A": [1]}, index=pd.Index([1], name="A")),
            DataFrame(
                {"A": [1]},
                index=pd.MultiIndex.from_arrays([["a"], [1]], names=["A", "a"]),
            ),
        ],
    )
    def test_overlapping_names(self, case):
        with pytest.raises(ValueError, match="Overlapping"):
            case.to_json(orient="table")

    def test_mi_falsey_name(self):
        # GH 16203
        df = DataFrame(
            np.random.default_rng(2).standard_normal((4, 4)),
            index=pd.MultiIndex.from_product([("A", "B"), ("a", "b")]),
        )
        result = [x["name"] for x in build_table_schema(df)["fields"]]
        assert result == ["level_0", "level_1", 0, 1, 2, 3]


class TestTableOrientReader:
    @pytest.mark.parametrize(
        "index_nm",
        [None, "idx", pytest.param("index", marks=pytest.mark.xfail), "level_0"],
    )
    @pytest.mark.parametrize(
        "vals",
        [
            {"ints": [1, 2, 3, 4]},
            {"objects": ["a", "b", "c", "d"]},
            {"objects": ["1", "2", "3", "4"]},
            {"date_ranges": pd.date_range("2016-01-01", freq="d", periods=4)},
            {"categoricals": pd.Series(pd.Categorical(["a", "b", "c", "c"]))},
            {
                "ordered_cats": pd.Series(
                    pd.Categorical(["a", "b", "c", "c"], ordered=True)
                )
            },
            {"floats": [1.0, 2.0, 3.0, 4.0]},
            {"floats": [1.1, 2.2, 3.3, 4.4]},
            {"bools": [True, False, False, True]},
            {
                "timezones": pd.date_range(
                    "2016-01-01", freq="d", periods=4, tz="US/Central"
                )  # added in # GH 35973
            },
        ],
    )
    def test_read_json_table_orient(self, index_nm, vals, recwarn):
        df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
        out = df.to_json(orient="table")
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)

    @pytest.mark.parametrize("index_nm", [None, "idx", "index"])
    @pytest.mark.parametrize(
        "vals",
        [{"timedeltas": pd.timedelta_range("1h", periods=4, freq="min")}],
    )
    def test_read_json_table_orient_raises(self, index_nm, vals, recwarn):
        df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
        out = df.to_json(orient="table")
        with pytest.raises(NotImplementedError, match="can not yet read "):
            pd.read_json(out, orient="table")

    @pytest.mark.parametrize(
        "index_nm",
        [None, "idx", pytest.param("index", marks=pytest.mark.xfail), "level_0"],
    )
    @pytest.mark.parametrize(
        "vals",
        [
            {"ints": [1, 2, 3, 4]},
            {"objects": ["a", "b", "c", "d"]},
            {"objects": ["1", "2", "3", "4"]},
            {"date_ranges": pd.date_range("2016-01-01", freq="d", periods=4)},
            {"categoricals": pd.Series(pd.Categorical(["a", "b", "c", "c"]))},
            {
                "ordered_cats": pd.Series(
                    pd.Categorical(["a", "b", "c", "c"], ordered=True)
                )
            },
            {"floats": [1.0, 2.0, 3.0, 4.0]},
            {"floats": [1.1, 2.2, 3.3, 4.4]},
            {"bools": [True, False, False, True]},
            {
                "timezones": pd.date_range(
                    "2016-01-01", freq="d", periods=4, tz="US/Central"
                )  # added in # GH 35973
            },
        ],
    )
    def test_read_json_table_period_orient(self, index_nm, vals, recwarn):
        df = DataFrame(
            vals,
            index=pd.Index(
                (pd.Period(f"2022Q{q}") for q in range(1, 5)), name=index_nm
            ),
        )
        out = df.to_json(orient="table")
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)

    @pytest.mark.parametrize(
        "idx",
        [
            pd.Index(range(4)),
            pd.date_range(
                "2020-08-30",
                freq="d",
                periods=4,
            )._with_freq(None),
            pd.date_range(
                "2020-08-30", freq="d", periods=4, tz="US/Central"
            )._with_freq(None),
            pd.MultiIndex.from_product(
                [
                    pd.date_range("2020-08-30", freq="d", periods=2, tz="US/Central"),
                    ["x", "y"],
                ],
            ),
        ],
    )
    @pytest.mark.parametrize(
        "vals",
        [
            {"floats": [1.1, 2.2, 3.3, 4.4]},
            {"dates": pd.date_range("2020-08-30", freq="d", periods=4)},
            {
                "timezones": pd.date_range(
                    "2020-08-30", freq="d", periods=4, tz="Europe/London"
                )
            },
        ],
    )
    def test_read_json_table_timezones_orient(self, idx, vals, recwarn):
        # GH 35973
        df = DataFrame(vals, index=idx)
        out = df.to_json(orient="table")
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)

    def test_comprehensive(self):
        df = DataFrame(
            {
                "A": [1, 2, 3, 4],
                "B": ["a", "b", "c", "c"],
                "C": pd.date_range("2016-01-01", freq="d", periods=4),
                # 'D': pd.timedelta_range('1h', periods=4, freq='min'),
                "E": pd.Series(pd.Categorical(["a", "b", "c", "c"])),
                "F": pd.Series(pd.Categorical(["a", "b", "c", "c"], ordered=True)),
                "G": [1.1, 2.2, 3.3, 4.4],
                "H": pd.date_range("2016-01-01", freq="d", periods=4, tz="US/Central"),
                "I": [True, False, False, True],
            },
            index=pd.Index(range(4), name="idx"),
        )

        out = StringIO(df.to_json(orient="table"))
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)

    @pytest.mark.parametrize(
        "index_names",
        [[None, None], ["foo", "bar"], ["foo", None], [None, "foo"], ["index", "foo"]],
    )
    def test_multiindex(self, index_names):
        # GH 18912
        df = DataFrame(
            [["Arr", "alpha", [1, 2, 3, 4]], ["Bee", "Beta", [10, 20, 30, 40]]],
            index=[["A", "B"], ["Null", "Eins"]],
            columns=["Aussprache", "Griechisch", "Args"],
        )
        df.index.names = index_names
        out = StringIO(df.to_json(orient="table"))
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)

    def test_empty_frame_roundtrip(self):
        # GH 21287
        df = DataFrame(columns=["a", "b", "c"])
        expected = df.copy()
        out = StringIO(df.to_json(orient="table"))
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(expected, result)

    def test_read_json_orient_table_old_schema_version(self):
        df_json = """
        {
            "schema":{
                "fields":[
                    {"name":"index","type":"integer"},
                    {"name":"a","type":"string"}
                ],
                "primaryKey":["index"],
                "pandas_version":"0.20.0"
            },
            "data":[
                {"index":0,"a":1},
                {"index":1,"a":2.0},
                {"index":2,"a":"s"}
            ]
        }
        """
        expected = DataFrame({"a": [1, 2.0, "s"]})
        result = pd.read_json(StringIO(df_json), orient="table")
        tm.assert_frame_equal(expected, result)

    @pytest.mark.parametrize("freq", ["M", "2M", "Q", "2Q", "Y", "2Y"])
    def test_read_json_table_orient_period_depr_freq(self, freq, recwarn):
        # GH#9586
        df = DataFrame(
            {"ints": [1, 2]},
            index=pd.PeriodIndex(["2020-01", "2021-06"], freq=freq),
        )
        out = df.to_json(orient="table")
        result = pd.read_json(out, orient="table")
        tm.assert_frame_equal(df, result)
@@ -0,0 +1,317 @@
"""Tests for ExtensionDtype Table Schema integration."""

from collections import OrderedDict
import datetime as dt
import decimal
from io import StringIO
import json

import pytest

from pandas import (
    NA,
    DataFrame,
    Index,
    array,
    read_json,
)
import pandas._testing as tm
from pandas.core.arrays.integer import Int64Dtype
from pandas.core.arrays.string_ import StringDtype
from pandas.core.series import Series
from pandas.tests.extension.date import (
    DateArray,
    DateDtype,
)
from pandas.tests.extension.decimal.array import (
    DecimalArray,
    DecimalDtype,
)

from pandas.io.json._table_schema import (
    as_json_table_type,
    build_table_schema,
)


class TestBuildSchema:
    def test_build_table_schema(self):
        df = DataFrame(
            {
                "A": DateArray([dt.date(2021, 10, 10)]),
                "B": DecimalArray([decimal.Decimal(10)]),
                "C": array(["pandas"], dtype="string"),
                "D": array([10], dtype="Int64"),
            }
        )
        result = build_table_schema(df, version=False)
        expected = {
            "fields": [
                {"name": "index", "type": "integer"},
                {"name": "A", "type": "any", "extDtype": "DateDtype"},
                {"name": "B", "type": "number", "extDtype": "decimal"},
                {"name": "C", "type": "any", "extDtype": "string"},
                {"name": "D", "type": "integer", "extDtype": "Int64"},
            ],
            "primaryKey": ["index"],
        }
        assert result == expected
        result = build_table_schema(df)
        assert "pandas_version" in result


class TestTableSchemaType:
    @pytest.mark.parametrize(
        "date_data",
        [
            DateArray([dt.date(2021, 10, 10)]),
            DateArray(dt.date(2021, 10, 10)),
            Series(DateArray(dt.date(2021, 10, 10))),
        ],
    )
    def test_as_json_table_type_ext_date_array_dtype(self, date_data):
        assert as_json_table_type(date_data.dtype) == "any"

    def test_as_json_table_type_ext_date_dtype(self):
        assert as_json_table_type(DateDtype()) == "any"

    @pytest.mark.parametrize(
        "decimal_data",
        [
            DecimalArray([decimal.Decimal(10)]),
            Series(DecimalArray([decimal.Decimal(10)])),
        ],
    )
    def test_as_json_table_type_ext_decimal_array_dtype(self, decimal_data):
        assert as_json_table_type(decimal_data.dtype) == "number"

    def test_as_json_table_type_ext_decimal_dtype(self):
        assert as_json_table_type(DecimalDtype()) == "number"

    @pytest.mark.parametrize(
        "string_data",
        [
            array(["pandas"], dtype="string"),
            Series(array(["pandas"], dtype="string")),
        ],
    )
    def test_as_json_table_type_ext_string_array_dtype(self, string_data):
        assert as_json_table_type(string_data.dtype) == "any"

    def test_as_json_table_type_ext_string_dtype(self):
        assert as_json_table_type(StringDtype()) == "any"

    @pytest.mark.parametrize(
        "integer_data",
        [
            array([10], dtype="Int64"),
            Series(array([10], dtype="Int64")),
        ],
    )
    def test_as_json_table_type_ext_integer_array_dtype(self, integer_data):
        assert as_json_table_type(integer_data.dtype) == "integer"

    def test_as_json_table_type_ext_integer_dtype(self):
        assert as_json_table_type(Int64Dtype()) == "integer"


class TestTableOrient:
    @pytest.fixture
    def da(self):
        return DateArray([dt.date(2021, 10, 10)])

    @pytest.fixture
    def dc(self):
        return DecimalArray([decimal.Decimal(10)])

    @pytest.fixture
    def sa(self):
        return array(["pandas"], dtype="string")

    @pytest.fixture
    def ia(self):
        return array([10], dtype="Int64")

    @pytest.fixture
    def df(self, da, dc, sa, ia):
        return DataFrame(
            {
                "A": da,
                "B": dc,
                "C": sa,
                "D": ia,
            }
        )

    def test_build_date_series(self, da):
        s = Series(da, name="a")
        s.index.name = "id"
        result = s.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)

        assert "pandas_version" in result["schema"]
        result["schema"].pop("pandas_version")

        fields = [
            {"name": "id", "type": "integer"},
            {"name": "a", "type": "any", "extDtype": "DateDtype"},
        ]

        schema = {"fields": fields, "primaryKey": ["id"]}

        expected = OrderedDict(
            [
                ("schema", schema),
                ("data", [OrderedDict([("id", 0), ("a", "2021-10-10T00:00:00.000")])]),
            ]
        )

        assert result == expected

    def test_build_decimal_series(self, dc):
        s = Series(dc, name="a")
        s.index.name = "id"
        result = s.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)

        assert "pandas_version" in result["schema"]
        result["schema"].pop("pandas_version")

        fields = [
            {"name": "id", "type": "integer"},
            {"name": "a", "type": "number", "extDtype": "decimal"},
        ]

        schema = {"fields": fields, "primaryKey": ["id"]}

        expected = OrderedDict(
            [
                ("schema", schema),
                ("data", [OrderedDict([("id", 0), ("a", 10.0)])]),
            ]
        )

        assert result == expected

    def test_build_string_series(self, sa):
        s = Series(sa, name="a")
        s.index.name = "id"
        result = s.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)

        assert "pandas_version" in result["schema"]
        result["schema"].pop("pandas_version")

        fields = [
            {"name": "id", "type": "integer"},
            {"name": "a", "type": "any", "extDtype": "string"},
        ]

        schema = {"fields": fields, "primaryKey": ["id"]}

        expected = OrderedDict(
            [
                ("schema", schema),
                ("data", [OrderedDict([("id", 0), ("a", "pandas")])]),
            ]
        )

        assert result == expected

    def test_build_int64_series(self, ia):
        s = Series(ia, name="a")
        s.index.name = "id"
        result = s.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)

        assert "pandas_version" in result["schema"]
        result["schema"].pop("pandas_version")

        fields = [
            {"name": "id", "type": "integer"},
            {"name": "a", "type": "integer", "extDtype": "Int64"},
        ]

        schema = {"fields": fields, "primaryKey": ["id"]}

        expected = OrderedDict(
            [
                ("schema", schema),
                ("data", [OrderedDict([("id", 0), ("a", 10)])]),
            ]
        )

        assert result == expected

    def test_to_json(self, df):
        df = df.copy()
        df.index.name = "idx"
        result = df.to_json(orient="table", date_format="iso")
        result = json.loads(result, object_pairs_hook=OrderedDict)

        assert "pandas_version" in result["schema"]
        result["schema"].pop("pandas_version")

        fields = [
            OrderedDict({"name": "idx", "type": "integer"}),
            OrderedDict({"name": "A", "type": "any", "extDtype": "DateDtype"}),
            OrderedDict({"name": "B", "type": "number", "extDtype": "decimal"}),
            OrderedDict({"name": "C", "type": "any", "extDtype": "string"}),
            OrderedDict({"name": "D", "type": "integer", "extDtype": "Int64"}),
        ]

        schema = OrderedDict({"fields": fields, "primaryKey": ["idx"]})
        data = [
            OrderedDict(
                [
                    ("idx", 0),
                    ("A", "2021-10-10T00:00:00.000"),
                    ("B", 10.0),
                    ("C", "pandas"),
                    ("D", 10),
                ]
            )
        ]
        expected = OrderedDict([("schema", schema), ("data", data)])

        assert result == expected

    def test_json_ext_dtype_reading_roundtrip(self):
        # GH#40255
        df = DataFrame(
            {
                "a": Series([2, NA], dtype="Int64"),
                "b": Series([1.5, NA], dtype="Float64"),
                "c": Series([True, NA], dtype="boolean"),
            },
            index=Index([1, NA], dtype="Int64"),
        )
        expected = df.copy()
        data_json = df.to_json(orient="table", indent=4)
        result = read_json(StringIO(data_json), orient="table")
        tm.assert_frame_equal(result, expected)

    def test_json_ext_dtype_reading(self):
        # GH#40255
        data_json = """{
            "schema":{
                "fields":[
                    {
                        "name":"a",
                        "type":"integer",
                        "extDtype":"Int64"
                    }
                ],
            },
            "data":[
                {
                    "a":2
                },
                {
                    "a":null
                }
            ]
        }"""
        result = read_json(StringIO(data_json), orient="table")
        expected = DataFrame({"a": Series([2, NA], dtype="Int64")})
        tm.assert_frame_equal(result, expected)
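
The practical point of the `extDtype` annotation tested above is that nullable extension dtypes survive a table-orient round trip (GH#40255). A minimal sketch mirroring those tests:

# --- illustrative sketch, not part of this commit ---
from io import StringIO

import pandas as pd

df = pd.DataFrame({"a": pd.Series([2, pd.NA], dtype="Int64")})
payload = df.to_json(orient="table")
result = pd.read_json(StringIO(payload), orient="table")
assert result["a"].dtype == "Int64"  # dtype restored via extDtype, NA preserved
# --- end sketch ---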
@ -0,0 +1,907 @@
|
||||
import json
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
json_normalize,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.io.json._normalize import nested_to_record
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def deep_nested():
|
||||
# deeply nested data
|
||||
return [
|
||||
{
|
||||
"country": "USA",
|
||||
"states": [
|
||||
{
|
||||
"name": "California",
|
||||
"cities": [
|
||||
{"name": "San Francisco", "pop": 12345},
|
||||
{"name": "Los Angeles", "pop": 12346},
|
||||
],
|
||||
},
|
||||
{
|
||||
"name": "Ohio",
|
||||
"cities": [
|
||||
{"name": "Columbus", "pop": 1234},
|
||||
{"name": "Cleveland", "pop": 1236},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
"country": "Germany",
|
||||
"states": [
|
||||
{"name": "Bayern", "cities": [{"name": "Munich", "pop": 12347}]},
|
||||
{
|
||||
"name": "Nordrhein-Westfalen",
|
||||
"cities": [
|
||||
{"name": "Duesseldorf", "pop": 1238},
|
||||
{"name": "Koeln", "pop": 1239},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def state_data():
|
||||
return [
|
||||
{
|
||||
"counties": [
|
||||
{"name": "Dade", "population": 12345},
|
||||
{"name": "Broward", "population": 40000},
|
||||
{"name": "Palm Beach", "population": 60000},
|
||||
],
|
||||
"info": {"governor": "Rick Scott"},
|
||||
"shortname": "FL",
|
||||
"state": "Florida",
|
||||
},
|
||||
{
|
||||
"counties": [
|
||||
{"name": "Summit", "population": 1234},
|
||||
{"name": "Cuyahoga", "population": 1337},
|
||||
],
|
||||
"info": {"governor": "John Kasich"},
|
||||
"shortname": "OH",
|
||||
"state": "Ohio",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def author_missing_data():
|
||||
return [
|
||||
{"info": None},
|
||||
{
|
||||
"info": {"created_at": "11/08/1993", "last_updated": "26/05/2012"},
|
||||
"author_name": {"first": "Jane", "last_name": "Doe"},
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def missing_metadata():
|
||||
return [
|
||||
{
|
||||
"name": "Alice",
|
||||
"addresses": [
|
||||
{
|
||||
"number": 9562,
|
||||
"street": "Morris St.",
|
||||
"city": "Massillon",
|
||||
"state": "OH",
|
||||
"zip": 44646,
|
||||
}
|
||||
],
|
||||
"previous_residences": {"cities": [{"city_name": "Foo York City"}]},
|
||||
},
|
||||
{
|
||||
"addresses": [
|
||||
{
|
||||
"number": 8449,
|
||||
"street": "Spring St.",
|
||||
"city": "Elizabethton",
|
||||
"state": "TN",
|
||||
"zip": 37643,
|
||||
}
|
||||
],
|
||||
"previous_residences": {"cities": [{"city_name": "Barmingham"}]},
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def max_level_test_input_data():
|
||||
"""
|
||||
input data to test json_normalize with max_level param
|
||||
"""
|
||||
return [
|
||||
{
|
||||
"CreatedBy": {"Name": "User001"},
|
||||
"Lookup": {
|
||||
"TextField": "Some text",
|
||||
"UserField": {"Id": "ID001", "Name": "Name001"},
|
||||
},
|
||||
"Image": {"a": "b"},
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
class TestJSONNormalize:
|
||||
def test_simple_records(self):
|
||||
recs = [
|
||||
{"a": 1, "b": 2, "c": 3},
|
||||
{"a": 4, "b": 5, "c": 6},
|
||||
{"a": 7, "b": 8, "c": 9},
|
||||
{"a": 10, "b": 11, "c": 12},
|
||||
]
|
||||
|
||||
result = json_normalize(recs)
|
||||
expected = DataFrame(recs)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_simple_normalize(self, state_data):
|
||||
result = json_normalize(state_data[0], "counties")
|
||||
expected = DataFrame(state_data[0]["counties"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = json_normalize(state_data, "counties")
|
||||
|
||||
expected = []
|
||||
for rec in state_data:
|
||||
expected.extend(rec["counties"])
|
||||
expected = DataFrame(expected)
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = json_normalize(state_data, "counties", meta="state")
|
||||
expected["state"] = np.array(["Florida", "Ohio"]).repeat([3, 2])
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_fields_list_type_normalize(self):
|
||||
parse_metadata_fields_list_type = [
|
||||
{"values": [1, 2, 3], "metadata": {"listdata": [1, 2]}}
|
||||
]
|
||||
result = json_normalize(
|
||||
parse_metadata_fields_list_type,
|
||||
record_path=["values"],
|
||||
meta=[["metadata", "listdata"]],
|
||||
)
|
||||
expected = DataFrame(
|
||||
{0: [1, 2, 3], "metadata.listdata": [[1, 2], [1, 2], [1, 2]]}
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_empty_array(self):
|
||||
result = json_normalize([])
|
||||
expected = DataFrame()
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"data, record_path, exception_type",
|
||||
[
|
||||
([{"a": 0}, {"a": 1}], None, None),
|
||||
({"a": [{"a": 0}, {"a": 1}]}, "a", None),
|
||||
('{"a": [{"a": 0}, {"a": 1}]}', None, NotImplementedError),
|
||||
(None, None, NotImplementedError),
|
||||
],
|
||||
)
|
||||
def test_accepted_input(self, data, record_path, exception_type):
|
||||
if exception_type is not None:
|
||||
with pytest.raises(exception_type, match=""):
|
||||
json_normalize(data, record_path=record_path)
|
||||
else:
|
||||
result = json_normalize(data, record_path=record_path)
|
||||
expected = DataFrame([0, 1], columns=["a"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_simple_normalize_with_separator(self, deep_nested):
|
||||
# GH 14883
|
||||
result = json_normalize({"A": {"A": 1, "B": 2}})
|
||||
expected = DataFrame([[1, 2]], columns=["A.A", "A.B"])
|
||||
tm.assert_frame_equal(result.reindex_like(expected), expected)
|
||||
|
||||
result = json_normalize({"A": {"A": 1, "B": 2}}, sep="_")
|
||||
expected = DataFrame([[1, 2]], columns=["A_A", "A_B"])
|
||||
tm.assert_frame_equal(result.reindex_like(expected), expected)
|
||||
|
||||
result = json_normalize({"A": {"A": 1, "B": 2}}, sep="\u03c3")
|
||||
expected = DataFrame([[1, 2]], columns=["A\u03c3A", "A\u03c3B"])
|
||||
tm.assert_frame_equal(result.reindex_like(expected), expected)
|
||||
|
||||
result = json_normalize(
|
||||
deep_nested,
|
||||
["states", "cities"],
|
||||
meta=["country", ["states", "name"]],
|
||||
sep="_",
|
||||
)
|
||||
expected = Index(["name", "pop", "country", "states_name"]).sort_values()
|
||||
assert result.columns.sort_values().equals(expected)
|
||||
|
||||
def test_normalize_with_multichar_separator(self):
|
||||
# GH #43831
|
||||
data = {"a": [1, 2], "b": {"b_1": 2, "b_2": (3, 4)}}
|
||||
result = json_normalize(data, sep="__")
|
||||
expected = DataFrame([[[1, 2], 2, (3, 4)]], columns=["a", "b__b_1", "b__b_2"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_value_array_record_prefix(self):
|
||||
# GH 21536
|
||||
result = json_normalize({"A": [1, 2]}, "A", record_prefix="Prefix.")
|
||||
expected = DataFrame([[1], [2]], columns=["Prefix.0"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_nested_object_record_path(self):
|
||||
# GH 22706
|
||||
data = {
|
||||
"state": "Florida",
|
||||
"info": {
|
||||
"governor": "Rick Scott",
|
||||
"counties": [
|
||||
{"name": "Dade", "population": 12345},
|
||||
{"name": "Broward", "population": 40000},
|
||||
{"name": "Palm Beach", "population": 60000},
|
||||
],
|
||||
},
|
||||
}
|
||||
result = json_normalize(data, record_path=["info", "counties"])
|
||||
expected = DataFrame(
|
||||
[["Dade", 12345], ["Broward", 40000], ["Palm Beach", 60000]],
|
||||
columns=["name", "population"],
|
||||
)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_more_deeply_nested(self, deep_nested):
|
||||
result = json_normalize(
|
||||
deep_nested, ["states", "cities"], meta=["country", ["states", "name"]]
|
||||
)
|
||||
ex_data = {
|
||||
"country": ["USA"] * 4 + ["Germany"] * 3,
|
||||
"states.name": [
|
||||
"California",
|
||||
"California",
|
||||
"Ohio",
|
||||
"Ohio",
|
||||
"Bayern",
|
||||
"Nordrhein-Westfalen",
|
||||
"Nordrhein-Westfalen",
|
||||
],
|
||||
"name": [
|
||||
"San Francisco",
|
||||
"Los Angeles",
|
||||
"Columbus",
|
||||
"Cleveland",
|
||||
"Munich",
|
||||
"Duesseldorf",
|
||||
"Koeln",
|
||||
],
|
||||
"pop": [12345, 12346, 1234, 1236, 12347, 1238, 1239],
|
||||
}
|
||||
|
||||
expected = DataFrame(ex_data, columns=result.columns)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_shallow_nested(self):
|
||||
data = [
|
||||
{
|
||||
"state": "Florida",
|
||||
"shortname": "FL",
|
||||
"info": {"governor": "Rick Scott"},
|
||||
"counties": [
|
||||
{"name": "Dade", "population": 12345},
|
||||
{"name": "Broward", "population": 40000},
|
||||
{"name": "Palm Beach", "population": 60000},
|
||||
],
|
||||
},
|
||||
{
|
||||
"state": "Ohio",
|
||||
"shortname": "OH",
|
||||
"info": {"governor": "John Kasich"},
|
||||
"counties": [
|
||||
{"name": "Summit", "population": 1234},
|
||||
{"name": "Cuyahoga", "population": 1337},
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
result = json_normalize(
|
||||
data, "counties", ["state", "shortname", ["info", "governor"]]
|
||||
)
|
||||
ex_data = {
|
||||
"name": ["Dade", "Broward", "Palm Beach", "Summit", "Cuyahoga"],
|
||||
"state": ["Florida"] * 3 + ["Ohio"] * 2,
|
||||
"shortname": ["FL", "FL", "FL", "OH", "OH"],
|
||||
"info.governor": ["Rick Scott"] * 3 + ["John Kasich"] * 2,
|
||||
"population": [12345, 40000, 60000, 1234, 1337],
|
||||
}
|
||||
expected = DataFrame(ex_data, columns=result.columns)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_nested_meta_path_with_nested_record_path(self, state_data):
|
||||
# GH 27220
|
||||
result = json_normalize(
|
||||
data=state_data,
|
||||
record_path=["counties"],
|
||||
meta=["state", "shortname", ["info", "governor"]],
|
||||
errors="ignore",
|
||||
)
|
||||
|
||||
ex_data = {
|
||||
"name": ["Dade", "Broward", "Palm Beach", "Summit", "Cuyahoga"],
|
||||
"population": [12345, 40000, 60000, 1234, 1337],
|
||||
"state": ["Florida"] * 3 + ["Ohio"] * 2,
|
||||
"shortname": ["FL"] * 3 + ["OH"] * 2,
|
||||
"info.governor": ["Rick Scott"] * 3 + ["John Kasich"] * 2,
|
||||
}
|
||||
|
||||
expected = DataFrame(ex_data)
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
def test_meta_name_conflict(self):
|
||||
data = [
|
||||
{
|
||||
"foo": "hello",
|
||||
"bar": "there",
|
||||
"data": [
|
||||
{"foo": "something", "bar": "else"},
|
||||
{"foo": "something2", "bar": "else2"},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
msg = r"Conflicting metadata name (foo|bar), need distinguishing prefix"
|
||||
with pytest.raises(ValueError, match=msg):
|
||||
json_normalize(data, "data", meta=["foo", "bar"])
|
||||
|
||||
result = json_normalize(data, "data", meta=["foo", "bar"], meta_prefix="meta")
|
||||
|
||||
for val in ["metafoo", "metabar", "foo", "bar"]:
|
||||
assert val in result
|
||||
|
||||
def test_meta_parameter_not_modified(self):
|
||||
# GH 18610
|
||||
data = [
|
||||
{
|
||||
"foo": "hello",
|
||||
"bar": "there",
|
||||
"data": [
|
||||
{"foo": "something", "bar": "else"},
|
||||
{"foo": "something2", "bar": "else2"},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
COLUMNS = ["foo", "bar"]
|
||||
result = json_normalize(data, "data", meta=COLUMNS, meta_prefix="meta")
|
||||
|
||||
assert COLUMNS == ["foo", "bar"]
|
||||
for val in ["metafoo", "metabar", "foo", "bar"]:
|
||||
assert val in result
|
||||
|
||||
def test_record_prefix(self, state_data):
|
||||
result = json_normalize(state_data[0], "counties")
|
||||
expected = DataFrame(state_data[0]["counties"])
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
result = json_normalize(
|
||||
state_data, "counties", meta="state", record_prefix="county_"
|
||||
)
|
||||
|
||||
expected = []
|
||||
for rec in state_data:
|
||||
expected.extend(rec["counties"])
|
||||
expected = DataFrame(expected)
|
||||
expected = expected.rename(columns=lambda x: "county_" + x)
|
||||
expected["state"] = np.array(["Florida", "Ohio"]).repeat([3, 2])
|
||||
|
||||
tm.assert_frame_equal(result, expected)
|
||||
|
||||
    def test_non_ascii_key(self):
        testjson = (
            b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},'
            b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]'
        ).decode("utf8")

        testdata = {
            b"\xc3\x9cnic\xc3\xb8de".decode("utf8"): [0, 1],
            "sub.A": [1, 3],
            "sub.B": [2, 4],
        }
        expected = DataFrame(testdata)

        result = json_normalize(json.loads(testjson))
        tm.assert_frame_equal(result, expected)

    def test_missing_field(self, author_missing_data):
        # GH20030:
        result = json_normalize(author_missing_data)
        ex_data = [
            {
                "info": np.nan,
                "info.created_at": np.nan,
                "info.last_updated": np.nan,
                "author_name.first": np.nan,
                "author_name.last_name": np.nan,
            },
            {
                "info": None,
                "info.created_at": "11/08/1993",
                "info.last_updated": "26/05/2012",
                "author_name.first": "Jane",
                "author_name.last_name": "Doe",
            },
        ]
        expected = DataFrame(ex_data)
        tm.assert_frame_equal(result, expected)

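    def test_missing_field_sketch(self):
        # Editor's illustrative sketch, not part of upstream pandas: when a
        # nested key is absent from one record, json_normalize emits NaN for
        # that cell rather than raising, which the fixture-based test above
        # relies on.
        data = [{"a": {"b": 1}}, {"a": {}}]
        result = json_normalize(data)
        assert result["a.b"].isna().tolist() == [False, True]
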
    @pytest.mark.parametrize(
        "max_level,expected",
        [
            (
                0,
                [
                    {
                        "TextField": "Some text",
                        "UserField": {"Id": "ID001", "Name": "Name001"},
                        "CreatedBy": {"Name": "User001"},
                        "Image": {"a": "b"},
                    },
                    {
                        "TextField": "Some text",
                        "UserField": {"Id": "ID001", "Name": "Name001"},
                        "CreatedBy": {"Name": "User001"},
                        "Image": {"a": "b"},
                    },
                ],
            ),
            (
                1,
                [
                    {
                        "TextField": "Some text",
                        "UserField.Id": "ID001",
                        "UserField.Name": "Name001",
                        "CreatedBy": {"Name": "User001"},
                        "Image": {"a": "b"},
                    },
                    {
                        "TextField": "Some text",
                        "UserField.Id": "ID001",
                        "UserField.Name": "Name001",
                        "CreatedBy": {"Name": "User001"},
                        "Image": {"a": "b"},
                    },
                ],
            ),
        ],
    )
    def test_max_level_with_records_path(self, max_level, expected):
        # GH23843: Enhanced JSON normalize
        test_input = [
            {
                "CreatedBy": {"Name": "User001"},
                "Lookup": [
                    {
                        "TextField": "Some text",
                        "UserField": {"Id": "ID001", "Name": "Name001"},
                    },
                    {
                        "TextField": "Some text",
                        "UserField": {"Id": "ID001", "Name": "Name001"},
                    },
                ],
                "Image": {"a": "b"},
                "tags": [
                    {"foo": "something", "bar": "else"},
                    {"foo": "something2", "bar": "else2"},
                ],
            }
        ]

        result = json_normalize(
            test_input,
            record_path=["Lookup"],
            meta=[["CreatedBy"], ["Image"]],
            max_level=max_level,
        )
        expected_df = DataFrame(data=expected, columns=result.columns.values)
        tm.assert_equal(expected_df, result)

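    def test_max_level_sketch(self):
        # Editor's illustrative sketch, not part of upstream pandas: max_level
        # bounds how deep the flattening recurses; dicts below that depth are
        # kept whole in a single column, as in the parametrized cases above.
        data = [{"a": {"b": {"c": 1}}}]
        assert json_normalize(data, max_level=0).columns.tolist() == ["a"]
        assert json_normalize(data, max_level=1).columns.tolist() == ["a.b"]
        assert json_normalize(data).columns.tolist() == ["a.b.c"]
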
    def test_nested_flattening_consistent(self):
        # see gh-21537
        df1 = json_normalize([{"A": {"B": 1}}])
        df2 = json_normalize({"dummy": [{"A": {"B": 1}}]}, "dummy")

        # They should be the same.
        tm.assert_frame_equal(df1, df2)

    def test_nonetype_record_path(self, nulls_fixture):
        # see gh-30148
        # should not raise TypeError
        result = json_normalize(
            [
                {"state": "Texas", "info": nulls_fixture},
                {"state": "Florida", "info": [{"i": 2}]},
            ],
            record_path=["info"],
        )
        expected = DataFrame({"i": 2}, index=[0])
        tm.assert_equal(result, expected)

    @pytest.mark.parametrize("value", ["false", "true", "{}", "1", '"text"'])
    def test_non_list_record_path_errors(self, value):
        # see gh-30148, GH 26284
        parsed_value = json.loads(value)
        test_input = {"state": "Texas", "info": parsed_value}
        test_path = "info"
        msg = (
            f"{test_input} has non list value {parsed_value} for path {test_path}. "
            "Must be list or null."
        )
        with pytest.raises(TypeError, match=msg):
            json_normalize([test_input], record_path=[test_path])

    def test_meta_non_iterable(self):
        # GH 31507
        data = """[{"id": 99, "data": [{"one": 1, "two": 2}]}]"""

        result = json_normalize(json.loads(data), record_path=["data"], meta=["id"])
        expected = DataFrame(
            {"one": [1], "two": [2], "id": np.array([99], dtype=object)}
        )
        tm.assert_frame_equal(result, expected)

    def test_generator(self, state_data):
        # GH35923 Fix pd.json_normalize to not skip the first element of a
        # generator input
        def generator_data():
            yield from state_data[0]["counties"]

        result = json_normalize(generator_data())
        expected = DataFrame(state_data[0]["counties"])

        tm.assert_frame_equal(result, expected)

    def test_top_column_with_leading_underscore(self):
        # 49861
        data = {"_id": {"a1": 10, "l2": {"l3": 0}}, "gg": 4}
        result = json_normalize(data, sep="_")
        expected = DataFrame([[4, 10, 0]], columns=["gg", "_id_a1", "_id_l2_l3"])

        tm.assert_frame_equal(result, expected)

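    def test_sep_sketch(self):
        # Editor's illustrative sketch, not part of upstream pandas: sep
        # controls the string that joins parent and child keys, which is how
        # the "_id_a1" style columns above are produced.
        data = {"a": {"b": 1, "c": 2}}
        result = json_normalize(data, sep="__")
        assert result.columns.tolist() == ["a__b", "a__c"]

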
class TestNestedToRecord:
    def test_flat_stays_flat(self):
        recs = [{"flat1": 1, "flat2": 2}, {"flat3": 3, "flat2": 4}]
        result = nested_to_record(recs)
        expected = recs
        assert result == expected

    def test_one_level_deep_flattens(self):
        data = {"flat1": 1, "dict1": {"c": 1, "d": 2}}

        result = nested_to_record(data)
        expected = {"dict1.c": 1, "dict1.d": 2, "flat1": 1}

        assert result == expected

    def test_nested_flattens(self):
        data = {
            "flat1": 1,
            "dict1": {"c": 1, "d": 2},
            "nested": {"e": {"c": 1, "d": 2}, "d": 2},
        }

        result = nested_to_record(data)
        expected = {
            "dict1.c": 1,
            "dict1.d": 2,
            "flat1": 1,
            "nested.d": 2,
            "nested.e.c": 1,
            "nested.e.d": 2,
        }

        assert result == expected

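    def test_nested_to_record_sketch(self):
        # Editor's illustrative sketch, not part of upstream pandas:
        # nested_to_record is the flattening primitive behind json_normalize;
        # keys are joined with "." by default.
        result = nested_to_record({"a": {"b": 1}, "flat": 2})
        assert result == {"a.b": 1, "flat": 2}
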
    def test_json_normalize_errors(self, missing_metadata):
        # GH14583:
        # If meta keys are not always present a new option to set
        # errors='ignore' has been implemented

        msg = (
            "Key 'name' not found. To replace missing values of "
            "'name' with np.nan, pass in errors='ignore'"
        )
        with pytest.raises(KeyError, match=msg):
            json_normalize(
                data=missing_metadata,
                record_path="addresses",
                meta="name",
                errors="raise",
            )

    def test_missing_meta(self, missing_metadata):
        # GH25468
        # If metadata is nullable with errors set to ignore, the null values
        # should be numpy.nan values
        result = json_normalize(
            data=missing_metadata, record_path="addresses", meta="name", errors="ignore"
        )
        ex_data = [
            [9562, "Morris St.", "Massillon", "OH", 44646, "Alice"],
            [8449, "Spring St.", "Elizabethton", "TN", 37643, np.nan],
        ]
        columns = ["number", "street", "city", "state", "zip", "name"]
        expected = DataFrame(ex_data, columns=columns)
        tm.assert_frame_equal(result, expected)

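    def test_errors_ignore_sketch(self):
        # Editor's illustrative sketch, not part of upstream pandas: with
        # errors="ignore", a meta key missing from a record becomes NaN
        # instead of raising KeyError as in the test above.
        data = [
            {"name": "Alice", "addresses": [{"city": "Massillon"}]},
            {"addresses": [{"city": "Elizabethton"}]},
        ]
        result = json_normalize(
            data, record_path="addresses", meta="name", errors="ignore"
        )
        assert result["name"].isna().tolist() == [False, True]
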
    def test_missing_nested_meta(self):
        # GH44312
        # If errors="ignore" and nested metadata is null, we should return nan
        data = {"meta": "foo", "nested_meta": None, "value": [{"rec": 1}, {"rec": 2}]}
        result = json_normalize(
            data,
            record_path="value",
            meta=["meta", ["nested_meta", "leaf"]],
            errors="ignore",
        )
        ex_data = [[1, "foo", np.nan], [2, "foo", np.nan]]
        columns = ["rec", "meta", "nested_meta.leaf"]
        expected = DataFrame(ex_data, columns=columns).astype(
            {"nested_meta.leaf": object}
        )
        tm.assert_frame_equal(result, expected)

        # If errors="raise" and nested metadata is null, we should raise with the
        # key of the first missing level
        with pytest.raises(KeyError, match="'leaf' not found"):
            json_normalize(
                data,
                record_path="value",
                meta=["meta", ["nested_meta", "leaf"]],
                errors="raise",
            )

    def test_missing_meta_multilevel_record_path_errors_raise(self, missing_metadata):
        # GH41876
        # Ensure errors='raise' works as intended even when a record_path of length
        # greater than one is passed in
        msg = (
            "Key 'name' not found. To replace missing values of "
            "'name' with np.nan, pass in errors='ignore'"
        )
        with pytest.raises(KeyError, match=msg):
            json_normalize(
                data=missing_metadata,
                record_path=["previous_residences", "cities"],
                meta="name",
                errors="raise",
            )

    def test_missing_meta_multilevel_record_path_errors_ignore(self, missing_metadata):
        # GH41876
        # Ensure errors='ignore' works as intended even when a record_path of length
        # greater than one is passed in
        result = json_normalize(
            data=missing_metadata,
            record_path=["previous_residences", "cities"],
            meta="name",
            errors="ignore",
        )
        ex_data = [
            ["Foo York City", "Alice"],
            ["Barmingham", np.nan],
        ]
        columns = ["city_name", "name"]
        expected = DataFrame(ex_data, columns=columns)
        tm.assert_frame_equal(result, expected)

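    def test_multilevel_record_path_sketch(self):
        # Editor's illustrative sketch, not part of upstream pandas: a list
        # record_path descends through each named level before extracting
        # records, so ["previous_residences", "cities"] reads
        # rec["previous_residences"]["cities"] from every record.
        data = [{"previous_residences": {"cities": [{"city_name": "Foo York City"}]}}]
        result = json_normalize(data, record_path=["previous_residences", "cities"])
        assert result["city_name"].tolist() == ["Foo York City"]
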
    def test_donot_drop_nonevalues(self):
        # GH21356
        data = [
            {"info": None, "author_name": {"first": "Smith", "last_name": "Appleseed"}},
            {
                "info": {"created_at": "11/08/1993", "last_updated": "26/05/2012"},
                "author_name": {"first": "Jane", "last_name": "Doe"},
            },
        ]
        result = nested_to_record(data)
        expected = [
            {
                "info": None,
                "author_name.first": "Smith",
                "author_name.last_name": "Appleseed",
            },
            {
                "author_name.first": "Jane",
                "author_name.last_name": "Doe",
                "info.created_at": "11/08/1993",
                "info.last_updated": "26/05/2012",
            },
        ]

        assert result == expected

    def test_nonetype_top_level_bottom_level(self):
        # GH21158: If inner level json has a key with a null value
        # make sure it does not do a new_d.pop twice and except
        data = {
            "id": None,
            "location": {
                "country": {
                    "state": {
                        "id": None,
                        "town.info": {
                            "id": None,
                            "region": None,
                            "x": 49.151580810546875,
                            "y": -33.148521423339844,
                            "z": 27.572303771972656,
                        },
                    }
                }
            },
        }
        result = nested_to_record(data)
        expected = {
            "id": None,
            "location.country.state.id": None,
            "location.country.state.town.info.id": None,
            "location.country.state.town.info.region": None,
            "location.country.state.town.info.x": 49.151580810546875,
            "location.country.state.town.info.y": -33.148521423339844,
            "location.country.state.town.info.z": 27.572303771972656,
        }
        assert result == expected

    def test_nonetype_multiple_levels(self):
        # GH21158: If inner level json has a key with a null value
        # make sure it does not do a new_d.pop twice and except
        data = {
            "id": None,
            "location": {
                "id": None,
                "country": {
                    "id": None,
                    "state": {
                        "id": None,
                        "town.info": {
                            "region": None,
                            "x": 49.151580810546875,
                            "y": -33.148521423339844,
                            "z": 27.572303771972656,
                        },
                    },
                },
            },
        }
        result = nested_to_record(data)
        expected = {
            "id": None,
            "location.id": None,
            "location.country.id": None,
            "location.country.state.id": None,
            "location.country.state.town.info.region": None,
            "location.country.state.town.info.x": 49.151580810546875,
            "location.country.state.town.info.y": -33.148521423339844,
            "location.country.state.town.info.z": 27.572303771972656,
        }
        assert result == expected

    @pytest.mark.parametrize(
        "max_level, expected",
        [
            (
                None,
                [
                    {
                        "CreatedBy.Name": "User001",
                        "Lookup.TextField": "Some text",
                        "Lookup.UserField.Id": "ID001",
                        "Lookup.UserField.Name": "Name001",
                        "Image.a": "b",
                    }
                ],
            ),
            (
                0,
                [
                    {
                        "CreatedBy": {"Name": "User001"},
                        "Lookup": {
                            "TextField": "Some text",
                            "UserField": {"Id": "ID001", "Name": "Name001"},
                        },
                        "Image": {"a": "b"},
                    }
                ],
            ),
            (
                1,
                [
                    {
                        "CreatedBy.Name": "User001",
                        "Lookup.TextField": "Some text",
                        "Lookup.UserField": {"Id": "ID001", "Name": "Name001"},
                        "Image.a": "b",
                    }
                ],
            ),
        ],
    )
    def test_with_max_level(self, max_level, expected, max_level_test_input_data):
        # GH23843: Enhanced JSON normalize
        output = nested_to_record(max_level_test_input_data, max_level=max_level)
        assert output == expected

    def test_with_large_max_level(self):
        # GH23843: Enhanced JSON normalize
        max_level = 100
        input_data = [
            {
                "CreatedBy": {
                    "user": {
                        "name": {"firstname": "Leo", "LastName": "Thomson"},
                        "family_tree": {
                            "father": {
                                "name": "Father001",
                                "father": {
                                    "Name": "Father002",
                                    "father": {
                                        "name": "Father003",
                                        "father": {"Name": "Father004"},
                                    },
                                },
                            }
                        },
                    }
                }
            }
        ]
        expected = [
            {
                "CreatedBy.user.name.firstname": "Leo",
                "CreatedBy.user.name.LastName": "Thomson",
                "CreatedBy.user.family_tree.father.name": "Father001",
                "CreatedBy.user.family_tree.father.father.Name": "Father002",
                "CreatedBy.user.family_tree.father.father.father.name": "Father003",
                "CreatedBy.user.family_tree.father.father.father.father.Name": "Father004",  # noqa: E501
            }
        ]
        output = nested_to_record(input_data, max_level=max_level)
        assert output == expected

    def test_series_non_zero_index(self):
        # GH 19020
        data = {
            0: {"id": 1, "name": "Foo", "elements": {"a": 1}},
            1: {"id": 2, "name": "Bar", "elements": {"b": 2}},
            2: {"id": 3, "name": "Baz", "elements": {"c": 3}},
        }
        s = Series(data)
        s.index = [1, 2, 3]
        result = json_normalize(s)
        expected = DataFrame(
            {
                "id": [1, 2, 3],
                "name": ["Foo", "Bar", "Baz"],
                "elements.a": [1.0, np.nan, np.nan],
                "elements.b": [np.nan, 2.0, np.nan],
                "elements.c": [np.nan, np.nan, 3.0],
            }
        )
        tm.assert_frame_equal(result, expected)

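    def test_series_input_sketch(self):
        # Editor's illustrative sketch, not part of upstream pandas: a Series
        # of dicts is a valid json_normalize input regardless of its index;
        # the values are normalized just as a list of dicts would be.
        s = Series([{"a": 1}, {"a": 2}], index=[10, 20])
        result = json_normalize(s)
        assert result["a"].tolist() == [1, 2]
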
File diff suppressed because it is too large
@ -0,0 +1,543 @@
from collections.abc import Iterator
from io import StringIO
from pathlib import Path

import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    read_json,
)
import pandas._testing as tm

from pandas.io.json._json import JsonReader

pytestmark = pytest.mark.filterwarnings(
    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@pytest.fixture
def lines_json_df():
    df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    return df.to_json(lines=True, orient="records")


@pytest.fixture(params=["ujson", "pyarrow"])
def engine(request):
    if request.param == "pyarrow":
        pytest.importorskip("pyarrow.json")
    return request.param


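def test_lines_json_df_shape_sketch(lines_json_df):
    # Editor's illustrative sketch, not part of upstream pandas: the fixture
    # above serializes one JSON object per line, the "jsonl" shape that every
    # lines=True test below consumes.
    assert lines_json_df.splitlines() == [
        '{"A":1,"B":4}',
        '{"A":2,"B":5}',
        '{"A":3,"B":6}',
    ]

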
def test_read_jsonl():
    # GH9180
    result = read_json(StringIO('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n'), lines=True)
    expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
    tm.assert_frame_equal(result, expected)


def test_read_jsonl_engine_pyarrow(datapath, engine):
    result = read_json(
        datapath("io", "json", "data", "line_delimited.json"),
        lines=True,
        engine=engine,
    )
    expected = DataFrame({"a": [1, 3, 5], "b": [2, 4, 6]})
    tm.assert_frame_equal(result, expected)


def test_read_datetime(request, engine):
    # GH33787
    if engine == "pyarrow":
        # GH 48893
        reason = "Pyarrow only supports a file path as an input and line delimited json"
        request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))

    df = DataFrame(
        [([1, 2], ["2020-03-05", "2020-04-08T09:58:49+00:00"], "hector")],
        columns=["accounts", "date", "name"],
    )
    json_line = df.to_json(lines=True, orient="records")

    result = read_json(StringIO(json_line), engine=engine)
    expected = DataFrame(
        [[1, "2020-03-05", "hector"], [2, "2020-04-08T09:58:49+00:00", "hector"]],
        columns=["accounts", "date", "name"],
    )
    tm.assert_frame_equal(result, expected)


def test_read_jsonl_unicode_chars():
    # GH15132: non-ascii unicode characters
    # \u201d == RIGHT DOUBLE QUOTATION MARK

    # simulate file handle
    json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    json = StringIO(json)
    result = read_json(json, lines=True)
    expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
    tm.assert_frame_equal(result, expected)

    # simulate string
    json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    result = read_json(StringIO(json), lines=True)
    expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
    tm.assert_frame_equal(result, expected)


def test_to_jsonl():
    # GH9180
    df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
    result = df.to_json(orient="records", lines=True)
    expected = '{"a":1,"b":2}\n{"a":1,"b":2}\n'
    assert result == expected

    df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=["a", "b"])
    result = df.to_json(orient="records", lines=True)
    expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n'
    assert result == expected
    tm.assert_frame_equal(read_json(StringIO(result), lines=True), df)

    # GH15096: escaped characters in columns and data
    df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"])
    result = df.to_json(orient="records", lines=True)
    expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n'
    assert result == expected
    tm.assert_frame_equal(read_json(StringIO(result), lines=True), df)


def test_to_jsonl_count_new_lines():
    # GH36888
    df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
    actual_new_lines_count = df.to_json(orient="records", lines=True).count("\n")
    expected_new_lines_count = 2
    assert actual_new_lines_count == expected_new_lines_count


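def test_chunksize_reader_sketch(lines_json_df):
    # Editor's illustrative sketch, not part of upstream pandas: passing a
    # chunksize makes read_json return a context-manager reader that yields
    # DataFrames of at most `chunksize` rows each.
    with read_json(StringIO(lines_json_df), lines=True, chunksize=2) as reader:
        chunks = list(reader)
    assert [len(c) for c in chunks] == [2, 1]

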
@pytest.mark.parametrize("chunksize", [1, 1.0])
def test_readjson_chunks(request, lines_json_df, chunksize, engine):
    # Basic test that read_json(chunks=True) gives the same result as
    # read_json(chunks=False)
    # GH17048: memory usage when lines=True

    if engine == "pyarrow":
        # GH 48893
        reason = (
            "Pyarrow only supports a file path as an input and line delimited json"
            "and doesn't support chunksize parameter."
        )
        request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))

    unchunked = read_json(StringIO(lines_json_df), lines=True)
    with read_json(
        StringIO(lines_json_df), lines=True, chunksize=chunksize, engine=engine
    ) as reader:
        chunked = pd.concat(reader)

    tm.assert_frame_equal(chunked, unchunked)


def test_readjson_chunksize_requires_lines(lines_json_df, engine):
    msg = "chunksize can only be passed if lines=True"
    with pytest.raises(ValueError, match=msg):
        with read_json(
            StringIO(lines_json_df), lines=False, chunksize=2, engine=engine
        ) as _:
            pass


def test_readjson_chunks_series(request, engine):
    if engine == "pyarrow":
        # GH 48893
        reason = (
            "Pyarrow only supports a file path as an input and line delimited json"
            "and doesn't support chunksize parameter."
        )
        request.applymarker(pytest.mark.xfail(reason=reason))

    # Test reading line-format JSON to Series with chunksize param
    s = pd.Series({"A": 1, "B": 2})

    strio = StringIO(s.to_json(lines=True, orient="records"))
    unchunked = read_json(strio, lines=True, typ="Series", engine=engine)

    strio = StringIO(s.to_json(lines=True, orient="records"))
    with read_json(
        strio, lines=True, typ="Series", chunksize=1, engine=engine
    ) as reader:
        chunked = pd.concat(reader)

    tm.assert_series_equal(chunked, unchunked)


def test_readjson_each_chunk(request, lines_json_df, engine):
    if engine == "pyarrow":
        # GH 48893
        reason = (
            "Pyarrow only supports a file path as an input and line delimited json"
            "and doesn't support chunksize parameter."
        )
        request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))

    # Other tests check that the final result of read_json(chunksize=True)
    # is correct. This checks the intermediate chunks.
    with read_json(
        StringIO(lines_json_df), lines=True, chunksize=2, engine=engine
    ) as reader:
        chunks = list(reader)
    assert chunks[0].shape == (2, 2)
    assert chunks[1].shape == (1, 2)


def test_readjson_chunks_from_file(request, engine):
    if engine == "pyarrow":
        # GH 48893
        reason = (
            "Pyarrow only supports a file path as an input and line delimited json"
            "and doesn't support chunksize parameter."
        )
        request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))

    with tm.ensure_clean("test.json") as path:
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        df.to_json(path, lines=True, orient="records")
        with read_json(path, lines=True, chunksize=1, engine=engine) as reader:
            chunked = pd.concat(reader)
        unchunked = read_json(path, lines=True, engine=engine)
        tm.assert_frame_equal(unchunked, chunked)


@pytest.mark.parametrize("chunksize", [None, 1])
def test_readjson_chunks_closes(chunksize):
    with tm.ensure_clean("test.json") as path:
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        df.to_json(path, lines=True, orient="records")
        reader = JsonReader(
            path,
            orient=None,
            typ="frame",
            dtype=True,
            convert_axes=True,
            convert_dates=True,
            keep_default_dates=True,
            precise_float=False,
            date_unit=None,
            encoding=None,
            lines=True,
            chunksize=chunksize,
            compression=None,
            nrows=None,
        )
        with reader:
            reader.read()
        assert (
            reader.handles.handle.closed
        ), f"didn't close stream with chunksize = {chunksize}"


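def test_reader_close_sketch():
    # Editor's illustrative sketch, not part of upstream pandas: using the
    # public read_json as a context manager closes the file handle it opened,
    # the same behaviour the JsonReader assertion above checks directly.
    with tm.ensure_clean("test.json") as path:
        DataFrame({"A": [1]}).to_json(path, lines=True, orient="records")
        with read_json(path, lines=True, chunksize=1) as reader:
            pd.concat(reader)
        assert reader.handles.handle.closed

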
@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
def test_readjson_invalid_chunksize(lines_json_df, chunksize, engine):
    msg = r"'chunksize' must be an integer >=1"

    with pytest.raises(ValueError, match=msg):
        with read_json(
            StringIO(lines_json_df), lines=True, chunksize=chunksize, engine=engine
        ) as _:
            pass


@pytest.mark.parametrize("chunksize", [None, 1, 2])
def test_readjson_chunks_multiple_empty_lines(chunksize):
    j = """

    {"A":1,"B":4}



    {"A":2,"B":5}







    {"A":3,"B":6}
    """
    orig = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    test = read_json(StringIO(j), lines=True, chunksize=chunksize)
    if chunksize is not None:
        with test:
            test = pd.concat(test)
    tm.assert_frame_equal(orig, test, obj=f"chunksize: {chunksize}")


def test_readjson_unicode(request, monkeypatch, engine):
    if engine == "pyarrow":
        # GH 48893
        reason = (
            "Pyarrow only supports a file path as an input and line delimited json"
            "and doesn't support chunksize parameter."
        )
        request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))

    with tm.ensure_clean("test.json") as path:
        monkeypatch.setattr("locale.getpreferredencoding", lambda do_setlocale: "cp949")
        with open(path, "w", encoding="utf-8") as f:
            f.write('{"£©µÀÆÖÞßéöÿ":["АБВГДабвгд가"]}')

        result = read_json(path, engine=engine)
        expected = DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]})
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("nrows", [1, 2])
def test_readjson_nrows(nrows, engine):
    # GH 33916
    # Test reading line-format JSON to Series with nrows param
    jsonl = """{"a": 1, "b": 2}
        {"a": 3, "b": 4}
        {"a": 5, "b": 6}
        {"a": 7, "b": 8}"""
    result = read_json(StringIO(jsonl), lines=True, nrows=nrows)
    expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)])
def test_readjson_nrows_chunks(request, nrows, chunksize, engine):
    # GH 33916
    # Test reading line-format JSON to Series with nrows and chunksize param
    if engine == "pyarrow":
        # GH 48893
        reason = (
            "Pyarrow only supports a file path as an input and line delimited json"
            "and doesn't support chunksize parameter."
        )
        request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))

    jsonl = """{"a": 1, "b": 2}
        {"a": 3, "b": 4}
        {"a": 5, "b": 6}
        {"a": 7, "b": 8}"""

    if engine != "pyarrow":
        with read_json(
            StringIO(jsonl), lines=True, nrows=nrows, chunksize=chunksize, engine=engine
        ) as reader:
            chunked = pd.concat(reader)
    else:
        with read_json(
            jsonl, lines=True, nrows=nrows, chunksize=chunksize, engine=engine
        ) as reader:
            chunked = pd.concat(reader)
    expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
    tm.assert_frame_equal(chunked, expected)


def test_readjson_nrows_requires_lines(engine):
    # GH 33916
    # Test ValueError raised if nrows is set without setting lines in read_json
    jsonl = """{"a": 1, "b": 2}
        {"a": 3, "b": 4}
        {"a": 5, "b": 6}
        {"a": 7, "b": 8}"""
    msg = "nrows can only be passed if lines=True"
    with pytest.raises(ValueError, match=msg):
        read_json(jsonl, lines=False, nrows=2, engine=engine)


def test_readjson_lines_chunks_fileurl(request, datapath, engine):
    # GH 27135
    # Test reading line-format JSON from file url
    if engine == "pyarrow":
        # GH 48893
        reason = (
            "Pyarrow only supports a file path as an input and line delimited json"
            "and doesn't support chunksize parameter."
        )
        request.applymarker(pytest.mark.xfail(reason=reason, raises=ValueError))

    df_list_expected = [
        DataFrame([[1, 2]], columns=["a", "b"], index=[0]),
        DataFrame([[3, 4]], columns=["a", "b"], index=[1]),
        DataFrame([[5, 6]], columns=["a", "b"], index=[2]),
    ]
    os_path = datapath("io", "json", "data", "line_delimited.json")
    file_url = Path(os_path).as_uri()
    with read_json(file_url, lines=True, chunksize=1, engine=engine) as url_reader:
        for index, chunk in enumerate(url_reader):
            tm.assert_frame_equal(chunk, df_list_expected[index])


def test_chunksize_is_incremental():
    # See https://github.com/pandas-dev/pandas/issues/34548
    jsonl = (
        """{"a": 1, "b": 2}
        {"a": 3, "b": 4}
        {"a": 5, "b": 6}
        {"a": 7, "b": 8}\n"""
        * 1000
    )

    class MyReader:
        def __init__(self, contents) -> None:
            self.read_count = 0
            self.stringio = StringIO(contents)

        def read(self, *args):
            self.read_count += 1
            return self.stringio.read(*args)

        def __iter__(self) -> Iterator:
            self.read_count += 1
            return iter(self.stringio)

    reader = MyReader(jsonl)
    assert len(list(read_json(reader, lines=True, chunksize=100))) > 1
    assert reader.read_count > 10


@pytest.mark.parametrize("orient_", ["split", "index", "table"])
def test_to_json_append_orient(orient_):
    # GH 35849
    # Test ValueError when orient is not 'records'
    df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    msg = (
        r"mode='a' \(append\) is only supported when "
        "lines is True and orient is 'records'"
    )
    with pytest.raises(ValueError, match=msg):
        df.to_json(mode="a", orient=orient_)


def test_to_json_append_lines():
    # GH 35849
    # Test ValueError when lines is not True
    df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    msg = (
        r"mode='a' \(append\) is only supported when "
        "lines is True and orient is 'records'"
    )
    with pytest.raises(ValueError, match=msg):
        df.to_json(mode="a", lines=False, orient="records")


@pytest.mark.parametrize("mode_", ["r", "x"])
def test_to_json_append_mode(mode_):
    # GH 35849
    # Test ValueError when mode is not supported option
    df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    msg = (
        f"mode={mode_} is not a valid option."
        "Only 'w' and 'a' are currently supported."
    )
    with pytest.raises(ValueError, match=msg):
        df.to_json(mode=mode_, lines=False, orient="records")


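def test_append_mode_sketch():
    # Editor's illustrative sketch, not part of upstream pandas: mode="a" is
    # only valid for line-delimited records output, the one layout that can
    # be extended by appending rows to the end of an existing file.
    df = DataFrame({"col1": [1], "col2": ["a"]})
    with tm.ensure_clean("test.json") as path:
        df.to_json(path, lines=True, orient="records")
        df.to_json(path, mode="a", lines=True, orient="records")
        assert len(read_json(path, lines=True)) == 2

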
def test_to_json_append_output_consistent_columns():
    # GH 35849
    # Testing that resulting output reads in as expected.
    # Testing same columns, new rows
    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})

    expected = DataFrame({"col1": [1, 2, 3, 4], "col2": ["a", "b", "c", "d"]})
    with tm.ensure_clean("test.json") as path:
        # Save dataframes to the same file
        df1.to_json(path, lines=True, orient="records")
        df2.to_json(path, mode="a", lines=True, orient="records")

        # Read path file
        result = read_json(path, lines=True)
        tm.assert_frame_equal(result, expected)


def test_to_json_append_output_inconsistent_columns():
    # GH 35849
    # Testing that resulting output reads in as expected.
    # Testing one new column, one old column, new rows
    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})

    expected = DataFrame(
        {
            "col1": [1, 2, None, None],
            "col2": ["a", "b", "e", "f"],
            "col3": [np.nan, np.nan, "!", "#"],
        }
    )
    with tm.ensure_clean("test.json") as path:
        # Save dataframes to the same file
        df1.to_json(path, mode="a", lines=True, orient="records")
        df3.to_json(path, mode="a", lines=True, orient="records")

        # Read path file
        result = read_json(path, lines=True)
        tm.assert_frame_equal(result, expected)


def test_to_json_append_output_different_columns():
    # GH 35849
    # Testing that resulting output reads in as expected.
    # Testing same, differing and new columns
    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
    df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
    df4 = DataFrame({"col4": [True, False]})

    expected = DataFrame(
        {
            "col1": [1, 2, 3, 4, None, None, None, None],
            "col2": ["a", "b", "c", "d", "e", "f", np.nan, np.nan],
            "col3": [np.nan, np.nan, np.nan, np.nan, "!", "#", np.nan, np.nan],
            "col4": [None, None, None, None, None, None, True, False],
        }
    ).astype({"col4": "float"})
    with tm.ensure_clean("test.json") as path:
        # Save dataframes to the same file
        df1.to_json(path, mode="a", lines=True, orient="records")
        df2.to_json(path, mode="a", lines=True, orient="records")
        df3.to_json(path, mode="a", lines=True, orient="records")
        df4.to_json(path, mode="a", lines=True, orient="records")

        # Read path file
        result = read_json(path, lines=True)
        tm.assert_frame_equal(result, expected)


def test_to_json_append_output_different_columns_reordered():
    # GH 35849
    # Testing that resulting output reads in as expected.
    # Testing specific result column order.
    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
    df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
    df4 = DataFrame({"col4": [True, False]})

    # df4, df3, df2, df1 (in that order)
    expected = DataFrame(
        {
            "col4": [True, False, None, None, None, None, None, None],
            "col2": [np.nan, np.nan, "e", "f", "c", "d", "a", "b"],
            "col3": [np.nan, np.nan, "!", "#", np.nan, np.nan, np.nan, np.nan],
            "col1": [None, None, None, None, 3, 4, 1, 2],
        }
    ).astype({"col4": "float"})
    with tm.ensure_clean("test.json") as path:
        # Save dataframes to the same file
        df4.to_json(path, mode="a", lines=True, orient="records")
        df3.to_json(path, mode="a", lines=True, orient="records")
        df2.to_json(path, mode="a", lines=True, orient="records")
        df1.to_json(path, mode="a", lines=True, orient="records")

        # Read path file
        result = read_json(path, lines=True)
        tm.assert_frame_equal(result, expected)

1087
venv/lib/python3.12/site-packages/pandas/tests/io/json/test_ujson.py
Normal file
File diff suppressed because it is too large