Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Fix pd.json_normalize to not skip the first element of a generator input #38698

Merged
merged 11 commits into from
Dec 30, 2020
Merged
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,8 @@ I/O
- Allow custom error values for parse_dates argument of :func:`read_sql`, :func:`read_sql_query` and :func:`read_sql_table` (:issue:`35185`)
- Bug in :func:`to_hdf` raising ``KeyError`` when applied to
subclasses of ``DataFrame`` or ``Series`` (:issue:`33748`).
- Bug in :func:`json_normalize` resulting in the first element of a generator object not being included in the returned ``DataFrame`` (:issue:`35923`)


Period
^^^^^^
Expand Down
13 changes: 9 additions & 4 deletions pandas/io/json/_normalize.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# ---------------------------------------------------------------------
# JSON normalization routines

from collections import defaultdict
from collections import abc, defaultdict
import copy
from typing import Any, DefaultDict, Dict, Iterable, List, Optional, Union

Expand Down Expand Up @@ -261,10 +261,15 @@ def _pull_records(js: Dict[str, Any], spec: Union[List, str]) -> List:

if isinstance(data, list) and not data:
return DataFrame()

# Wrap a lone dict in a list so it is handled like a list of records
if isinstance(data, dict):
elif isinstance(data, dict):
# Wrap a lone dict in a list so it is handled like a list of records
jreback marked this conversation as resolved.
Show resolved Hide resolved
data = [data]
elif isinstance(data, abc.Iterable) and not isinstance(data, str):
# GH35923: materialize generators and other non-string iterables into a
# list; previously the first element of a generator input was skipped
data = list(data)
else:
raise NotImplementedError

if record_path is None:
if any([isinstance(x, dict) for x in y.values()] for y in data):
Expand Down
28 changes: 28 additions & 0 deletions pandas/tests/io/json/test_normalize.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from contextlib import nullcontext as does_not_raise
import json

import numpy as np
Expand Down Expand Up @@ -168,6 +169,22 @@ def test_empty_array(self):
expected = DataFrame()
tm.assert_frame_equal(result, expected)

# Checks which top-level inputs json_normalize accepts. Each case supplies the
# input, the record_path to use, and a context manager describing the expected
# outcome: does_not_raise() (contextlib.nullcontext) for accepted inputs, or
# pytest.raises(NotImplementedError) for rejected ones.
@pytest.mark.parametrize(
"data, record_path, error",
[
# accepted: a list of flat records
([{"a": 0}, {"a": 1}], None, does_not_raise()),
# accepted: a dict whose records are reached through record_path="a"
({"a": [{"a": 0}, {"a": 1}]}, "a", does_not_raise()),
# rejected: a JSON document passed as a string
('{"a": [{"a": 0}, {"a": 1}]}', None, pytest.raises(NotImplementedError)),
# rejected: None
(None, None, pytest.raises(NotImplementedError)),
],
)
def test_accepted_input(self, data, record_path, error):
# `error` is the context manager chosen by the parametrization above.
with error:
result = json_normalize(data, record_path=record_path)
# Both accepted cases are expected to produce a single column "a" holding 0, 1.
expected = DataFrame([0, 1], columns=["a"])

# NOTE(review): indentation was lost in this rendering. The comparison
# presumably belongs inside the `with` block, because `result` is never bound
# when json_normalize raises -- confirm against the repository.
tm.assert_frame_equal(result, expected)

def test_simple_normalize_with_separator(self, deep_nested):
# GH 14883
result = json_normalize({"A": {"A": 1, "B": 2}})
Expand Down Expand Up @@ -518,6 +535,17 @@ def test_meta_non_iterable(self):
)
tm.assert_frame_equal(result, expected)

# Regression test: normalizing a generator of records must give the same frame
# as building a DataFrame directly from the underlying list of records.
# NOTE(review): state_data is presumably a pytest fixture defined outside this
# view -- confirm.
def test_generator(self, state_data):
# GH35923: json_normalize used to skip the first element of a generator
# input
def generator_data():
# Lazily yield the county records of the first state entry.
yield from state_data[0]["counties"]

result = json_normalize(generator_data())
expected = DataFrame(state_data[0]["counties"])

tm.assert_frame_equal(result, expected)


class TestNestedToRecord:
def test_flat_stays_flat(self):
Expand Down