Skip to content

Commit

Permalink
BUG: Fix pd.json_normalize to not skip the first element of a generator input (#38698)
Browse files Browse the repository at this point in the history
  • Loading branch information
avinashpancham authored Dec 30, 2020
1 parent 94810d1 commit 52bdfdc
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 4 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,8 @@ I/O
- Allow custom error values for parse_dates argument of :func:`read_sql`, :func:`read_sql_query` and :func:`read_sql_table` (:issue:`35185`)
- Bug in :func:`to_hdf` raising ``KeyError`` when applied to
subclasses of ``DataFrame`` or ``Series`` (:issue:`33748`).
- Bug in :func:`json_normalize` resulting in the first element of a generator object not being included in the returned ``DataFrame`` (:issue:`35923`)


Period
^^^^^^
Expand Down
13 changes: 9 additions & 4 deletions pandas/io/json/_normalize.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# ---------------------------------------------------------------------
# JSON normalization routines

from collections import defaultdict
from collections import abc, defaultdict
import copy
from typing import Any, DefaultDict, Dict, Iterable, List, Optional, Union

Expand Down Expand Up @@ -261,10 +261,15 @@ def _pull_records(js: Dict[str, Any], spec: Union[List, str]) -> List:

if isinstance(data, list) and not data:
return DataFrame()

# A bit of a hackjob
if isinstance(data, dict):
elif isinstance(data, dict):
# A bit of a hackjob
data = [data]
elif isinstance(data, abc.Iterable) and not isinstance(data, str):
# GH35923 Fix pd.json_normalize to not skip the first element of a
# generator input
data = list(data)
else:
raise NotImplementedError

if record_path is None:
if any([isinstance(x, dict) for x in y.values()] for y in data):
Expand Down
28 changes: 28 additions & 0 deletions pandas/tests/io/json/test_normalize.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from contextlib import nullcontext as does_not_raise
import json

import numpy as np
Expand Down Expand Up @@ -168,6 +169,22 @@ def test_empty_array(self):
expected = DataFrame()
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
    "data, record_path, error",
    [
        # Accepted: a list of records, and a dict holding records under a key.
        ([{"a": 0}, {"a": 1}], None, does_not_raise()),
        ({"a": [{"a": 0}, {"a": 1}]}, "a", does_not_raise()),
        # Rejected: a raw JSON string and None.
        ('{"a": [{"a": 0}, {"a": 1}]}', None, pytest.raises(NotImplementedError)),
        (None, None, pytest.raises(NotImplementedError)),
    ],
)
def test_accepted_input(self, data, record_path, error):
    """Check which input types json_normalize accepts and which it rejects.

    ``error`` is a context manager: a no-op for inputs that should be
    normalized, or ``pytest.raises(NotImplementedError)`` for inputs that
    should be refused.
    """
    with error:
        # For the rejected inputs this call is expected to raise, so the
        # comparison below only runs for the accepted inputs.
        frame = json_normalize(data, record_path=record_path)

        tm.assert_frame_equal(frame, DataFrame([0, 1], columns=["a"]))

def test_simple_normalize_with_separator(self, deep_nested):
# GH 14883
result = json_normalize({"A": {"A": 1, "B": 2}})
Expand Down Expand Up @@ -518,6 +535,17 @@ def test_meta_non_iterable(self):
)
tm.assert_frame_equal(result, expected)

def test_generator(self, state_data):
    """json_normalize must not drop the first item of a generator input.

    Regression test for GH35923.
    """
    counties = state_data[0]["counties"]

    def iter_counties():
        # A generator function, so json_normalize receives a generator
        # object rather than a list.
        yield from counties

    normalized = json_normalize(iter_counties())

    tm.assert_frame_equal(normalized, DataFrame(counties))


class TestNestedToRecord:
def test_flat_stays_flat(self):
Expand Down

0 comments on commit 52bdfdc

Please sign in to comment.