diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index ed4348d25f606..b6d5493aefaa9 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -264,6 +264,8 @@ I/O - Allow custom error values for parse_dates argument of :func:`read_sql`, :func:`read_sql_query` and :func:`read_sql_table` (:issue:`35185`) - Bug in :func:`to_hdf` raising ``KeyError`` when trying to apply for subclasses of ``DataFrame`` or ``Series`` (:issue:`33748`). +- Bug in :func:`json_normalize` resulting in the first element of a generator object not being included in the returned ``DataFrame`` (:issue:`35923`) + Period ^^^^^^ diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 3ed0b5851b395..40aeee67ce2da 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -1,7 +1,7 @@ # --------------------------------------------------------------------- # JSON normalization routines -from collections import defaultdict +from collections import abc, defaultdict import copy from typing import Any, DefaultDict, Dict, Iterable, List, Optional, Union @@ -261,10 +261,15 @@ def _pull_records(js: Dict[str, Any], spec: Union[List, str]) -> List: if isinstance(data, list) and not data: return DataFrame() - - # A bit of a hackjob - if isinstance(data, dict): + elif isinstance(data, dict): + # A bit of a hackjob data = [data] + elif isinstance(data, abc.Iterable) and not isinstance(data, str): + # GH35923 Materialize generators and other non-str iterables into a list so + # that the first element is not dropped from the result + data = list(data) + else: + raise NotImplementedError if record_path is None: if any([isinstance(x, dict) for x in y.values()] for y in data): diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 244302e34337d..46f6367a7227f 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -1,3 +1,4 @@ +from contextlib import nullcontext as does_not_raise 
import json import numpy as np @@ -168,6 +169,22 @@ def test_empty_array(self): expected = DataFrame() tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "data, record_path, error", + [ + ([{"a": 0}, {"a": 1}], None, does_not_raise()), + ({"a": [{"a": 0}, {"a": 1}]}, "a", does_not_raise()), + ('{"a": [{"a": 0}, {"a": 1}]}', None, pytest.raises(NotImplementedError)), + (None, None, pytest.raises(NotImplementedError)), + ], + ) + def test_accepted_input(self, data, record_path, error): + with error: + result = json_normalize(data, record_path=record_path) + expected = DataFrame([0, 1], columns=["a"]) + + tm.assert_frame_equal(result, expected) + def test_simple_normalize_with_separator(self, deep_nested): # GH 14883 result = json_normalize({"A": {"A": 1, "B": 2}}) @@ -518,6 +535,17 @@ def test_meta_non_iterable(self): ) tm.assert_frame_equal(result, expected) + def test_generator(self, state_data): + # GH35923 json_normalize must not skip the first element of a + # generator input + def generator_data(): + yield from state_data[0]["counties"] + + result = json_normalize(generator_data()) + expected = DataFrame(state_data[0]["counties"]) + + tm.assert_frame_equal(result, expected) + class TestNestedToRecord: def test_flat_stays_flat(self):