BUG: Fix pd.json_normalize to not skip the first element of a generator input (#38698)
pandas-dev · Dec 30, 2020 · 52bdfdc · 52bdfdc
1 parent 94810d1
commit 52bdfdc
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 4 deletions.
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -264,6 +264,8 @@ I/O
 - Allow custom error values for parse_dates argument of :func:`read_sql`, :func:`read_sql_query` and :func:`read_sql_table` (:issue:`35185`)
 - Bug in :func:`to_hdf` raising ``KeyError`` when trying to apply
   for subclasses of ``DataFrame`` or ``Series`` (:issue:`33748`).
+- Bug in :func:`json_normalize` resulting in the first element of a generator object not being included in the returned ``DataFrame`` (:issue:`35923`)
+
 
 Period
 ^^^^^^

diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py
@@ -1,7 +1,7 @@
 # ---------------------------------------------------------------------
 # JSON normalization routines
 
-from collections import defaultdict
+from collections import abc, defaultdict
 import copy
 from typing import Any, DefaultDict, Dict, Iterable, List, Optional, Union
 
@@ -261,10 +261,15 @@ def _pull_records(js: Dict[str, Any], spec: Union[List, str]) -> List:
 
     if isinstance(data, list) and not data:
         return DataFrame()
-
-    # A bit of a hackjob
-    if isinstance(data, dict):
+    elif isinstance(data, dict):
+        # A bit of a hackjob
         data = [data]
+    elif isinstance(data, abc.Iterable) and not isinstance(data, str):
+        # GH35923 Fix pd.json_normalize to not skip the first element of a
+        # generator input
+        data = list(data)
+    else:
+        raise NotImplementedError
 
     if record_path is None:
         if any([isinstance(x, dict) for x in y.values()] for y in data):

diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py
@@ -1,3 +1,4 @@
+from contextlib import nullcontext as does_not_raise
 import json
 
 import numpy as np
@@ -168,6 +169,22 @@ def test_empty_array(self):
         expected = DataFrame()
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize(
+        "data, record_path, error",
+        [
+            ([{"a": 0}, {"a": 1}], None, does_not_raise()),
+            ({"a": [{"a": 0}, {"a": 1}]}, "a", does_not_raise()),
+            ('{"a": [{"a": 0}, {"a": 1}]}', None, pytest.raises(NotImplementedError)),
+            (None, None, pytest.raises(NotImplementedError)),
+        ],
+    )
+    def test_accepted_input(self, data, record_path, error):
+        with error:
+            result = json_normalize(data, record_path=record_path)
+            expected = DataFrame([0, 1], columns=["a"])
+
+            tm.assert_frame_equal(result, expected)
+
     def test_simple_normalize_with_separator(self, deep_nested):
         # GH 14883
         result = json_normalize({"A": {"A": 1, "B": 2}})
@@ -518,6 +535,17 @@ def test_meta_non_iterable(self):
         )
         tm.assert_frame_equal(result, expected)
 
+    def test_generator(self, state_data):
+        # GH35923 Fix pd.json_normalize to not skip the first element of a
+        # generator input
+        def generator_data():
+            yield from state_data[0]["counties"]
+
+        result = json_normalize(generator_data())
+        expected = DataFrame(state_data[0]["counties"])
+
+        tm.assert_frame_equal(result, expected)
+
 
 class TestNestedToRecord:
     def test_flat_stays_flat(self):