Skip to content

Commit

Permalink
Json normalize nan support (#25619)
Browse files Browse the repository at this point in the history
  • Loading branch information
antoineviscardi authored and jreback committed Mar 13, 2019
1 parent 69ae24b commit dcf7fce
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 57 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -215,10 +215,10 @@ I/O
- Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`)
- Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`)
- Bug in :func:`read_json` for ``orient='table'`` and string of float column names, as it makes a column name type conversion to Timestamp, which is not applicable because column names are already defined in the JSON schema (:issue:`25435`)
- Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data were filled in the resulting ``DataFrame`` with the string ``"nan"`` instead of ``numpy.nan`` (:issue:`25468`)
- :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AssertionError`` (:issue:`25608`)
- Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`)
-
-


Plotting
Expand Down
3 changes: 2 additions & 1 deletion pandas/io/json/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,7 @@ def _recursive_extract(data, path, seen_meta, level=0):
raise ValueError('Conflicting metadata name {name}, '
'need distinguishing prefix '.format(name=k))

result[k] = np.array(v).repeat(lengths)
# forcing dtype to object to avoid the metadata being cast to string
result[k] = np.array(v, dtype=object).repeat(lengths)

return result
114 changes: 59 additions & 55 deletions pandas/tests/io/json/test_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,25 @@ def author_missing_data():
}]


@pytest.fixture
def missing_metadata():
    """Return two address records, only the first of which has a 'name'.

    The second record omits the 'name' key entirely, so it can serve as
    input whose metadata key is not always present.
    """
    named_record = {
        'name': 'Alice',
        'addresses': [
            {
                'number': 9562,
                'street': 'Morris St.',
                'city': 'Massillon',
                'state': 'OH',
                'zip': 44646,
            },
        ],
    }
    # Deliberately has no 'name' key.
    unnamed_record = {
        'addresses': [
            {
                'number': 8449,
                'street': 'Spring St.',
                'city': 'Elizabethton',
                'state': 'TN',
                'zip': 37643,
            },
        ],
    }
    return [named_record, unnamed_record]


class TestJSONNormalize(object):

def test_simple_records(self):
Expand Down Expand Up @@ -318,66 +337,51 @@ def test_nested_flattens(self):

assert result == expected

def test_json_normalize_errors(self):
# GH14583: If meta keys are not always present
# a new option to set errors='ignore' has been implemented
i = {
"Trades": [{
"general": {
"tradeid": 100,
"trade_version": 1,
"stocks": [{

"symbol": "AAPL",
"name": "Apple",
"price": "0"
}, {
"symbol": "GOOG",
"name": "Google",
"price": "0"
}
]
}
}, {
"general": {
"tradeid": 100,
"stocks": [{
"symbol": "AAPL",
"name": "Apple",
"price": "0"
}, {
"symbol": "GOOG",
"name": "Google",
"price": "0"
}
]
}
}
]
}
j = json_normalize(data=i['Trades'],
record_path=[['general', 'stocks']],
meta=[['general', 'tradeid'],
['general', 'trade_version']],
errors='ignore')
expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''},
'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100},
'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'},
'price': {0: '0', 1: '0', 2: '0', 3: '0'},
'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}}

assert j.fillna('').to_dict() == expected

msg = ("Try running with errors='ignore' as key 'trade_version'"
def test_json_normalize_errors(self, missing_metadata):
    # GH14583:
    # If meta keys are not always present a new option to set
    # errors='ignore' has been implemented

    # With errors='raise', a record lacking the requested meta key
    # ('name') must surface as a KeyError carrying this hint.
    msg = ("Try running with errors='ignore' as key 'name'"
           " is not always present")
    with pytest.raises(KeyError, match=msg):
        # Each keyword is passed exactly once; the input comes from the
        # ``missing_metadata`` fixture rather than a local variable.
        json_normalize(
            data=missing_metadata,
            record_path='addresses',
            meta='name',
            errors='raise')

def test_missing_meta(self, missing_metadata):
    # GH25468
    # If metadata is nullable with errors set to ignore, the null values
    # should be numpy.nan values
    result = json_normalize(
        data=missing_metadata,
        record_path='addresses',
        meta='name',
        errors='ignore')

    # One row per address; the trailing 'name' column holds the metadata,
    # which is np.nan for the record that has no 'name' key.
    ex_data = [
        ['Massillon', 9562, 'OH', 'Morris St.', 44646, 'Alice'],
        ['Elizabethton', 8449, 'TN', 'Spring St.', 37643, np.nan]
    ]
    columns = ['city', 'number', 'state', 'street', 'zip', 'name']
    expected = DataFrame(ex_data, columns=columns)
    tm.assert_frame_equal(result, expected)

def test_donot_drop_nonevalues(self):
# GH21356
data = [
Expand Down

0 comments on commit dcf7fce

Please sign in to comment.