Skip to content

Commit

Permalink
refactor deserialize and add in check for duplicate columns
Browse files Browse the repository at this point in the history
  • Loading branch information
bmoscon authored Nov 3, 2016
1 parent 66c23db commit 5794407
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 14 deletions.
44 changes: 30 additions & 14 deletions arctic/serialization/numpy_arrays.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,24 +165,40 @@ def serialize(self, df):
return ret

def deserialize(self, data, columns=None):
    '''
    Deserializes SON to a DataFrame

    Parameters
    ----------
    data: SON data, or list of SON data
        a single serialized document, or a list of documents whose frames
        are concatenated in order. For a list, the metadata (index columns
        and type) is read from the first document only.
    columns: None, or list of strings
        optionally you can deserialize a subset of the data in the SON. Index
        columns are ALWAYS deserialized, and should not be specified. The
        sequence passed in is never modified.

    Returns
    -------
    pandas dataframe or series

    Raises
    ------
    ValueError
        if, after the index columns have been appended, a column name
        appears more than once (e.g. an index column was listed explicitly)
    '''
    if data == []:
        return pd.DataFrame()

    meta = data[0][METADATA] if isinstance(data, list) else data[METADATA]
    index = INDEX in meta

    if columns:
        # Work on a copy: extending the caller's list in place would leak the
        # index columns back to the caller, and a second call with the same
        # list would then fail the duplicate check below.
        columns = list(columns)
        if index:
            columns.extend(meta[INDEX])
        if len(columns) > len(set(columns)):
            raise ValueError("Duplicate columns specified, cannot de-serialize")

    if isinstance(data, list):
        # When no index columns are stored, let concat renumber the rows;
        # otherwise keep the labels until set_index replaces them below.
        df = pd.concat([self.converter.objify(d, columns) for d in data], ignore_index=not index)
    else:
        df = self.converter.objify(data, columns)

    if index:
        df = df.set_index(meta[INDEX])
    if meta[TYPE] == 'series':
        # A series is stored as a one-column frame; hand back that column
        return df[df.columns[0]]
    return df

Expand Down
11 changes: 11 additions & 0 deletions tests/unit/serialization/test_numpy_arrays.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,14 @@ def test_string_cols_with_nans():
df = pd.DataFrame(data={'one': ['a', 'b', 'c', np.NaN]})

assert(df.equals(f.objify(f.docify(df))))


def test_multi_column_fail():
    '''Explicitly requesting an index column must be rejected as a duplicate.'''
    df = pd.DataFrame(data={'A': [1, 2, 3], 'B': [2, 3, 4], 'C': [3, 4, 5]})
    # 'A' becomes the index, so asking for it again below is the duplicate
    df = df.set_index(['A'])
    n = FrametoArraySerializer()
    a = n.serialize(df)

    with pytest.raises(Exception) as e:
        n.deserialize(a, columns=['A', 'B'])

    # Checked after the block: a statement placed after the raising call
    # inside the `with` body would never execute. `e.value` is the exception
    # that was raised; `e` itself is only pytest's ExceptionInfo wrapper.
    assert 'Duplicate' in str(e.value)

0 comments on commit 5794407

Please sign in to comment.