Skip to content
This repository has been archived by the owner on Feb 2, 2024. It is now read-only.

Commit

Permalink
Fixes incorrect definition of layout for SeriesType
Browse files Browse the repository at this point in the history
Details: the underlying data type of a Series was defined from the
PyObject dtype only and did not take into account the layout of the
original array. As a result, 'C' layout was always inferred, even
when the original array had a different layout, which broke
iteration over such Series (DF columns).

Fixes #996.
  • Loading branch information
kozlov-alexey committed Dec 21, 2021
1 parent 1ebf55c commit b55020d
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 15 deletions.
33 changes: 24 additions & 9 deletions sdc/hiframes/boxing.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@
from sdc.datatypes.categorical.types import CategoricalDtypeType, Categorical
from sdc.datatypes.categorical.boxing import unbox_Categorical, box_Categorical
from sdc.hiframes.pd_series_ext import SeriesType
from sdc.hiframes.pd_series_type import _get_series_array_type
from sdc.hiframes.pd_dataframe_ext import get_structure_maps
from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types

Expand All @@ -70,7 +69,7 @@ def typeof_pd_dataframe(val, c):

col_names = tuple(val.columns.tolist())
# TODO: support other types like string and timestamp
col_types = get_hiframes_dtypes(val)
col_types = _infer_df_col_types(val)
index_type = _infer_index_type(val.index)
column_loc, _, _ = get_structure_maps(col_types, col_names)

Expand All @@ -82,8 +81,24 @@ def typeof_pd_dataframe(val, c):
def typeof_pd_series(val, c):
    """ Infer the numba SeriesType for a pandas Series.

    The underlying data type is taken from the Series values whenever numba
    types them as an array of non-object dtype, so the layout of the original
    array is kept instead of being derived from the dtype alone. In all other
    cases ``data`` is passed as None and is defined later via dtype in
    SeriesType init.
    """
    index_type = _infer_index_type(val.index)
    is_named = val.name is not None

    # attempt to define numba Series data type via Series values,
    # if not successful, define it later via dtype in SeriesType init
    try:
        underlying_type = numba.typeof(val.values)
    except ValueError:
        underlying_type = None

    # only arrays of non-object dtype are accepted as the underlying type
    if (not isinstance(underlying_type, types.Array)
            or isinstance(underlying_type.dtype, types.PyObject)):
        underlying_type = None

    return SeriesType(
        dtype=_infer_series_dtype(val),
        data=underlying_type,
        index=index_type,
        is_named=is_named)


@unbox(DataFrameType)
Expand Down Expand Up @@ -140,13 +155,13 @@ def unbox_dataframe(typ, val, c):
return NativeValue(dataframe._getvalue(), is_error=c.builder.load(errorptr))


def _infer_df_col_types(df):
    """ Infer column data types for a pandas DataFrame.

    Each column is typed through numba.typeof and the ``data`` attribute of
    the resulting type is collected, so the column type follows the actual
    column values rather than the dtype alone.

    Returns a tuple with one entry per column, in column order.
    """
    col_names = df.columns.tolist()
    return tuple(numba.typeof(df[cname]).data for cname in col_names)


# previous name of this helper, kept so that existing callers still work
get_hiframes_dtypes = _infer_df_col_types


def _infer_series_dtype(S):
Expand Down
29 changes: 28 additions & 1 deletion sdc/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ def test_impl():
self.assertEqual(hpat_func(), test_impl())

def test_create_with_series1(self):
""" Create pandas DataFrame from Series of different dtypes """
def test_impl(n):
A = pd.Series(np.ones(n, dtype=np.int64))
B = pd.Series(np.zeros(n, dtype=np.float64))
Expand All @@ -143,7 +144,7 @@ def test_impl(n):
pd.testing.assert_frame_equal(hpat_func(n), test_impl(n))

def test_create_with_series2(self):
# test creating dataframe from passed series
""" Test creating pandas DataFrame from passed Series """
def test_impl(A):
df = pd.DataFrame({'A': A})
return (df.A == 2).sum()
Expand All @@ -153,6 +154,18 @@ def test_impl(A):
df = pd.DataFrame({'A': np.arange(n)})
self.assertEqual(hpat_func(df.A), test_impl(df.A))

def test_create_with_series3(self):
    """ Check DataFrame creation from Series that differ in data layout """
    def test_impl(A, B):
        df = pd.DataFrame({'A': A, 'B': B})
        return df.A.sum(), df.B.sum()

    sdc_func = self.jit(test_impl)

    n = 11
    # first Series wraps a plain range, second one wraps a strided slice
    contiguous = pd.Series(np.arange(n))
    strided = pd.Series(np.arange(2 * n)[::2])

    result = sdc_func(contiguous, strided)
    result_ref = test_impl(contiguous, strided)
    self.assertEqual(result, result_ref)

def test_df_create_param_index_default(self):
def test_impl():
data = {'A': ['a', 'b'], 'B': [2, 3]}
Expand Down Expand Up @@ -219,6 +232,8 @@ def test_impl():
pd.testing.assert_frame_equal(hpat_func(), test_impl())

def test_pass_df1(self):
""" Test passing df with contiguous data layout """

def test_impl(df):
return (df.A == 2).sum()
hpat_func = self.jit(test_impl)
Expand All @@ -227,6 +242,18 @@ def test_impl(df):
df = pd.DataFrame({'A': np.arange(n)})
self.assertEqual(hpat_func(df), test_impl(df))

def test_pass_df_2(self):
    """ Check passing a df whose data layout is non-contiguous """

    def test_impl(df):
        return df.B.sum()

    sdc_func = self.jit(test_impl)

    shape = (4, 6)  # (n_rows, n_cols)
    # one upper-case letter per column: 'A', 'B', ...
    col_names = list(string.ascii_uppercase[:shape[1]])
    df = pd.DataFrame(np.random.rand(*shape), columns=col_names)

    result = sdc_func(df)
    result_ref = test_impl(df)
    self.assertAlmostEqual(result, result_ref)

def test_pass_df_str(self):
def test_impl(df):
return (df.A == 'a').sum()
Expand Down
48 changes: 43 additions & 5 deletions sdc/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
gen_strlist,
_make_func_from_text)
from sdc.utilities.sdc_typing_utils import SDCLimitation
from sdc.hiframes.pd_series_type import SeriesType


_cov_corr_series = [(pd.Series(x), pd.Series(y)) for x, y in [
Expand Down Expand Up @@ -339,25 +340,43 @@ def test_impl(name):

pd.testing.assert_series_equal(hpat_func('A'), test_impl('A'))

@skip_numba_jit
def test_create_series_data_layouts(self):
    """ Check Series creation from arrays of different data layouts """
    def test_impl(data):
        vals = pd.Series(data).values
        return vals[0], vals[-1]

    sdc_func = self.jit(test_impl)

    n = 10
    c_layout_data = np.arange(n)            # 'C' layout
    a_layout_data = np.arange(2 * n)[::2]   # 'A' layout
    # no 'F' layout for 1d arrays

    for data in (c_layout_data, a_layout_data):
        with self.subTest(layout=numba.typeof(data).layout):
            actual = sdc_func(data)
            expected = test_impl(data)
            self.assertEqual(actual, expected)

def test_pass_series1(self):
    """ Test passing a named Series and that it is typed as SeriesType """
    def test_impl(A):
        return (A == 2).sum()
    sdc_func = self.jit(test_impl)

    n = 11
    S = pd.Series(np.arange(n), name='A')
    self.assertEqual(sdc_func(S), test_impl(S))
    # the argument has to be recognized by numba as a Series
    self.assertIsInstance(numba.typeof(S), SeriesType)

@skip_numba_jit
def test_pass_series_str(self):
    """ Check passing a named Series of strings """
    def test_impl(A):
        return (A == 'a').sum()

    sdc_func = self.jit(test_impl)

    S = pd.Series(['a', 'b', 'c'], name='A')
    result = sdc_func(S)
    result_ref = test_impl(S)
    self.assertEqual(result, result_ref)
    # the argument has to be recognized by numba as a Series
    self.assertIsInstance(numba.typeof(S), SeriesType)

def test_pass_series_all_indexes(self):
def test_impl(A):
Expand All @@ -378,6 +397,25 @@ def test_impl(A):
S = pd.Series(np.arange(n), index, name='A')
pd.testing.assert_series_equal(hpat_func(S), test_impl(S))

def test_pass_series_data_layouts(self):
    """ Check passing Series whose data have different layouts """
    def test_impl(S):
        vals = S.values
        return vals[0], vals[-1]

    sdc_func = self.jit(test_impl)

    n = 10
    c_layout_series = pd.Series(np.arange(n))         # 'C' layout
    a_layout_series = pd.Series(np.arange(n))[::2]    # 'A' layout
    # no 'F' layout for Series

    for s in (c_layout_series, a_layout_series):
        with self.subTest(layout=numba.typeof(s).data.layout):
            actual = sdc_func(s)
            expected = test_impl(s)
            self.assertEqual(actual, expected)

def test_series_getattr_size(self):
def test_impl(S):
return S.size
Expand Down

0 comments on commit b55020d

Please sign in to comment.