Skip to content
This repository has been archived by the owner on Feb 2, 2024. It is now read-only.

Fixes incorrect definition of layout for SeriesType #1001

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 24 additions & 9 deletions sdc/hiframes/boxing.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@
from sdc.datatypes.categorical.types import CategoricalDtypeType, Categorical
from sdc.datatypes.categorical.boxing import unbox_Categorical, box_Categorical
from sdc.hiframes.pd_series_ext import SeriesType
from sdc.hiframes.pd_series_type import _get_series_array_type
from sdc.hiframes.pd_dataframe_ext import get_structure_maps
from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types

Expand All @@ -70,7 +69,7 @@ def typeof_pd_dataframe(val, c):

col_names = tuple(val.columns.tolist())
# TODO: support other types like string and timestamp
col_types = get_hiframes_dtypes(val)
col_types = _infer_df_col_types(val)
index_type = _infer_index_type(val.index)
column_loc, _, _ = get_structure_maps(col_types, col_names)

Expand All @@ -82,8 +81,24 @@ def typeof_pd_dataframe(val, c):
def typeof_pd_series(val, c):
index_type = _infer_index_type(val.index)
is_named = val.name is not None

# attempt to define numba Series data type via Series values,
# if not successful, define it later via dtype in SeriesType init
underlying_type = None
try:
underlying_type = numba.typeof(val.values)
except ValueError:
pass

if not (isinstance(underlying_type, types.Array)
and not isinstance(underlying_type.dtype, types.PyObject)):
underlying_type = None

return SeriesType(
_infer_series_dtype(val), index=index_type, is_named=is_named)
dtype=_infer_series_dtype(val),
data=underlying_type,
index=index_type,
is_named=is_named)


@unbox(DataFrameType)
Expand Down Expand Up @@ -140,13 +155,13 @@ def unbox_dataframe(typ, val, c):
return NativeValue(dataframe._getvalue(), is_error=c.builder.load(errorptr))


def get_hiframes_dtypes(df):
"""get hiframe data types for a pandas dataframe
"""
def _infer_df_col_types(df):
""" Infer column data types for a pandas DataFrame """

col_names = df.columns.tolist()
hi_typs = [_get_series_array_type(_infer_series_dtype(df[cname]))
for cname in col_names]
return tuple(hi_typs)
col_typs = [numba.typeof(df[cname]).data for cname in col_names]

return tuple(col_typs)


def _infer_series_dtype(S):
Expand Down
29 changes: 28 additions & 1 deletion sdc/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ def test_impl():
self.assertEqual(hpat_func(), test_impl())

def test_create_with_series1(self):
""" Create pandas DataFrame from Series of different dtypes """
def test_impl(n):
A = pd.Series(np.ones(n, dtype=np.int64))
B = pd.Series(np.zeros(n, dtype=np.float64))
Expand All @@ -143,7 +144,7 @@ def test_impl(n):
pd.testing.assert_frame_equal(hpat_func(n), test_impl(n))

def test_create_with_series2(self):
# test creating dataframe from passed series
""" Test creating pandas DataFrame from passed Series """
def test_impl(A):
df = pd.DataFrame({'A': A})
return (df.A == 2).sum()
Expand All @@ -153,6 +154,18 @@ def test_impl(A):
df = pd.DataFrame({'A': np.arange(n)})
self.assertEqual(hpat_func(df.A), test_impl(df.A))

def test_create_with_series3(self):
    """ Test creating pandas DataFrame from Series of different layouts

    Column 'A' wraps a freshly created array, column 'B' wraps an
    every-second-element slice of a longer array. The per-column sums
    returned by the function obtained from self.jit must equal the ones
    returned by the plain Python implementation.
    """
    def test_impl(A, B):
        df = pd.DataFrame({'A': A, 'B': B})
        return df.A.sum(), df.B.sum()

    sdc_func = self.jit(test_impl)

    length = 11
    dense = pd.Series(np.arange(length))
    # slicing with step 2 yields data that skips every other element
    strided = pd.Series(np.arange(2 * length)[::2])

    result = sdc_func(dense, strided)
    result_ref = test_impl(dense, strided)
    self.assertEqual(result, result_ref)

def test_df_create_param_index_default(self):
def test_impl():
data = {'A': ['a', 'b'], 'B': [2, 3]}
Expand Down Expand Up @@ -219,6 +232,8 @@ def test_impl():
pd.testing.assert_frame_equal(hpat_func(), test_impl())

def test_pass_df1(self):
""" Test passing df with contiguous data layout """

def test_impl(df):
return (df.A == 2).sum()
hpat_func = self.jit(test_impl)
Expand All @@ -227,6 +242,18 @@ def test_impl(df):
df = pd.DataFrame({'A': np.arange(n)})
self.assertEqual(hpat_func(df), test_impl(df))

def test_pass_df_2(self):
    """ Test passing df with non-contiguous data layout

    All columns are created from a single 2D array handed to the
    DataFrame constructor. The values are drawn from a fixed-seed
    generator so that a failing run can be reproduced exactly.
    """

    def test_impl(df):
        return df.B.sum()
    sdc_func = self.jit(test_impl)

    n_rows, n_cols = 4, 6
    col_names = list(string.ascii_uppercase[:n_cols])
    # local RandomState: deterministic data without touching the global seed
    data = np.random.RandomState(0).rand(n_rows, n_cols)
    df = pd.DataFrame(data, columns=col_names)
    self.assertAlmostEqual(sdc_func(df), test_impl(df))

def test_pass_df_str(self):
def test_impl(df):
return (df.A == 'a').sum()
Expand Down
48 changes: 43 additions & 5 deletions sdc/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
gen_strlist,
_make_func_from_text)
from sdc.utilities.sdc_typing_utils import SDCLimitation
from sdc.hiframes.pd_series_type import SeriesType


_cov_corr_series = [(pd.Series(x), pd.Series(y)) for x, y in [
Expand Down Expand Up @@ -339,25 +340,43 @@ def test_impl(name):

pd.testing.assert_series_equal(hpat_func('A'), test_impl('A'))

@skip_numba_jit
def test_create_series_data_layouts(self):
    """ Test creating Series from arrays of different data layouts

    For every input array the first and the last element of the values of
    the created Series are compared against the pandas reference.
    """
    def test_impl(data):
        vals = pd.Series(data).values
        return vals[0], vals[-1]

    sdc_func = self.jit(test_impl)

    size = 10
    contiguous = np.arange(size)            # 'C' layout
    strided = np.arange(2 * size)[::2]      # 'A' layout
    # no 'F' layout for 1d arrays

    for arr in (contiguous, strided):
        with self.subTest(layout=numba.typeof(arr).layout):
            self.assertEqual(sdc_func(arr), test_impl(arr))

def test_pass_series1(self):
# TODO: check to make sure it is series type
def test_impl(A):
return (A == 2).sum()
hpat_func = self.jit(test_impl)
sdc_func = self.jit(test_impl)

n = 11
S = pd.Series(np.arange(n), name='A')
self.assertEqual(hpat_func(S), test_impl(S))
self.assertEqual(sdc_func(S), test_impl(S))
self.assertIsInstance(numba.typeof(S), SeriesType)

@skip_numba_jit
def test_pass_series_str(self):
def test_impl(A):
return (A == 'a').sum()
hpat_func = self.jit(test_impl)

S = pd.Series(['a', 'b', 'c'], name='A')
self.assertEqual(hpat_func(S), test_impl(S))
self.assertIsInstance(numba.typeof(S), SeriesType)

def test_pass_series_all_indexes(self):
def test_impl(A):
Expand All @@ -378,6 +397,25 @@ def test_impl(A):
S = pd.Series(np.arange(n), index, name='A')
pd.testing.assert_series_equal(hpat_func(S), test_impl(S))

def test_pass_series_data_layouts(self):
    """ Test passing Series with different data layouts as an argument

    For every input Series the first and the last element of its values
    are compared against the pandas reference.
    """
    def test_impl(S):
        vals = S.values
        return vals[0], vals[-1]

    sdc_func = self.jit(test_impl)

    size = 10
    contiguous = pd.Series(np.arange(size))         # 'C' layout
    strided = pd.Series(np.arange(size))[::2]       # 'A' layout
    # no 'F' layout for Series

    for series in (contiguous, strided):
        with self.subTest(layout=numba.typeof(series).data.layout):
            self.assertEqual(sdc_func(series), test_impl(series))

def test_series_getattr_size(self):
def test_impl(S):
return S.size
Expand Down