From 7ec05f3511ba0a1420ad2d67a85ea6d21df5986b Mon Sep 17 00:00:00 2001 From: Rubtsowa Date: Tue, 28 Apr 2020 13:30:27 +0300 Subject: [PATCH 01/15] Impl Series.combine --- examples/series/series_combine.py | 40 ++++++++++++ sdc/datatypes/hpat_pandas_series_functions.py | 62 +++++++++++++++++++ sdc/tests/test_series.py | 8 --- 3 files changed, 102 insertions(+), 8 deletions(-) create mode 100644 examples/series/series_combine.py diff --git a/examples/series/series_combine.py b/examples/series/series_combine.py new file mode 100644 index 000000000..cbcea968e --- /dev/null +++ b/examples/series/series_combine.py @@ -0,0 +1,40 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numpy as np +import pandas as pd +from numba import njit + + +@njit +def series_copy(): + s1 = pd.Series([1, 5, 2]) + s2 = pd.Series([0, 3, 7, 8, 0]) + + return s1.combine(s2, max, fill_value=0) # Expect new series of 1, 5, 7, 8, 0 + + +print(series_copy()) diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 5e7e60a27..d3f6bc145 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -4818,3 +4818,65 @@ def sdc_pandas_series_groupby_impl(self, by=None, axis=0, level=None, as_index=T return init_series_groupby(self, by, grouped, sort) return sdc_pandas_series_groupby_impl + + +@sdc_overload_method(SeriesType, 'combine') +def sdc_pandas_series_combine(self, other, func, fill_value=None): + """ + Intel Scalable Dataframe Compiler User Guide + ******************************************** + + Pandas API: pandas.Series.combine + + Limitations + ----------- + - Only supports the case when data in series of the same type + + Examples + -------- + .. literalinclude:: ../../../examples/series/series_combine.py + :language: python + :lines: 27- + :caption: Combined the Series with a Series according to func. + :name: ex_series_combine + + .. 
command-output:: python ./series/series_combine.py + :cwd: ../../../examples + + Intel Scalable Dataframe Compiler Developer Guide + ************************************************* + Pandas Series method :meth:`pandas.Series.combine` implementation. + + .. only:: developer + + Tests: python -m sdc.runtests -k sdc.tests.test_series.TestSeries.test_series_combine* + """ + _func_name = 'Method Series.combine().' + + ty_checker = TypeChecker(_func_name) + ty_checker.check(self, SeriesType) + + ty_checker.check(other, SeriesType) + + if not isinstance(fill_value, (types.Omitted, types.NoneType, types.Number)) and fill_value is not None: + ty_checker.raise_exc(fill_value, 'number', 'fill_value') + + def sdc_pandas_series_combine_impl(self, other, func, fill_value=None): + + if fill_value is None: + fill_value = numpy.nan + + len_val = max(len(self), len(other)) + result = numpy.empty(len_val, self._data.dtype) + for ind in range(len_val): + val_self = self._data[ind] + val_other = other._data[ind] + if len(self) < ind + 1: + val_self = fill_value + if len(other) < ind + 1: + val_other = fill_value + result[ind] = func(val_self, val_other) + + return pandas.Series(result) + + return sdc_pandas_series_combine_impl diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py index 73fb432af..0ceb63388 100644 --- a/sdc/tests/test_series.py +++ b/sdc/tests/test_series.py @@ -2760,7 +2760,6 @@ def test_impl(S1, S2): S2 = pd.Series([6., 7.]) np.testing.assert_array_equal(hpat_func(S1, S2), test_impl(S1, S2)) - @skip_numba_jit def test_series_combine(self): def test_impl(S1, S2): return S1.combine(S2, lambda a, b: 2 * a + b) @@ -2770,7 +2769,6 @@ def test_impl(S1, S2): S2 = pd.Series([6.0, 21., 3.6, 5.]) pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) - @skip_numba_jit def test_series_combine_float3264(self): def test_impl(S1, S2): return S1.combine(S2, lambda a, b: 2 * a + b) @@ -2804,7 +2802,6 @@ def test_impl(S1, S2): with 
self.assertRaises(AssertionError): hpat_func(S1, S2) - @skip_numba_jit def test_series_combine_integer(self): def test_impl(S1, S2): return S1.combine(S2, lambda a, b: 2 * a + b, 16) @@ -2814,7 +2811,6 @@ def test_impl(S1, S2): S2 = pd.Series([6, 21, 3, 5]) pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) - @skip_numba_jit def test_series_combine_different_types(self): def test_impl(S1, S2): return S1.combine(S2, lambda a, b: 2 * a + b) @@ -2824,7 +2820,6 @@ def test_impl(S1, S2): S2 = pd.Series([1, 2, 3, 4, 5]) pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) - @skip_numba_jit def test_series_combine_integer_samelen(self): def test_impl(S1, S2): return S1.combine(S2, lambda a, b: 2 * a + b) @@ -2834,7 +2829,6 @@ def test_impl(S1, S2): S2 = pd.Series([6, 21, 17, -5, 4]) pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) - @skip_numba_jit def test_series_combine_samelen(self): def test_impl(S1, S2): return S1.combine(S2, lambda a, b: 2 * a + b) @@ -2844,7 +2838,6 @@ def test_impl(S1, S2): S2 = pd.Series([6.0, 21., 3.6, 5., 0.0]) pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) - @skip_numba_jit def test_series_combine_value(self): def test_impl(S1, S2): return S1.combine(S2, lambda a, b: 2 * a + b, 1237.56) @@ -2854,7 +2847,6 @@ def test_impl(S1, S2): S2 = pd.Series([6.0, 21., 3.6, 5.]) pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) - @skip_numba_jit def test_series_combine_value_samelen(self): def test_impl(S1, S2): return S1.combine(S2, lambda a, b: 2 * a + b, 1237.56) From 95d233ed9761e6512c0ab4faa7f1324a8a613f1d Mon Sep 17 00:00:00 2001 From: Rubtsowa Date: Tue, 28 Apr 2020 13:40:25 +0300 Subject: [PATCH 02/15] change comment --- examples/series/series_combine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/series/series_combine.py b/examples/series/series_combine.py index cbcea968e..dc11e598a 100644 --- 
a/examples/series/series_combine.py +++ b/examples/series/series_combine.py @@ -34,7 +34,7 @@ def series_copy(): s1 = pd.Series([1, 5, 2]) s2 = pd.Series([0, 3, 7, 8, 0]) - return s1.combine(s2, max, fill_value=0) # Expect new series of 1, 5, 7, 8, 0 + return s1.combine(s2, max, fill_value=0) # Expect series of 1, 5, 7, 8, 0 print(series_copy()) From 2104aaeec73839b9be23ec37b470c95d8393f1cb Mon Sep 17 00:00:00 2001 From: Rubtsowa Date: Wed, 13 May 2020 12:31:21 +0300 Subject: [PATCH 03/15] change example --- examples/series/series_combine.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/series/series_combine.py b/examples/series/series_combine.py index dc11e598a..de212677e 100644 --- a/examples/series/series_combine.py +++ b/examples/series/series_combine.py @@ -24,17 +24,16 @@ # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** -import numpy as np import pandas as pd from numba import njit @njit -def series_copy(): +def series_combine(): s1 = pd.Series([1, 5, 2]) s2 = pd.Series([0, 3, 7, 8, 0]) return s1.combine(s2, max, fill_value=0) # Expect series of 1, 5, 7, 8, 0 -print(series_copy()) +print(series_combine()) From aeef026002d4da4db5b8e15a11df0384bb36f5d6 Mon Sep 17 00:00:00 2001 From: Rubtsowa Date: Wed, 13 May 2020 15:36:18 +0300 Subject: [PATCH 04/15] use sdc_join_series_indexes --- sdc/datatypes/hpat_pandas_series_functions.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 58e9a1c17..7721a3fd3 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -4930,17 +4930,23 @@ def sdc_pandas_series_combine_impl(self, other, func, fill_value=None): if fill_value is None: fill_value = numpy.nan - len_val = max(len(self), len(other)) + indexes, self_indexes, 
other_indexes = sdc_join_series_indexes(self.index, other.index) + len_val = len(indexes) result = numpy.empty(len_val, self._data.dtype) - for ind in range(len_val): - val_self = self._data[ind] - val_other = other._data[ind] - if len(self) < ind + 1: + + for i in range(len_val): + if self_indexes[i] == -1: val_self = fill_value - if len(other) < ind + 1: + else: + ind_self = self_indexes[i] + val_self = self._data[ind_self] + if other_indexes[i] == -1: val_other = fill_value - result[ind] = func(val_self, val_other) + else: + ind_other = other_indexes[i] + val_other = other._data[ind_other] + result[i] = func(val_self, val_other) - return pandas.Series(result) + return pandas.Series(result, index=indexes) return sdc_pandas_series_combine_impl From f09b79284eb4949d9c8a5189b7f3ba9c08f72a39 Mon Sep 17 00:00:00 2001 From: Rubtsowa Date: Mon, 18 May 2020 11:36:46 +0300 Subject: [PATCH 05/15] change --- sdc/datatypes/hpat_pandas_series_functions.py | 31 ++++++++++++------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 7721a3fd3..627d09756 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -67,6 +67,7 @@ from sdc.functions import numpy_like from sdc.hiframes.api import isna from sdc.datatypes.hpat_pandas_groupby_functions import init_series_groupby +from sdc.utilities.prange_utils import parallel_chunks from .pandas_series_functions import apply from .pandas_series_functions import map as _map @@ -4934,18 +4935,24 @@ def sdc_pandas_series_combine_impl(self, other, func, fill_value=None): len_val = len(indexes) result = numpy.empty(len_val, self._data.dtype) - for i in range(len_val): - if self_indexes[i] == -1: - val_self = fill_value - else: - ind_self = self_indexes[i] - val_self = self._data[ind_self] - if other_indexes[i] == -1: - val_other = fill_value - else: - ind_other = 
other_indexes[i] - val_other = other._data[ind_other] - result[i] = func(val_self, val_other) + chunks = parallel_chunks(len_val) + for i in prange(len(chunks)): + chunk = chunks[i] + for j in (chunk.start, chunk.stop): + + if self_indexes[j] == -1: + val_self = fill_value + else: + ind_self = self_indexes[j] + val_self = self._data[ind_self] + + if other_indexes[j] == -1: + val_other = fill_value + else: + ind_other = other_indexes[j] + val_other = other._data[ind_other] + + result[j] = func(val_self, val_other) return pandas.Series(result, index=indexes) From b1ed1b32efc99c6cfd91b8e68cb1ba215d2c7d88 Mon Sep 17 00:00:00 2001 From: Rubtsowa Date: Mon, 25 May 2020 16:54:20 +0300 Subject: [PATCH 06/15] for from chunks, change dtype for result array --- sdc/datatypes/hpat_pandas_series_functions.py | 42 ++++++++++++------- sdc/tests/test_series.py | 9 ++-- 2 files changed, 29 insertions(+), 22 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index e519e5917..2e2f3a558 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -45,6 +45,7 @@ from numba.typed import List, Dict from numba import prange from numba.np.arraymath import get_isnan +from numba.core.registry import cpu_target from pandas.core.indexing import IndexingError import sdc @@ -4900,7 +4901,8 @@ def sdc_pandas_series_combine(self, other, func, fill_value=None): Limitations ----------- - - Only supports the case when data in series of the same type + - Only supports the case when data in series of the same type. + - With the default fill_value parameter value, the type of the resulting series will be float. Examples -------- @@ -4918,10 +4920,9 @@ def sdc_pandas_series_combine(self, other, func, fill_value=None): Pandas Series method :meth:`pandas.Series.combine` implementation. .. 
only:: developer - - Tests: python -m sdc.runtests -k sdc.tests.test_series.TestSeries.test_series_combine* + Test: python -m sdc.runtests -k sdc.tests.test_series.TestSeries.test_series_combine* """ - _func_name = 'Method Series.combine().' + _func_name = 'Method Series.combine()' ty_checker = TypeChecker(_func_name) ty_checker.check(self, SeriesType) @@ -4931,31 +4932,40 @@ def sdc_pandas_series_combine(self, other, func, fill_value=None): if not isinstance(fill_value, (types.Omitted, types.NoneType, types.Number)) and fill_value is not None: ty_checker.raise_exc(fill_value, 'number', 'fill_value') + fill_is_default = isinstance(fill_value, (types.Omitted, types.NoneType)) or fill_value is None + + sig = func.get_call_type(cpu_target.typing_context, [self.dtype, other.dtype], {}) + ret_type = sig.return_type + + fill_dtype = types.float64 if fill_is_default else fill_value + res_dtype = find_common_dtype_from_numpy_dtypes([], [ret_type, fill_dtype]) + def sdc_pandas_series_combine_impl(self, other, func, fill_value=None): - if fill_value is None: - fill_value = numpy.nan + if fill_value is not None: + _fill_value = fill_value + else: + _fill_value = numpy.nan indexes, self_indexes, other_indexes = sdc_join_series_indexes(self.index, other.index) len_val = len(indexes) - result = numpy.empty(len_val, self._data.dtype) + + result = numpy.empty(len_val, res_dtype) chunks = parallel_chunks(len_val) for i in prange(len(chunks)): chunk = chunks[i] - for j in (chunk.start, chunk.stop): + for j in (chunk.start, chunk.stop-1): + val_self = _fill_value + val_other = _fill_value - if self_indexes[j] == -1: - val_self = fill_value - else: + if self_indexes[j] != -1: ind_self = self_indexes[j] - val_self = self._data[ind_self] + val_self = self[ind_self]._data[0] - if other_indexes[j] == -1: - val_other = fill_value - else: + if other_indexes[j] != -1: ind_other = other_indexes[j] - val_other = other._data[ind_other] + val_other = other[ind_other]._data[0] result[j] = 
func(val_self, val_other) diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py index 200d919d9..f0cad01f4 100644 --- a/sdc/tests/test_series.py +++ b/sdc/tests/test_series.py @@ -2781,7 +2781,6 @@ def test_impl(S1, S2): np.float32(3), np.float32(4), np.float32(5)]) pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) - @skip_numba_jit def test_series_combine_assert1(self): def test_impl(S1, S2): return S1.combine(S2, lambda a, b: 2 * a + b) @@ -2789,10 +2788,8 @@ def test_impl(S1, S2): S1 = pd.Series([1, 2, 3]) S2 = pd.Series([6., 21., 3., 5.]) - with self.assertRaises(AssertionError): - hpat_func(S1, S2) + pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) - @skip_numba_jit def test_series_combine_assert2(self): def test_impl(S1, S2): return S1.combine(S2, lambda a, b: 2 * a + b) @@ -2800,8 +2797,7 @@ def test_impl(S1, S2): S1 = pd.Series([6., 21., 3., 5.]) S2 = pd.Series([1, 2, 3]) - with self.assertRaises(AssertionError): - hpat_func(S1, S2) + pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) def test_series_combine_integer(self): def test_impl(S1, S2): @@ -2821,6 +2817,7 @@ def test_impl(S1, S2): S2 = pd.Series([1, 2, 3, 4, 5]) pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) + @unittest.expectedFailure def test_series_combine_integer_samelen(self): def test_impl(S1, S2): return S1.combine(S2, lambda a, b: 2 * a + b) From 6d930d89f98ff61ea22f89f38e31616627548f4a Mon Sep 17 00:00:00 2001 From: Rubtsowa Date: Tue, 26 May 2020 12:22:59 +0300 Subject: [PATCH 07/15] change 'if' on 'if-else' --- sdc/datatypes/hpat_pandas_series_functions.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 2e2f3a558..141a331e4 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -4956,14 +4956,15 @@ def 
sdc_pandas_series_combine_impl(self, other, func, fill_value=None): for i in prange(len(chunks)): chunk = chunks[i] for j in (chunk.start, chunk.stop-1): - val_self = _fill_value - val_other = _fill_value - - if self_indexes[j] != -1: + if self_indexes[j] == -1: + val_self = _fill_value + else: ind_self = self_indexes[j] val_self = self[ind_self]._data[0] - if other_indexes[j] != -1: + if other_indexes[j] == -1: + val_other = _fill_value + else: ind_other = other_indexes[j] val_other = other[ind_other]._data[0] From c85fefff13508fe9541a667cfd49ab806640a9b0 Mon Sep 17 00:00:00 2001 From: Rubtsowa Date: Tue, 26 May 2020 13:03:30 +0300 Subject: [PATCH 08/15] change for --- sdc/datatypes/hpat_pandas_series_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 141a331e4..288d59286 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -4955,7 +4955,7 @@ def sdc_pandas_series_combine_impl(self, other, func, fill_value=None): chunks = parallel_chunks(len_val) for i in prange(len(chunks)): chunk = chunks[i] - for j in (chunk.start, chunk.stop-1): + for j in (chunk.start, chunk.stop-2): if self_indexes[j] == -1: val_self = _fill_value else: From 18771f28c8d14f51b0eeb2f2e6bc61520e928df6 Mon Sep 17 00:00:00 2001 From: Rubtsowa Date: Tue, 26 May 2020 13:36:23 +0300 Subject: [PATCH 09/15] change for --- sdc/datatypes/hpat_pandas_series_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 288d59286..ec49a9bba 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -4955,7 +4955,7 @@ def sdc_pandas_series_combine_impl(self, other, func, fill_value=None): chunks = parallel_chunks(len_val) for i in prange(len(chunks)): 
chunk = chunks[i] - for j in (chunk.start, chunk.stop-2): + for j in (chunk.start, chunk.stop): if self_indexes[j] == -1: val_self = _fill_value else: From 8628e319bb52e2e0dc0a30da44bd4e27cead9177 Mon Sep 17 00:00:00 2001 From: Rubtsowa Date: Tue, 26 May 2020 15:05:18 +0300 Subject: [PATCH 10/15] change for --- sdc/datatypes/hpat_pandas_series_functions.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index ec49a9bba..74d0ba9fa 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -4955,7 +4955,7 @@ def sdc_pandas_series_combine_impl(self, other, func, fill_value=None): chunks = parallel_chunks(len_val) for i in prange(len(chunks)): chunk = chunks[i] - for j in (chunk.start, chunk.stop): + for j in range(chunk.start, chunk.stop): if self_indexes[j] == -1: val_self = _fill_value else: @@ -4969,7 +4969,6 @@ def sdc_pandas_series_combine_impl(self, other, func, fill_value=None): val_other = other[ind_other]._data[0] result[j] = func(val_self, val_other) - return pandas.Series(result, index=indexes) return sdc_pandas_series_combine_impl From 6f37f53f7ea54006e4aa0723bec063b0b1256e2d Mon Sep 17 00:00:00 2001 From: Rubtsowa Date: Tue, 26 May 2020 16:32:05 +0300 Subject: [PATCH 11/15] add comment in skip test, some change in impl --- sdc/datatypes/hpat_pandas_series_functions.py | 12 ++++++------ sdc/tests/test_series.py | 2 ++ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 74d0ba9fa..56be13400 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -4956,17 +4956,17 @@ def sdc_pandas_series_combine_impl(self, other, func, fill_value=None): for i in prange(len(chunks)): chunk = chunks[i] for j in range(chunk.start, chunk.stop): - 
if self_indexes[j] == -1: + self_idx = self_indexes[j] + if self_idx == -1: val_self = _fill_value else: - ind_self = self_indexes[j] - val_self = self[ind_self]._data[0] + val_self = self[self_idx]._data[0] - if other_indexes[j] == -1: + other_idx = other_indexes[j] + if other_idx == -1: val_other = _fill_value else: - ind_other = other_indexes[j] - val_other = other[ind_other]._data[0] + val_other = other[other_idx]._data[0] result[j] = func(val_self, val_other) return pandas.Series(result, index=indexes) diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py index f0cad01f4..e44f5e6ef 100644 --- a/sdc/tests/test_series.py +++ b/sdc/tests/test_series.py @@ -2819,6 +2819,8 @@ def test_impl(S1, S2): @unittest.expectedFailure def test_series_combine_integer_samelen(self): + """Result series type `int` is expected, + `float` is returned since this is the default fill_value type""" def test_impl(S1, S2): return S1.combine(S2, lambda a, b: 2 * a + b) hpat_func = self.jit(test_impl) From f66ace305242e1261ed598f2e82607cfc801e4ab Mon Sep 17 00:00:00 2001 From: Rubtsowa Date: Wed, 27 May 2020 12:40:17 +0300 Subject: [PATCH 12/15] change if-else in 1 line --- sdc/datatypes/hpat_pandas_series_functions.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 56be13400..f728245f7 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -4942,10 +4942,7 @@ def sdc_pandas_series_combine(self, other, func, fill_value=None): def sdc_pandas_series_combine_impl(self, other, func, fill_value=None): - if fill_value is not None: - _fill_value = fill_value - else: - _fill_value = numpy.nan + _fill_value = numpy.nan if fill_value is None else fill_value indexes, self_indexes, other_indexes = sdc_join_series_indexes(self.index, other.index) len_val = len(indexes) @@ -4957,16 +4954,10 @@ def 
sdc_pandas_series_combine_impl(self, other, func, fill_value=None): chunk = chunks[i] for j in range(chunk.start, chunk.stop): self_idx = self_indexes[j] - if self_idx == -1: - val_self = _fill_value - else: - val_self = self[self_idx]._data[0] + val_self = _fill_value if self_idx == -1 else self._data[self_idx] other_idx = other_indexes[j] - if other_idx == -1: - val_other = _fill_value - else: - val_other = other[other_idx]._data[0] + val_other = _fill_value if other_idx == -1 else other._data[other_idx] result[j] = func(val_self, val_other) return pandas.Series(result, index=indexes) From e7dc1f590002638d497130ce3a7c4496301f3965 Mon Sep 17 00:00:00 2001 From: Rubtsowa Date: Wed, 27 May 2020 12:52:37 +0300 Subject: [PATCH 13/15] change test --- sdc/tests/test_series.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py index e44f5e6ef..6e9987e81 100644 --- a/sdc/tests/test_series.py +++ b/sdc/tests/test_series.py @@ -2817,17 +2817,15 @@ def test_impl(S1, S2): S2 = pd.Series([1, 2, 3, 4, 5]) pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) - @unittest.expectedFailure def test_series_combine_integer_samelen(self): - """Result series type `int` is expected, - `float` is returned since this is the default fill_value type""" def test_impl(S1, S2): return S1.combine(S2, lambda a, b: 2 * a + b) hpat_func = self.jit(test_impl) S1 = pd.Series([1, 2, 3, 4, 5]) S2 = pd.Series([6, 21, 17, -5, 4]) - pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) + # check_dtype=False due to limitation of combine impl + pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2), check_dtype=False) def test_series_combine_samelen(self): def test_impl(S1, S2): From 6c78b8ee6d2cae99f8bab7cf6dbbab85b58ae2ba Mon Sep 17 00:00:00 2001 From: Rubtsowa Date: Mon, 1 Jun 2020 16:01:14 +0300 Subject: [PATCH 14/15] change tests --- sdc/datatypes/hpat_pandas_series_functions.py | 25 +-- 
sdc/tests/test_series.py | 167 +++++++++++------- 2 files changed, 117 insertions(+), 75 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index f728245f7..9ccb95b25 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -4891,7 +4891,7 @@ def sdc_pandas_series_skew_impl(self, axis=None, skipna=None, level=None, numeri return sdc_pandas_series_skew_impl -@sdc_overload_method(SeriesType, 'combine') +@sdc_overload_method(SeriesType, 'combine', jit_options={'error_model': 'numpy'}) def sdc_pandas_series_combine(self, other, func, fill_value=None): """ Intel Scalable Dataframe Compiler User Guide @@ -4901,8 +4901,12 @@ def sdc_pandas_series_combine(self, other, func, fill_value=None): Limitations ----------- - - Only supports the case when data in series of the same type. - - With the default fill_value parameter value, the type of the resulting series will be float. + - Resulting series dtype may be wider than in pandas due to + type-stability requirements and depends on fill_value dtype + and result of series indexes alignment. + - Indexes should be strictly ascending, as inside the function + they are sorted in ascending order and the answer becomes + different from the pandas result. 
Examples -------- @@ -4949,17 +4953,14 @@ def sdc_pandas_series_combine_impl(self, other, func, fill_value=None): result = numpy.empty(len_val, res_dtype) - chunks = parallel_chunks(len_val) - for i in prange(len(chunks)): - chunk = chunks[i] - for j in range(chunk.start, chunk.stop): - self_idx = self_indexes[j] - val_self = _fill_value if self_idx == -1 else self._data[self_idx] + for i in prange(len_val): + self_idx, other_idx = self_indexes[i], other_indexes[i] + val_self = _fill_value if self_idx == -1 else self._data[self_idx] - other_idx = other_indexes[j] - val_other = _fill_value if other_idx == -1 else other._data[other_idx] + val_other = _fill_value if other_idx == -1 else other._data[other_idx] + + result[i] = func(val_self, val_other) - result[j] = func(val_self, val_other) return pandas.Series(result, index=indexes) return sdc_pandas_series_combine_impl diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py index 6e9987e81..0921aafc5 100644 --- a/sdc/tests/test_series.py +++ b/sdc/tests/test_series.py @@ -2770,89 +2770,130 @@ def test_impl(S1, S2): S2 = pd.Series([6.0, 21., 3.6, 5.]) pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) - def test_series_combine_float3264(self): + @unittest.expectedFailure #https://github.com/numba/numba/issues/5792 + def test_series_combine_div(self): def test_impl(S1, S2): - return S1.combine(S2, lambda a, b: 2 * a + b) - hpat_func = self.jit(test_impl) - - S1 = pd.Series([np.float64(1), np.float64(2), - np.float64(3), np.float64(4), np.float64(5)]) - S2 = pd.Series([np.float32(1), np.float32(2), - np.float32(3), np.float32(4), np.float32(5)]) - pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) + return S1.combine(S2, lambda a, b: a/b, 0) - def test_series_combine_assert1(self): - def test_impl(S1, S2): - return S1.combine(S2, lambda a, b: 2 * a + b) hpat_func = self.jit(test_impl) - S1 = pd.Series([1, 2, 3]) - S2 = pd.Series([6., 21., 3., 5.]) - 
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) - - def test_series_combine_assert2(self): - def test_impl(S1, S2): - return S1.combine(S2, lambda a, b: 2 * a + b) - hpat_func = self.jit(test_impl) + sizes1 = [2, 4, 5, 6, 8] + sizes2 = [1, 3, 5, 7, 9] + series_dtypes = [None, np.int64, np.float64] - S1 = pd.Series([6., 21., 3., 5.]) - S2 = pd.Series([1, 2, 3]) - pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) + for n in sizes1: + for k in sizes2: + for dtype1, dtype2 in product(series_dtypes, series_dtypes): + A = np.random.randint(-100, 100, n) + B = np.arange(k) * 2 + 1 + S1 = pd.Series(A, dtype=dtype1) + S2 = pd.Series(B, dtype=dtype2) + with self.subTest(S1=S1, S2=S2): + result = hpat_func(S1, S2) + result_ref = test_impl(S1, S2) + # check_dtype=False due to difference to pandas in some cases + pd.testing.assert_series_equal(result, result_ref, check_dtype=False) - def test_series_combine_integer(self): - def test_impl(S1, S2): - return S1.combine(S2, lambda a, b: 2 * a + b, 16) - hpat_func = self.jit(test_impl) - - S1 = pd.Series([1, 2, 3, 4, 5]) - S2 = pd.Series([6, 21, 3, 5]) - pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) - - def test_series_combine_different_types(self): + def test_series_combine_value(self): def test_impl(S1, S2): return S1.combine(S2, lambda a, b: 2 * a + b) hpat_func = self.jit(test_impl) - S1 = pd.Series([6.1, 21.2, 3.3, 5.4, 6.7]) - S2 = pd.Series([1, 2, 3, 4, 5]) - pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) + series_indexes = [[1, 2, 3, 4, 5], + [4, 5, 7, 8, 9], + [0, 1, 7, 13, 25]] + # Only ascending indexes, due to differences from pandas in some cases - def test_series_combine_integer_samelen(self): - def test_impl(S1, S2): - return S1.combine(S2, lambda a, b: 2 * a + b) - hpat_func = self.jit(test_impl) - - S1 = pd.Series([1, 2, 3, 4, 5]) - S2 = pd.Series([6, 21, 17, -5, 4]) - # check_dtype=False due to limitation of combine impl - 
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2), check_dtype=False) + n = 5 + np.random.seed(0) + A = np.random.randint(-100, 100, n) + B = np.arange(n) * 2 + 1 + + series_dtypes = [None, np.int64, np.float64] + fill_values = [None, np.nan, 4, 4.2] + for dtype1, dtype2 in product(series_dtypes, series_dtypes): + for series_index1 in series_indexes: + for series_index2 in series_indexes: + S1 = pd.Series(A, index=series_index1, dtype=dtype1) + S2 = pd.Series(B, index=series_index2, dtype=dtype2) + with self.subTest(S1=S1, S2=S2): + result = hpat_func(S1, S2) + result_ref = test_impl(S1, S2) + # check_dtype=False due to difference to pandas in some cases + pd.testing.assert_series_equal(result, result_ref, check_dtype=False) + + def test_series_combine_value_with_fill_value(self): + def test_impl(S1, S2, fill_value): + return S1.combine(S2, lambda a, b: 2 * a + b, fill_value) + hpat_func = self.jit(test_impl) + + series_indexes = [[1, 2, 3, 4, 5], + [4, 5, 7, 8, 9], + [0, 1, 7, 13, 25]] + # Only ascending indexes, due to differences from pandas in some cases + + n = 5 + np.random.seed(0) + A = np.random.randint(-100, 100, n) + B = np.arange(n) * 2 + 1 + + series_dtypes = [None, np.int64, np.float64] + fill_values = [None, np.nan, 4, 4.2] + for dtype1, dtype2, fill_value in product(series_dtypes, series_dtypes, fill_values): + for series_index1 in series_indexes: + for series_index2 in series_indexes: + S1 = pd.Series(A, index=series_index1, dtype=dtype1) + S2 = pd.Series(B, index=series_index2, dtype=dtype2) + with self.subTest(S1=S1, S2=S2, fill_value=fill_value): + result = hpat_func(S1, S2, fill_value) + result_ref = test_impl(S1, S2, fill_value) + # check_dtype=False due to difference to pandas in some cases + pd.testing.assert_series_equal(result, result_ref, check_dtype=False) - def test_series_combine_samelen(self): - def test_impl(S1, S2): - return S1.combine(S2, lambda a, b: 2 * a + b) + def test_series_combine_value_samelen(self): + def 
test_impl(S1, S2, fill_value): + return S1.combine(S2, lambda a, b: 2 * a + b, fill_value=fill_value) hpat_func = self.jit(test_impl) - S1 = pd.Series([1.0, 2., 3., 4., 5.]) - S2 = pd.Series([6.0, 21., 3.6, 5., 0.0]) - pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) + n = 11 + np.random.seed(0) + A = np.random.randint(-100, 100, n) + B = np.arange(n) * 2 + 1 + series_index = 1 + np.arange(n) + + series_dtypes = [None, np.int64, np.float64] + fill_values = [None, np.nan, 4, 4.2] + for dtype1, dtype2, fill_value in product(series_dtypes, series_dtypes, fill_values): + S1 = pd.Series(A, index=series_index, dtype=dtype1) + S2 = pd.Series(B, index=series_index, dtype=dtype2) + with self.subTest(S1=S1, S2=S2, fill_value=fill_value): + result = hpat_func(S1, S2, fill_value) + result_ref = test_impl(S1, S2, fill_value) + # check_dtype=False due to difference to pandas in some cases + pd.testing.assert_series_equal(result, result_ref, check_dtype=False) - def test_series_combine_value(self): + def test_series_combine_different_types(self): def test_impl(S1, S2): - return S1.combine(S2, lambda a, b: 2 * a + b, 1237.56) + return S1.combine(S2, lambda a, b: 2 * a + b) hpat_func = self.jit(test_impl) - S1 = pd.Series([1.0, 2., 3., 4., 5.]) - S2 = pd.Series([6.0, 21., 3.6, 5.]) - pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) + sizes1 = [2, 4, 5, 6, 8] + sizes2 = [1, 3, 5, 7, 9] + series_dtypes = [None, np.int64, np.float64] - def test_series_combine_value_samelen(self): - def test_impl(S1, S2): - return S1.combine(S2, lambda a, b: 2 * a + b, 1237.56) - hpat_func = self.jit(test_impl) + for n in sizes1: + for k in sizes2: + for dtype1, dtype2 in product(series_dtypes, series_dtypes): + A = np.random.randint(-100, 100, n) + B = np.arange(k) * 2 + 1 + S1 = pd.Series(A, dtype=dtype1) + S2 = pd.Series(B, dtype=dtype2) + with self.subTest(S1=S1, S2=S2): + result = hpat_func(S1, S2) + result_ref = test_impl(S1, S2) + # check_dtype=False due 
to difference to pandas in some cases + pd.testing.assert_series_equal(result, result_ref, check_dtype=False) - S1 = pd.Series([1.0, 2., 3., 4., 5.]) - S2 = pd.Series([6.0, 21., 3.6, 5., 0.0]) - pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) def test_series_abs1(self): def test_impl(S): From d623b898d4c303476f535bc4c4cb2b3e221d4138 Mon Sep 17 00:00:00 2001 From: Rubtsowa Date: Mon, 1 Jun 2020 16:08:27 +0300 Subject: [PATCH 15/15] fix problem with PEP8 --- sdc/tests/test_series.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py index 0921aafc5..f421e4362 100644 --- a/sdc/tests/test_series.py +++ b/sdc/tests/test_series.py @@ -2770,7 +2770,8 @@ def test_impl(S1, S2): S2 = pd.Series([6.0, 21., 3.6, 5.]) pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) - @unittest.expectedFailure #https://github.com/numba/numba/issues/5792 + @unittest.expectedFailure + # https://github.com/numba/numba/issues/5792 def test_series_combine_div(self): def test_impl(S1, S2): return S1.combine(S2, lambda a, b: a/b, 0)