-
Notifications
You must be signed in to change notification settings - Fork 61
Series combine #821
base: master
Are you sure you want to change the base?
Series combine #821
Changes from 15 commits
7ec05f3
b576e03
95d233e
2b95e9d
2104aae
aeef026
f09b792
fafcaed
b1ed1b3
4fdb369
6d930d8
c85feff
18771f2
8628e31
6f37f53
f66ace3
9fc55a7
e7dc1f5
6c78b8e
d623b89
ecd3dba
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
# ***************************************************************************** | ||
# Copyright (c) 2020, Intel Corporation All rights reserved. | ||
# | ||
# Redistribution and use in source and binary forms, with or without | ||
# modification, are permitted provided that the following conditions are met: | ||
# | ||
# Redistributions of source code must retain the above copyright notice, | ||
# this list of conditions and the following disclaimer. | ||
# | ||
# Redistributions in binary form must reproduce the above copyright notice, | ||
# this list of conditions and the following disclaimer in the documentation | ||
# and/or other materials provided with the distribution. | ||
# | ||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, | ||
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | ||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR | ||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | ||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | ||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; | ||
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, | ||
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR | ||
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, | ||
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
# ***************************************************************************** | ||
|
||
import pandas as pd | ||
from numba import njit | ||
|
||
|
||
@njit | ||
def series_combine(): | ||
s1 = pd.Series([1, 5, 2]) | ||
s2 = pd.Series([0, 3, 7, 8, 0]) | ||
|
||
return s1.combine(s2, max, fill_value=0) # Expect series of 1, 5, 7, 8, 0 | ||
|
||
|
||
print(series_combine()) |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -45,6 +45,7 @@ | |||||
from numba.typed import List, Dict | ||||||
from numba import prange | ||||||
from numba.np.arraymath import get_isnan | ||||||
from numba.core.registry import cpu_target | ||||||
from pandas.core.indexing import IndexingError | ||||||
|
||||||
import sdc | ||||||
|
@@ -69,6 +70,7 @@ | |||||
from sdc.functions import numpy_like | ||||||
from sdc.hiframes.api import isna | ||||||
from sdc.datatypes.hpat_pandas_groupby_functions import init_series_groupby | ||||||
from sdc.utilities.prange_utils import parallel_chunks | ||||||
|
||||||
from .pandas_series_functions import apply | ||||||
from .pandas_series_functions import map as _map | ||||||
|
@@ -4887,3 +4889,86 @@ def sdc_pandas_series_skew_impl(self, axis=None, skipna=None, level=None, numeri | |||||
return numpy_like.skew(self._data) | ||||||
|
||||||
return sdc_pandas_series_skew_impl | ||||||
|
||||||
|
||||||
@sdc_overload_method(SeriesType, 'combine') | ||||||
def sdc_pandas_series_combine(self, other, func, fill_value=None): | ||||||
""" | ||||||
Intel Scalable Dataframe Compiler User Guide | ||||||
******************************************** | ||||||
|
||||||
Pandas API: pandas.Series.combine | ||||||
|
||||||
Limitations | ||||||
----------- | ||||||
- Only supports the case when data in series of the same type. | ||||||
- With the default fill_value parameter value, the type of the resulting series will be float. | ||||||
|
||||||
Examples | ||||||
-------- | ||||||
.. literalinclude:: ../../../examples/series/series_combine.py | ||||||
:language: python | ||||||
:lines: 27- | ||||||
:caption: Combined the Series with a Series according to func. | ||||||
:name: ex_series_combine | ||||||
|
||||||
.. command-output:: python ./series/series_combine.py | ||||||
:cwd: ../../../examples | ||||||
|
||||||
Intel Scalable Dataframe Compiler Developer Guide | ||||||
************************************************* | ||||||
Pandas Series method :meth:`pandas.Series.combine` implementation. | ||||||
|
||||||
.. only:: developer | ||||||
Test: python -m sdc.runtests -k sdc.tests.test_series.TestSeries.test_series_combine* | ||||||
""" | ||||||
_func_name = 'Method Series.combine()' | ||||||
|
||||||
ty_checker = TypeChecker(_func_name) | ||||||
ty_checker.check(self, SeriesType) | ||||||
|
||||||
ty_checker.check(other, SeriesType) | ||||||
|
||||||
if not isinstance(fill_value, (types.Omitted, types.NoneType, types.Number)) and fill_value is not None: | ||||||
ty_checker.raise_exc(fill_value, 'number', 'fill_value') | ||||||
|
||||||
fill_is_default = isinstance(fill_value, (types.Omitted, types.NoneType)) or fill_value is None | ||||||
|
||||||
sig = func.get_call_type(cpu_target.typing_context, [self.dtype, other.dtype], {}) | ||||||
ret_type = sig.return_type | ||||||
|
||||||
fill_dtype = types.float64 if fill_is_default else fill_value | ||||||
res_dtype = find_common_dtype_from_numpy_dtypes([], [ret_type, fill_dtype]) | ||||||
|
||||||
def sdc_pandas_series_combine_impl(self, other, func, fill_value=None): | ||||||
|
||||||
if fill_value is not None: | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
_fill_value = fill_value | ||||||
else: | ||||||
_fill_value = numpy.nan | ||||||
|
||||||
indexes, self_indexes, other_indexes = sdc_join_series_indexes(self.index, other.index) | ||||||
len_val = len(indexes) | ||||||
|
||||||
result = numpy.empty(len_val, res_dtype) | ||||||
|
||||||
chunks = parallel_chunks(len_val) | ||||||
for i in prange(len(chunks)): | ||||||
chunk = chunks[i] | ||||||
for j in range(chunk.start, chunk.stop): | ||||||
self_idx = self_indexes[j] | ||||||
if self_idx == -1: | ||||||
val_self = _fill_value | ||||||
else: | ||||||
val_self = self[self_idx]._data[0] | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. self_idx (and other_idx) is position in the Series, not the index, so instead of using getitem on a Series, that performs index lookup and returns a Series, so that you have to take _data[0] from it, you can just write:
Suggested change
|
||||||
|
||||||
other_idx = other_indexes[j] | ||||||
if other_idx == -1: | ||||||
val_other = _fill_value | ||||||
else: | ||||||
val_other = other[other_idx]._data[0] | ||||||
|
||||||
result[j] = func(val_self, val_other) | ||||||
return pandas.Series(result, index=indexes) | ||||||
|
||||||
return sdc_pandas_series_combine_impl |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2761,7 +2761,6 @@ def test_impl(S1, S2): | |
S2 = pd.Series([6., 7.]) | ||
np.testing.assert_array_equal(hpat_func(S1, S2), test_impl(S1, S2)) | ||
|
||
@skip_numba_jit | ||
def test_series_combine(self): | ||
def test_impl(S1, S2): | ||
return S1.combine(S2, lambda a, b: 2 * a + b) | ||
|
@@ -2771,7 +2770,6 @@ def test_impl(S1, S2): | |
S2 = pd.Series([6.0, 21., 3.6, 5.]) | ||
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) | ||
|
||
@skip_numba_jit | ||
def test_series_combine_float3264(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test has incorrect code, which should be corrected probably:
S2.dtype will be float64 on Win, not float32. Moreover, series dtype should be specified this way:
|
||
def test_impl(S1, S2): | ||
return S1.combine(S2, lambda a, b: 2 * a + b) | ||
|
@@ -2783,29 +2781,24 @@ def test_impl(S1, S2): | |
np.float32(3), np.float32(4), np.float32(5)]) | ||
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) | ||
|
||
@skip_numba_jit | ||
def test_series_combine_assert1(self): | ||
def test_impl(S1, S2): | ||
return S1.combine(S2, lambda a, b: 2 * a + b) | ||
hpat_func = self.jit(test_impl) | ||
|
||
S1 = pd.Series([1, 2, 3]) | ||
S2 = pd.Series([6., 21., 3., 5.]) | ||
with self.assertRaises(AssertionError): | ||
hpat_func(S1, S2) | ||
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. General comment for tests: not all combinations of input series dtypes and fill_value are tested e.g. the one I mentioned before - where float fill_value is assigned to otherwise int series. There are no tests with series with non-default indexes (we refer to samelen, but it's not fully correct - series may have same len, but not same indexes), and no tests for checking func impact on result dtype, so it's hard to see from such tests what's really tested and what is not. So the suggestion is to organize tests in a different manner:
New test: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For example, test 1 can look like this (it can also be split into two: one when we use check_dtype=False and one when we don't):
|
||
|
||
@skip_numba_jit | ||
def test_series_combine_assert2(self): | ||
def test_impl(S1, S2): | ||
return S1.combine(S2, lambda a, b: 2 * a + b) | ||
hpat_func = self.jit(test_impl) | ||
|
||
S1 = pd.Series([6., 21., 3., 5.]) | ||
S2 = pd.Series([1, 2, 3]) | ||
with self.assertRaises(AssertionError): | ||
hpat_func(S1, S2) | ||
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) | ||
|
||
@skip_numba_jit | ||
def test_series_combine_integer(self): | ||
def test_impl(S1, S2): | ||
return S1.combine(S2, lambda a, b: 2 * a + b, 16) | ||
|
@@ -2815,7 +2808,6 @@ def test_impl(S1, S2): | |
S2 = pd.Series([6, 21, 3, 5]) | ||
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) | ||
|
||
@skip_numba_jit | ||
def test_series_combine_different_types(self): | ||
def test_impl(S1, S2): | ||
return S1.combine(S2, lambda a, b: 2 * a + b) | ||
|
@@ -2825,8 +2817,10 @@ def test_impl(S1, S2): | |
S2 = pd.Series([1, 2, 3, 4, 5]) | ||
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) | ||
|
||
@skip_numba_jit | ||
@unittest.expectedFailure | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add comment why the test is skipped. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Rubtsowa No need to skip the test if that's how impl is intended to work. Use check_dtype=False in assert_series_equal and add a comment just before this check to refer to SDC Limitation. |
||
def test_series_combine_integer_samelen(self): | ||
"""Result series type `int` is expected, | ||
`float` is returned since this is the default fill_value type""" | ||
def test_impl(S1, S2): | ||
return S1.combine(S2, lambda a, b: 2 * a + b) | ||
hpat_func = self.jit(test_impl) | ||
|
@@ -2835,7 +2829,6 @@ def test_impl(S1, S2): | |
S2 = pd.Series([6, 21, 17, -5, 4]) | ||
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) | ||
|
||
@skip_numba_jit | ||
def test_series_combine_samelen(self): | ||
def test_impl(S1, S2): | ||
return S1.combine(S2, lambda a, b: 2 * a + b) | ||
|
@@ -2845,7 +2838,6 @@ def test_impl(S1, S2): | |
S2 = pd.Series([6.0, 21., 3.6, 5., 0.0]) | ||
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) | ||
|
||
@skip_numba_jit | ||
def test_series_combine_value(self): | ||
def test_impl(S1, S2): | ||
return S1.combine(S2, lambda a, b: 2 * a + b, 1237.56) | ||
|
@@ -2855,7 +2847,6 @@ def test_impl(S1, S2): | |
S2 = pd.Series([6.0, 21., 3.6, 5.]) | ||
pd.testing.assert_series_equal(hpat_func(S1, S2), test_impl(S1, S2)) | ||
|
||
@skip_numba_jit | ||
def test_series_combine_value_samelen(self): | ||
def test_impl(S1, S2): | ||
return S1.combine(S2, lambda a, b: 2 * a + b, 1237.56) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This line is not correct - impl handles all cases. For the next line we need exact definition of difference to pandas, e.g: