Use argument dtype to inform coercion #17779

TomAugspurger · 2017-10-04T13:03:33Z

Master:

>>> import dask.dataframe as dd

>>> s = dd.core.Scalar({('s', 0): 10}, 's', 'i8')
>>> pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
...                     'b': [7, 6, 5, 4, 3, 2, 1]})
>>> (pdf + s).dtypes
a    object
b    object
dtype: object

Head:

>>> (pdf + s).dtypes
a    int64
b    int64
dtype: object

This is more consistent with 0.20.3, while keeping the benefits from #16821

Closes #17767

I may play around with this a bit more. I feel like this is a bit too numpy specific, for example, adding a DataFrame and a pa.Scalar still fails:

In [39]: x = pa.array([1])[0]

In [40]: df = pd.DataFrame({"A": [1, 2]})

In [41]: df + x
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/internals.py in eval(self, func, other, raise_on_error, try_cast, mgr)
   1292         try:
-> 1293             values, values_mask, other, other_mask = self._try_coerce_args(
   1294                 transf(values), other)

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/internals.py in _try_coerce_args(self, values, other)
    677             raise TypeError("cannot convert {} to an {}".format(
--> 678                 type(other).__name__,
    679                 type(self).__name__.lower().replace('Block', '')))

TypeError: cannot convert Int64Value to an intblock

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/ops.py in na_op(x, y)
   1199             result = expressions.evaluate(op, str_rep, x, y,
-> 1200                                           raise_on_error=True, **eval_kwargs)
   1201         except TypeError:

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/computation/expressions.py in evaluate(op, op_str, a, b, raise_on_error, use_numexpr, **eval_kwargs)
    210         return _evaluate(op, op_str, a, b, raise_on_error=raise_on_error,
--> 211                          **eval_kwargs)
    212     return _evaluate_standard(op, op_str, a, b, raise_on_error=raise_on_error)

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/computation/expressions.py in _evaluate_standard(op, op_str, a, b, raise_on_error, **eval_kwargs)
     63     with np.errstate(all='ignore'):
---> 64         return op(a, b)
     65

TypeError: unsupported operand type(s) for +: 'int' and 'pyarrow.lib.Int64Value'

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/internals.py in eval(self, func, other, raise_on_error, try_cast, mgr)
   1351         try:
-> 1352             with np.errstate(all='ignore'):
   1353                 result = get_result(other)

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/internals.py in get_result(other)
   1320                 result = func(values, other)
-> 1321             else:
   1322                 result = func(values, other)

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/ops.py in na_op(x, y)
   1225                     with np.errstate(all='ignore'):
-> 1226                         result[mask] = op(xrav, y)
   1227             else:

TypeError: unsupported operand type(s) for +: 'int' and 'pyarrow.lib.Int64Value'

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
<ipython-input-41-a01bb1dd67a6> in <module>()
----> 1 df + x

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/ops.py in f(self, other, axis, level, fill_value)
   1263                 self = self.fillna(fill_value)
   1264
-> 1265             return self._combine_const(other, na_op)
   1266
   1267     f.__name__ = name

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/frame.py in _combine_const(self, other, func, raise_on_error, try_cast)
   3836         new_data = self._data.eval(func=func, other=other,
   3837                                    raise_on_error=raise_on_error,
-> 3838                                    try_cast=try_cast)
   3839         return self._constructor(new_data)
   3840

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/internals.py in eval(self, **kwargs)
   3375         return self.apply('where', **kwargs)
   3376
-> 3377     def eval(self, **kwargs):
   3378         return self.apply('eval', **kwargs)
   3379

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
   3269                                                  copy=align_copy)
   3270
-> 3271             kwargs['mgr'] = self
   3272             applied = getattr(b, f)(**kwargs)
   3273             result_blocks = _extend_blocks(applied, result_blocks)

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/internals.py in eval(self, func, other, raise_on_error, try_cast, mgr)
   1296             block = self.coerce_to_target_dtype(orig_other)
   1297             return block.eval(func, orig_other,
-> 1298                               raise_on_error=raise_on_error,
   1299                               try_cast=try_cast, mgr=mgr)
   1300

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/internals.py in eval(self, func, other, raise_on_error, try_cast, mgr)
   1357         except ValueError as detail:
   1358             raise
-> 1359         except Exception as detail:
   1360             result = handle_error()
   1361

~/Envs/pandas-dev/lib/python3.6/site-packages/pandas/pandas/core/internals.py in handle_error()
   1340             if raise_on_error:
   1341                 # The 'detail' variable is defined in outer scope.
-> 1342                 raise TypeError('Could not operate %s with block values %s' %
   1343                                 (repr(other), str(detail)))  # noqa
   1344             else:

TypeError: Could not operate 1 with block values unsupported operand type(s) for +: 'int' and 'pyarrow.lib.Int64Value'

Do we want that to work?

TomAugspurger · 2017-10-04T13:09:15Z

cc @wesm for the arrow part.

The background is that dask has scalar objects that duck-type numpy scalars. We'd like df + scalar to know that scalar has a compatible dtype, and so doesn't have to cast df to object dtype. My current fix is pretty specific to objects having a .dtype attribute that's similar to a numpy.dtype.

jreback

seems reasonable if all passing

maybe want to check some perf
but i doubt it affect things

jreback · 2017-10-04T13:38:22Z

pandas/core/dtypes/cast.py

@@ -483,6 +483,39 @@ def infer_dtype_from_array(arr, pandas_dtype=False):
    return arr.dtype, arr


+def _maybe_infer_dtype_type(element):


deprivatize (no leading _)

Master: ```python >>> import dask.dataframe as dd >>> s = dd.core.Scalar({('s', 0): 10}, 's', 'i8') >>> pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7], ... 'b': [7, 6, 5, 4, 3, 2, 1]}) >>> (pdf + s).dtypes a object b object dtype: object Head: ``` >>> (pdf + s).dtypes a int64 b int64 dtype: object ``` This is more consistent with 0.20.3, while still most of the changes in pandas-dev#16821 Closes pandas-dev#17767

codecov · 2017-10-04T22:33:47Z

Codecov Report

Merging #17779 into master will decrease coverage by 0.01%.
The diff coverage is 100%.

@@            Coverage Diff             @@
##           master   #17779      +/-   ##
==========================================
- Coverage   91.24%   91.22%   -0.02%     
==========================================
  Files         163      163              
  Lines       49916    49923       +7     
==========================================
- Hits        45544    45542       -2     
- Misses       4372     4381       +9

Flag	Coverage Δ
#multiple	`89.02% <100%> (ø)`	⬆️
#single	`40.26% <46.42%> (-0.05%)`	⬇️

Impacted Files	Coverage Δ
pandas/core/dtypes/cast.py	`87.99% <100%> (+0.17%)`	⬆️
pandas/core/internals.py	`94.37% <100%> (-0.01%)`	⬇️
pandas/io/gbq.py	`25% <0%> (-58.34%)`	⬇️
pandas/core/frame.py	`97.73% <0%> (-0.1%)`	⬇️
pandas/core/indexes/period.py	`92.78% <0%> (+0.01%)`	⬆️
pandas/io/excel.py	`80.4% <0%> (+0.02%)`	⬆️

Continue to review full report at Codecov.

Legend - Click here to learn more
Δ = absolute <relative> (impact), ø = not affected, ? = missing data
Powered by Codecov. Last update 48d0460...7ed82be. Read the comment docs.

jreback · 2017-10-04T23:49:02Z

pandas/tests/internals/test_internals.py

+        (1.0, 'f8'),
+        (1j, 'complex128'),
+        (True, 'bool'),
+        # (np.timedelta64(20, 'ns'), '<m8[ns]'),


is this commented for a reason?

Fixed, forgot to uncomment it while testing.

jreback

lgtm. just a small comment. merge when satisfied.

wesm · 2017-10-05T00:33:24Z

@TomAugspurger no one's put in any effort on making the pyarrow scalar types usable in any kind of analytical setting, they're mostly for display purposes and coercion to Python types for the moment. We will need to address it at some point in the not-too-distant future

TomAugspurger · 2017-10-05T01:58:39Z

We will need to address it at some point in the not-too-distant future

Ok. I don't think the changes here do anything to help with that, but they shouldn't make anything more difficult.

jreback · 2017-10-05T10:25:08Z

pandas/tests/internals/test_internals.py

+        (1.0, 'f8'),
+        (1j, 'complex128'),
+        (True, 'bool'),
+        (np.timedelta64(20, 'ns'), '<m8[ns]'),


actually ought to add Timedelta/Timestamp, NaT, np.nan, Period

The missing data ones are already covered in test_nanops. Timedelta / timestamps in handled in tests/scalar/. Period doesn't support binops. So we should be good.

* Use argument dtype to inform coercion Master: ```python >>> import dask.dataframe as dd >>> s = dd.core.Scalar({('s', 0): 10}, 's', 'i8') >>> pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7], ... 'b': [7, 6, 5, 4, 3, 2, 1]}) >>> (pdf + s).dtypes a object b object dtype: object Head: ``` >>> (pdf + s).dtypes a int64 b int64 dtype: object ``` This is more consistent with 0.20.3, while still most of the changes in pandas-dev#16821 Closes pandas-dev#17767 * Compat for older numpy where bool(dtype) is False * Added timedelta

TomAugspurger added API Design Dtype Conversions Unexpected or buggy dtype conversions labels Oct 4, 2017

TomAugspurger added this to the 0.21.0 milestone Oct 4, 2017

jreback approved these changes Oct 4, 2017

View reviewed changes

TomAugspurger force-pushed the can_hold_type-coercion branch from a3a2ee2 to b34c61f Compare October 4, 2017 18:24

Compat for older numpy where bool(dtype) is False

fe4fdb6

jreback reviewed Oct 4, 2017

View reviewed changes

jreback approved these changes Oct 4, 2017

View reviewed changes

Added timedelta

7ed82be

jreback reviewed Oct 5, 2017

View reviewed changes

TomAugspurger merged commit 6773694 into pandas-dev:master Oct 5, 2017

TomAugspurger deleted the can_hold_type-coercion branch October 5, 2017 12:43

TomAugspurger mentioned this pull request Oct 5, 2017

Pandas 0.21 compatability dask/dask#2737

Merged

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Use argument dtype to inform coercion #17779

Use argument dtype to inform coercion #17779

TomAugspurger commented Oct 4, 2017 •

edited

Loading

TomAugspurger commented Oct 4, 2017

jreback left a comment

jreback Oct 4, 2017

codecov bot commented Oct 4, 2017 •

edited

Loading

jreback Oct 4, 2017

TomAugspurger Oct 5, 2017

jreback left a comment

wesm commented Oct 5, 2017

TomAugspurger commented Oct 5, 2017

jreback Oct 5, 2017

TomAugspurger Oct 5, 2017

		@@ -483,6 +483,39 @@ def infer_dtype_from_array(arr, pandas_dtype=False):
		return arr.dtype, arr


		def _maybe_infer_dtype_type(element):

Use argument dtype to inform coercion #17779

Use argument dtype to inform coercion #17779

Conversation

TomAugspurger commented Oct 4, 2017 • edited Loading

TomAugspurger commented Oct 4, 2017

jreback left a comment

Choose a reason for hiding this comment

jreback Oct 4, 2017

Choose a reason for hiding this comment

codecov bot commented Oct 4, 2017 • edited Loading

Codecov Report

jreback Oct 4, 2017

Choose a reason for hiding this comment

TomAugspurger Oct 5, 2017

Choose a reason for hiding this comment

jreback left a comment

Choose a reason for hiding this comment

wesm commented Oct 5, 2017

TomAugspurger commented Oct 5, 2017

jreback Oct 5, 2017

Choose a reason for hiding this comment

TomAugspurger Oct 5, 2017

Choose a reason for hiding this comment

TomAugspurger commented Oct 4, 2017 •

edited

Loading

codecov bot commented Oct 4, 2017 •

edited

Loading