From 84d33b43716fc06a1218518808ac6edbc1d46957 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 30 Mar 2017 17:37:35 -0400 Subject: [PATCH 1/6] ARROW-736: Mixed-type object DataFrame columns should not silently coerce to an Arrow type by default --- cpp/src/arrow/python/CMakeLists.txt | 2 + cpp/src/arrow/python/builtin-convert-test.cc | 60 ++++++++++++++++++++ cpp/src/arrow/python/pandas-test.cc | 4 -- cpp/src/arrow/python/pandas_convert.cc | 47 ++++++++++++--- cpp/src/arrow/python/pandas_convert.h | 4 ++ python/pyarrow/tests/test_convert_builtin.py | 5 ++ python/pyarrow/tests/test_convert_pandas.py | 5 ++ 7 files changed, 116 insertions(+), 11 deletions(-) create mode 100644 cpp/src/arrow/python/builtin-convert-test.cc diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index 03f5afc624b34..9f26a502ad926 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -90,4 +90,6 @@ install(FILES if (ARROW_BUILD_TESTS) ADD_ARROW_TEST(pandas-test STATIC_LINK_LIBS "${ARROW_PYTHON_TEST_LINK_LIBS}") + ADD_ARROW_TEST(builtin-convert-test + STATIC_LINK_LIBS "${ARROW_PYTHON_TEST_LINK_LIBS}") endif() diff --git a/cpp/src/arrow/python/builtin-convert-test.cc b/cpp/src/arrow/python/builtin-convert-test.cc new file mode 100644 index 0000000000000..0c6e272ccf711 --- /dev/null +++ b/cpp/src/arrow/python/builtin-convert-test.cc @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gtest/gtest.h" + +#include + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/python/builtin_convert.h" +#include "arrow/table.h" +#include "arrow/test-util.h" + +namespace arrow { +namespace py { + +TEST(BuiltinConversionTest, TestMixedTypeFails) { + PyAcquireGIL lock; + MemoryPool* pool = default_memory_pool(); + std::shared_ptr arr; + + PyObject* list = PyList_New(3); + ASSERT_NE(list, nullptr); + + PyObject* str = PyUnicode_FromString("abc"); + ASSERT_NE(str, nullptr); + + PyObject* integer = PyLong_FromLong(1234L); + ASSERT_NE(integer, nullptr); + + PyObject* doub = PyFloat_FromDouble(123.0234); + ASSERT_NE(doub, nullptr); + + // This steals a reference to each object, so we don't need to decref them later + // just the list + ASSERT_EQ(PyList_SetItem(list, 0, str), 0); + ASSERT_EQ(PyList_SetItem(list, 1, integer), 0); + ASSERT_EQ(PyList_SetItem(list, 2, doub), 0); + + ASSERT_RAISES(UnknownError, ConvertPySequence(list, pool, &arr)); + + Py_DECREF(list); +} + +} // namespace py +} // namespace arrow diff --git a/cpp/src/arrow/python/pandas-test.cc b/cpp/src/arrow/python/pandas-test.cc index a4e640b83718b..58eaf323be55d 100644 --- a/cpp/src/arrow/python/pandas-test.cc +++ b/cpp/src/arrow/python/pandas-test.cc @@ -17,16 +17,12 @@ #include "gtest/gtest.h" -#include #include -#include -#include #include "arrow/array.h" #include "arrow/builder.h" #include "arrow/table.h" #include "arrow/test-util.h" -#include "arrow/type.h" #include "arrow/python/common.h" #include "arrow/python/pandas_convert.h" diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc index 68a8d7d7afcf5..0b730de6b4f20 100644 --- a/cpp/src/arrow/python/pandas_convert.cc +++ b/cpp/src/arrow/python/pandas_convert.cc @@ -167,8 +167,10 @@ Status AppendObjectStrings(int64_t objects_length, StringBuilder* builder, *have_bytes = true; const int32_t length = static_cast(PyBytes_GET_SIZE(obj)); RETURN_NOT_OK(builder->Append(PyBytes_AS_STRING(obj), length)); + } else if (PyObject_is_null(obj)) { + RETURN_NOT_OK(builder->AppendNull()); } else { - builder->AppendNull(); + return InvalidConversion(obj, "string or bytes"); } } @@ -197,8 +199,10 @@ static Status AppendObjectFixedWidthBytes(int64_t objects_length, int byte_width RETURN_NOT_OK(CheckPythonBytesAreFixedLength(obj, byte_width)); RETURN_NOT_OK( builder->Append(reinterpret_cast(PyBytes_AS_STRING(obj)))); + } else if (PyObject_is_null(obj)) { + RETURN_NOT_OK(builder->AppendNull()); } else { - builder->AppendNull(); + return InvalidConversion(obj, "string or bytes"); } } @@ -413,6 +417,28 @@ inline Status PandasConverter::ConvertData(std::shared_ptr* return Status::OK(); } +Status InvalidConversion(PyObject* obj, const std::string& expected_type_name) { + PyObject* type = PyObject_Type(obj); + RETURN_IF_PYERROR(); + + PyObject* type_name = PyObject_GetAttrString(type, "__name__"); + if (PyErr_Occurred()) { Py_DECREF(type); } + RETURN_IF_PYERROR(); + + Py_ssize_t size; + const char* bytes = PyUnicode_AsUTF8AndSize(type_name, &size); + std::string cpp_type_name(bytes, size); + + std::stringstream ss; + ss << "Python object of type " << cpp_type_name << " is not None and is not a " + << expected_type_name << " object"; + auto s = Status::TypeError(ss.str()); + + Py_DECREF(type_name); + Py_DECREF(type); + return s; +} + Status PandasConverter::ConvertDates(std::shared_ptr* out) { PyAcquireGIL lock; @@ -427,8 +453,10 @@ Status PandasConverter::ConvertDates(std::shared_ptr* out) { if (PyDate_CheckExact(obj)) { PyDateTime_Date* pydate = reinterpret_cast(obj); date_builder.Append(PyDate_to_ms(pydate)); - } else { + } else if (PyObject_is_null(obj)) { date_builder.AppendNull(); + } else { + return InvalidConversion(obj, "date"); } } return date_builder.Finish(out); @@ -483,14 +511,18 @@ Status PandasConverter::ConvertBooleans(std::shared_ptr* out) { memset(bitmap, 0, nbytes); int64_t null_count = 0; + PyObject* obj; for (int64_t i = 0; i < length_; ++i) { - if (objects[i] == Py_True) { + obj = objects[i]; + if (obj == Py_True) { BitUtil::SetBit(bitmap, i); BitUtil::SetBit(null_bitmap_data_, i); - } else if (objects[i] != Py_False) { + } else if (obj == Py_False) { + BitUtil::SetBit(null_bitmap_data_, i); + } else if (PyObject_is_null(obj)) { ++null_count; } else { - BitUtil::SetBit(null_bitmap_data_, i); + return InvalidConversion(obj, "bool"); } } @@ -551,7 +583,8 @@ Status PandasConverter::ConvertObjects(std::shared_ptr* out) { } else if (PyDate_CheckExact(objects[i])) { return ConvertDates(out); } else { - return Status::TypeError("unhandled python type"); + return InvalidConversion( + const_cast(objects[i]), "string, bool, or date"); } } } diff --git a/cpp/src/arrow/python/pandas_convert.h b/cpp/src/arrow/python/pandas_convert.h index 12644d98da156..105c1598d3936 100644 --- a/cpp/src/arrow/python/pandas_convert.h +++ b/cpp/src/arrow/python/pandas_convert.h @@ -24,6 +24,7 @@ #include #include +#include #include "arrow/util/visibility.h" @@ -73,6 +74,9 @@ ARROW_EXPORT Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, const std::shared_ptr& type, std::shared_ptr* out); +ARROW_EXPORT +Status InvalidConversion(PyObject* obj, const std::string& expected_type_name); + } // namespace py } // namespace arrow diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 99251250499d2..3309ba018628d 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -157,3 +157,8 @@ def test_list_of_int(self): assert arr.null_count == 1 assert arr.type == pyarrow.list_(pyarrow.int64()) assert arr.to_pylist() == data + + def test_mixed_types_fails(self): + data = ['a', 1, 2.0] + with self.assertRaises(pyarrow.error.ArrowException): + pyarrow.from_pylist(data) diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index f7cb47f685590..3f19b68fe0a03 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -398,3 +398,8 @@ def test_category(self): ] for values in arrays: self._check_array_roundtrip(values) + + def test_mixed_types_fails(self): + data = pd.DataFrame({'a': ['a', 1, 2.0]}) + with self.assertRaises(A.error.ArrowException): + A.Table.from_pandas(data) From b2df3e9e5d045bb0fb9a745165818f326a48732e Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 31 Mar 2017 12:58:21 -0400 Subject: [PATCH 2/6] Fix python error handling and make compatible with python27 --- cpp/src/arrow/python/pandas_convert.cc | 35 ++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc index 0b730de6b4f20..9332b9e9a5cef 100644 --- a/cpp/src/arrow/python/pandas_convert.cc +++ b/cpp/src/arrow/python/pandas_convert.cc @@ -46,6 +46,7 @@ #include "arrow/type_fwd.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" +#include "arrow/util/logging.h" #include "arrow/util/macros.h" namespace arrow { @@ -419,14 +420,37 @@ inline Status PandasConverter::ConvertData(std::shared_ptr* Status InvalidConversion(PyObject* obj, const std::string& expected_type_name) { PyObject* type = PyObject_Type(obj); - RETURN_IF_PYERROR(); + + if (type == nullptr) { + DCHECK_NE(PyErr_Occurred(), nullptr) + << "Call to PyObject_Type(...) returned NULL but PyErr_Occurred() was NULL"; + RETURN_IF_PYERROR(); + } PyObject* type_name = PyObject_GetAttrString(type, "__name__"); - if (PyErr_Occurred()) { Py_DECREF(type); } - RETURN_IF_PYERROR(); - Py_ssize_t size; - const char* bytes = PyUnicode_AsUTF8AndSize(type_name, &size); + if (type_name == nullptr) { + Py_DECREF(type); + DCHECK_NE(PyErr_Occurred(), nullptr) << "Call to PyObject_GetAttrString(...) " + "returned NULL but PyErr_Occurred() was NULL"; + RETURN_IF_PYERROR(); + } + + PyObject* bytes_obj = PyUnicode_AsUTF8String(type_name); + + if (bytes_obj == nullptr) { + Py_DECREF(type_name); + Py_DECREF(type); + DCHECK_NE(PyErr_Occurred(), nullptr) << "Call to PyObject_AsUTF8String(...) returned " + "NULL but PyErr_Occurred() was NULL"; + RETURN_IF_PYERROR(); + } + + Py_ssize_t size = PyBytes_GET_SIZE(bytes_obj); + const char* bytes = PyBytes_AS_STRING(bytes_obj); + + DCHECK_NE(bytes, nullptr) << "bytes from type(...).__name__ were null"; + std::string cpp_type_name(bytes, size); std::stringstream ss; @@ -434,6 +458,7 @@ Status InvalidConversion(PyObject* obj, const std::string& expected_type_name) { << expected_type_name << " object"; auto s = Status::TypeError(ss.str()); + Py_DECREF(bytes_obj); Py_DECREF(type_name); Py_DECREF(type); return s; From e80efe10ba491e3551580ea09163773a05bac88c Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 31 Mar 2017 23:26:06 -0400 Subject: [PATCH 3/6] Use OwnedRef instead of horror --- cpp/src/arrow/python/pandas_convert.cc | 44 +++++++------------------- 1 file changed, 12 insertions(+), 32 deletions(-) diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc index 9332b9e9a5cef..ae9b17ca9ac86 100644 --- a/cpp/src/arrow/python/pandas_convert.cc +++ b/cpp/src/arrow/python/pandas_convert.cc @@ -419,35 +419,20 @@ inline Status PandasConverter::ConvertData(std::shared_ptr* } Status InvalidConversion(PyObject* obj, const std::string& expected_type_name) { - PyObject* type = PyObject_Type(obj); + OwnedRef type(PyObject_Type(obj)); + RETURN_IF_PYERROR(); + DCHECK_NE(type.obj(), nullptr); - if (type == nullptr) { - DCHECK_NE(PyErr_Occurred(), nullptr) - << "Call to PyObject_Type(...) returned NULL but PyErr_Occurred() was NULL"; - RETURN_IF_PYERROR(); - } - - PyObject* type_name = PyObject_GetAttrString(type, "__name__"); - - if (type_name == nullptr) { - Py_DECREF(type); - DCHECK_NE(PyErr_Occurred(), nullptr) << "Call to PyObject_GetAttrString(...) " - "returned NULL but PyErr_Occurred() was NULL"; - RETURN_IF_PYERROR(); - } + OwnedRef type_name(PyObject_GetAttrString(type.obj(), "__name__")); + RETURN_IF_PYERROR(); + DCHECK_NE(type_name.obj(), nullptr); - PyObject* bytes_obj = PyUnicode_AsUTF8String(type_name); + OwnedRef bytes_obj(PyUnicode_AsUTF8String(type_name.obj())); + RETURN_IF_PYERROR(); + DCHECK_NE(bytes_obj.obj(), nullptr); - if (bytes_obj == nullptr) { - Py_DECREF(type_name); - Py_DECREF(type); - DCHECK_NE(PyErr_Occurred(), nullptr) << "Call to PyObject_AsUTF8String(...) returned " - "NULL but PyErr_Occurred() was NULL"; - RETURN_IF_PYERROR(); - } - - Py_ssize_t size = PyBytes_GET_SIZE(bytes_obj); - const char* bytes = PyBytes_AS_STRING(bytes_obj); + Py_ssize_t size = PyBytes_GET_SIZE(bytes_obj.obj()); + const char* bytes = PyBytes_AS_STRING(bytes_obj.obj()); DCHECK_NE(bytes, nullptr) << "bytes from type(...).__name__ were null"; @@ -456,12 +441,7 @@ Status InvalidConversion(PyObject* obj, const std::string& expected_type_name) { std::stringstream ss; ss << "Python object of type " << cpp_type_name << " is not None and is not a " << expected_type_name << " object"; - auto s = Status::TypeError(ss.str()); - - Py_DECREF(bytes_obj); - Py_DECREF(type_name); - Py_DECREF(type); - return s; + return Status::TypeError(ss.str()); } Status PandasConverter::ConvertDates(std::shared_ptr* out) { From 4a18014ac6e0b2ada527b1bff64f0bb7861e555a Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 31 Mar 2017 23:29:26 -0400 Subject: [PATCH 4/6] Move test --- cpp/src/arrow/python/builtin-convert-test.cc | 28 ------------------ cpp/src/arrow/python/pandas-test.cc | 31 ++++++++++++++++++++ 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/cpp/src/arrow/python/builtin-convert-test.cc b/cpp/src/arrow/python/builtin-convert-test.cc index 0c6e272ccf711..70460d56935e9 100644 --- a/cpp/src/arrow/python/builtin-convert-test.cc +++ b/cpp/src/arrow/python/builtin-convert-test.cc @@ -28,33 +28,5 @@ namespace arrow { namespace py { -TEST(BuiltinConversionTest, TestMixedTypeFails) { - PyAcquireGIL lock; - MemoryPool* pool = default_memory_pool(); - std::shared_ptr arr; - - PyObject* list = PyList_New(3); - ASSERT_NE(list, nullptr); - - PyObject* str = PyUnicode_FromString("abc"); - ASSERT_NE(str, nullptr); - - PyObject* integer = PyLong_FromLong(1234L); - ASSERT_NE(integer, nullptr); - - PyObject* doub = PyFloat_FromDouble(123.0234); - ASSERT_NE(doub, nullptr); - - // This steals a reference to each object, so we don't need to decref them later - // just the list - ASSERT_EQ(PyList_SetItem(list, 0, str), 0); - ASSERT_EQ(PyList_SetItem(list, 1, integer), 0); - ASSERT_EQ(PyList_SetItem(list, 2, doub), 0); - - ASSERT_RAISES(UnknownError, ConvertPySequence(list, pool, &arr)); - - Py_DECREF(list); -} - } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/pandas-test.cc b/cpp/src/arrow/python/pandas-test.cc index 58eaf323be55d..01e30f5a36ce8 100644 --- a/cpp/src/arrow/python/pandas-test.cc +++ b/cpp/src/arrow/python/pandas-test.cc @@ -19,6 +19,8 @@ #include +#include + #include "arrow/array.h" #include "arrow/builder.h" #include "arrow/table.h" @@ -26,6 +28,7 @@ #include "arrow/python/common.h" #include "arrow/python/pandas_convert.h" +#include "arrow/python/builtin_convert.h" namespace arrow { namespace py { @@ -61,5 +64,33 @@ TEST(PandasConversionTest, TestObjectBlockWriteFails) { Py_END_ALLOW_THREADS; } +TEST(BuiltinConversionTest, TestMixedTypeFails) { + PyAcquireGIL lock; + MemoryPool* pool = default_memory_pool(); + std::shared_ptr arr; + + OwnedRef list_ref(PyList_New(3)); + PyObject* list = list_ref.obj(); + + ASSERT_NE(list, nullptr); + + PyObject* str = PyUnicode_FromString("abc"); + ASSERT_NE(str, nullptr); + + PyObject* integer = PyLong_FromLong(1234L); + ASSERT_NE(integer, nullptr); + + PyObject* doub = PyFloat_FromDouble(123.0234); + ASSERT_NE(doub, nullptr); + + // This steals a reference to each object, so we don't need to decref them later + // just the list + ASSERT_EQ(PyList_SetItem(list, 0, str), 0); + ASSERT_EQ(PyList_SetItem(list, 1, integer), 0); + ASSERT_EQ(PyList_SetItem(list, 2, doub), 0); + + ASSERT_RAISES(UnknownError, ConvertPySequence(list, pool, &arr)); +} + } // namespace py } // namespace arrow From bcf623607cee43ceea117ba852c30798c18e5a7b Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 31 Mar 2017 23:30:22 -0400 Subject: [PATCH 5/6] Rename and move --- cpp/src/arrow/python/builtin-convert-test.cc | 32 ------------------- .../python/{pandas-test.cc => python-test.cc} | 0 2 files changed, 32 deletions(-) delete mode 100644 cpp/src/arrow/python/builtin-convert-test.cc rename cpp/src/arrow/python/{pandas-test.cc => python-test.cc} (100%) diff --git a/cpp/src/arrow/python/builtin-convert-test.cc b/cpp/src/arrow/python/builtin-convert-test.cc deleted file mode 100644 index 70460d56935e9..0000000000000 --- a/cpp/src/arrow/python/builtin-convert-test.cc +++ /dev/null @@ -1,32 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "gtest/gtest.h" - -#include - -#include "arrow/array.h" -#include "arrow/builder.h" -#include "arrow/python/builtin_convert.h" -#include "arrow/table.h" -#include "arrow/test-util.h" - -namespace arrow { -namespace py { - -} // namespace py -} // namespace arrow diff --git a/cpp/src/arrow/python/pandas-test.cc b/cpp/src/arrow/python/python-test.cc similarity index 100% rename from cpp/src/arrow/python/pandas-test.cc rename to cpp/src/arrow/python/python-test.cc From fd09def190b0f0e6e8356e15dd655c040ec83e60 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 31 Mar 2017 23:32:26 -0400 Subject: [PATCH 6/6] Update cmake --- cpp/src/arrow/python/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index 9f26a502ad926..faaad89656f92 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -88,8 +88,6 @@ install(FILES # INSTALL_RPATH "\$ORIGIN") if (ARROW_BUILD_TESTS) - ADD_ARROW_TEST(pandas-test + ADD_ARROW_TEST(python-test STATIC_LINK_LIBS "${ARROW_PYTHON_TEST_LINK_LIBS}") - ADD_ARROW_TEST(builtin-convert-test - STATIC_LINK_LIBS "${ARROW_PYTHON_TEST_LINK_LIBS}") endif()