Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-736: [Python] Mixed-type object DataFrame columns should not silently co… #465

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cpp/src/arrow/python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,6 @@ install(FILES
# INSTALL_RPATH "\$ORIGIN")

if (ARROW_BUILD_TESTS)
ADD_ARROW_TEST(pandas-test
ADD_ARROW_TEST(python-test
STATIC_LINK_LIBS "${ARROW_PYTHON_TEST_LINK_LIBS}")
endif()
52 changes: 45 additions & 7 deletions cpp/src/arrow/python/pandas_convert.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
#include "arrow/type_fwd.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit-util.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"

namespace arrow {
Expand Down Expand Up @@ -167,8 +168,10 @@ Status AppendObjectStrings(int64_t objects_length, StringBuilder* builder,
*have_bytes = true;
const int32_t length = static_cast<int32_t>(PyBytes_GET_SIZE(obj));
RETURN_NOT_OK(builder->Append(PyBytes_AS_STRING(obj), length));
} else if (PyObject_is_null(obj)) {
RETURN_NOT_OK(builder->AppendNull());
} else {
builder->AppendNull();
return InvalidConversion(obj, "string or bytes");
}
}

Expand Down Expand Up @@ -197,8 +200,10 @@ static Status AppendObjectFixedWidthBytes(int64_t objects_length, int byte_width
RETURN_NOT_OK(CheckPythonBytesAreFixedLength(obj, byte_width));
RETURN_NOT_OK(
builder->Append(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj))));
} else if (PyObject_is_null(obj)) {
RETURN_NOT_OK(builder->AppendNull());
} else {
builder->AppendNull();
return InvalidConversion(obj, "string or bytes");
}
}

Expand Down Expand Up @@ -413,6 +418,32 @@ inline Status PandasConverter::ConvertData<BooleanType>(std::shared_ptr<Buffer>*
return Status::OK();
}

Status InvalidConversion(PyObject* obj, const std::string& expected_type_name) {
OwnedRef type(PyObject_Type(obj));
RETURN_IF_PYERROR();
DCHECK_NE(type.obj(), nullptr);

OwnedRef type_name(PyObject_GetAttrString(type.obj(), "__name__"));
RETURN_IF_PYERROR();
DCHECK_NE(type_name.obj(), nullptr);

OwnedRef bytes_obj(PyUnicode_AsUTF8String(type_name.obj()));
RETURN_IF_PYERROR();
DCHECK_NE(bytes_obj.obj(), nullptr);

Py_ssize_t size = PyBytes_GET_SIZE(bytes_obj.obj());
const char* bytes = PyBytes_AS_STRING(bytes_obj.obj());

DCHECK_NE(bytes, nullptr) << "bytes from type(...).__name__ were null";

std::string cpp_type_name(bytes, size);

std::stringstream ss;
ss << "Python object of type " << cpp_type_name << " is not None and is not a "
<< expected_type_name << " object";
return Status::TypeError(ss.str());
}

Status PandasConverter::ConvertDates(std::shared_ptr<Array>* out) {
PyAcquireGIL lock;

Expand All @@ -427,8 +458,10 @@ Status PandasConverter::ConvertDates(std::shared_ptr<Array>* out) {
if (PyDate_CheckExact(obj)) {
PyDateTime_Date* pydate = reinterpret_cast<PyDateTime_Date*>(obj);
date_builder.Append(PyDate_to_ms(pydate));
} else {
} else if (PyObject_is_null(obj)) {
date_builder.AppendNull();
} else {
return InvalidConversion(obj, "date");
}
}
return date_builder.Finish(out);
Expand Down Expand Up @@ -483,14 +516,18 @@ Status PandasConverter::ConvertBooleans(std::shared_ptr<Array>* out) {
memset(bitmap, 0, nbytes);

int64_t null_count = 0;
PyObject* obj;
for (int64_t i = 0; i < length_; ++i) {
if (objects[i] == Py_True) {
obj = objects[i];
if (obj == Py_True) {
BitUtil::SetBit(bitmap, i);
BitUtil::SetBit(null_bitmap_data_, i);
} else if (objects[i] != Py_False) {
} else if (obj == Py_False) {
BitUtil::SetBit(null_bitmap_data_, i);
} else if (PyObject_is_null(obj)) {
++null_count;
} else {
BitUtil::SetBit(null_bitmap_data_, i);
return InvalidConversion(obj, "bool");
}
}

Expand Down Expand Up @@ -551,7 +588,8 @@ Status PandasConverter::ConvertObjects(std::shared_ptr<Array>* out) {
} else if (PyDate_CheckExact(objects[i])) {
return ConvertDates(out);
} else {
return Status::TypeError("unhandled python type");
return InvalidConversion(
const_cast<PyObject*>(objects[i]), "string, bool, or date");
}
}
}
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/arrow/python/pandas_convert.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <Python.h>

#include <memory>
#include <string>

#include "arrow/util/visibility.h"

Expand Down Expand Up @@ -73,6 +74,9 @@ ARROW_EXPORT
Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo,
const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out);

ARROW_EXPORT
Status InvalidConversion(PyObject* obj, const std::string& expected_type_name);

} // namespace py
} // namespace arrow

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,18 @@

#include "gtest/gtest.h"

#include <cstdint>
#include <memory>
#include <string>
#include <vector>

#include <Python.h>

#include "arrow/array.h"
#include "arrow/builder.h"
#include "arrow/table.h"
#include "arrow/test-util.h"
#include "arrow/type.h"

#include "arrow/python/common.h"
#include "arrow/python/pandas_convert.h"
#include "arrow/python/builtin_convert.h"

namespace arrow {
namespace py {
Expand Down Expand Up @@ -65,5 +64,33 @@ TEST(PandasConversionTest, TestObjectBlockWriteFails) {
Py_END_ALLOW_THREADS;
}

TEST(BuiltinConversionTest, TestMixedTypeFails) {
PyAcquireGIL lock;
MemoryPool* pool = default_memory_pool();
std::shared_ptr<Array> arr;

OwnedRef list_ref(PyList_New(3));
PyObject* list = list_ref.obj();

ASSERT_NE(list, nullptr);

PyObject* str = PyUnicode_FromString("abc");
ASSERT_NE(str, nullptr);

PyObject* integer = PyLong_FromLong(1234L);
ASSERT_NE(integer, nullptr);

PyObject* doub = PyFloat_FromDouble(123.0234);
ASSERT_NE(doub, nullptr);

// This steals a reference to each object, so we don't need to decref them later
// just the list
ASSERT_EQ(PyList_SetItem(list, 0, str), 0);
ASSERT_EQ(PyList_SetItem(list, 1, integer), 0);
ASSERT_EQ(PyList_SetItem(list, 2, doub), 0);

ASSERT_RAISES(UnknownError, ConvertPySequence(list, pool, &arr));
}

} // namespace py
} // namespace arrow
5 changes: 5 additions & 0 deletions python/pyarrow/tests/test_convert_builtin.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,3 +157,8 @@ def test_list_of_int(self):
assert arr.null_count == 1
assert arr.type == pyarrow.list_(pyarrow.int64())
assert arr.to_pylist() == data

def test_mixed_types_fails(self):
data = ['a', 1, 2.0]
with self.assertRaises(pyarrow.error.ArrowException):
pyarrow.from_pylist(data)
5 changes: 5 additions & 0 deletions python/pyarrow/tests/test_convert_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,3 +398,8 @@ def test_category(self):
]
for values in arrays:
self._check_array_roundtrip(values)

def test_mixed_types_fails(self):
data = pd.DataFrame({'a': ['a', 1, 2.0]})
with self.assertRaises(A.error.ArrowException):
A.Table.from_pandas(data)