Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changes to support Numpy >= 1.24 #13325

Merged
merged 22 commits into from
May 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ dependencies:
- ninja
- notebook
- numba>=0.57
- numpy>=1.21,<1.24
- numpy>=1.21
- numpydoc
- nvcc_linux-64=11.8
- nvtx>=0.2.1
Expand Down
2 changes: 1 addition & 1 deletion conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ requirements:
- pandas >=1.3,<1.6.0dev0
- cupy >=12.0.0
- numba >=0.57
- numpy >=1.21,<1.24 # Temporarily upper bound numpy to avoid overflow deprecations
- numpy >=1.21
- {{ pin_compatible('pyarrow', max_pin='x.x.x') }}
- libcudf {{ version }}
- {{ pin_compatible('rmm', max_pin='x.x') }}
Expand Down
4 changes: 2 additions & 2 deletions dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ dependencies:
# Hard pin the patch version used during the build. This must be kept
# in sync with the version pinned in get_arrow.cmake.
- pyarrow==11.0.0.*
- numpy>=1.21,<1.24 # Temporarily upper bound numpy to avoid overflow deprecations
- numpy>=1.21
build_python:
common:
- output_types: [conda, requirements, pyproject]
Expand Down Expand Up @@ -342,7 +342,7 @@ dependencies:
- output_types: [conda, requirements, pyproject]
packages:
- fsspec>=0.6.0
- numpy>=1.21,<1.24 # Temporarily upper bound numpy to avoid overflow deprecations
- numpy>=1.21
- pandas>=1.3,<1.6.0dev0
run_cudf:
common:
Expand Down
10 changes: 6 additions & 4 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -768,10 +768,12 @@ def _normalize_find_and_replace_input(
if len(col_to_normalize) == 1:
if cudf._lib.scalar._is_null_host_scalar(col_to_normalize[0]):
return normalized_column.astype(input_column_dtype)
else:
col_to_normalize_casted = input_column_dtype.type(
col_to_normalize[0]
)
if np.isinf(col_to_normalize[0]):
return normalized_column
col_to_normalize_casted = np.array(col_to_normalize[0]).astype(
input_column_dtype
)

if not np.isnan(col_to_normalize_casted) and (
col_to_normalize_casted != col_to_normalize[0]
):
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/tests/test_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,8 +398,8 @@ def test_column_view_string_slice(slc):
cudf.core.column.as_column([], dtype="uint8"),
),
(
cp.array([453], dtype="uint8"),
cudf.core.column.as_column([453], dtype="uint8"),
cp.array([255], dtype="uint8"),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it doesn't matter what value we choose here? Just wondering if it's important to use 453-256.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it matters. I used 255 just because it's `np.iinfo(uint8).max`.

cudf.core.column.as_column([255], dtype="uint8"),
),
],
)
Expand Down
6 changes: 4 additions & 2 deletions python/cudf/cudf/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def make_all_numeric_extremes_dataframe():
if np.issubdtype(np_type, np.integer):
itype = np.iinfo(np_type)
extremes = [0, +1, -1, itype.min, itype.max]
df[gdf_dtype] = np.array(extremes * 4, dtype=np_type)[:20]
df[gdf_dtype] = np.array(extremes * 4).astype(np_type)[:20]
else:
ftype = np.finfo(np_type)
extremes = [
Expand Down Expand Up @@ -324,6 +324,7 @@ def test_csv_reader_dtype_dict(use_names):
assert_eq(gdf, pdf)


@pytest.mark.filterwarnings("ignore:invalid value encountered in cast")
@pytest.mark.parametrize("use_names", [True, False])
def test_csv_reader_dtype_extremes(use_names):
# Save with the column header if not explicitly specifying a list of names
Expand Down Expand Up @@ -1433,7 +1434,7 @@ def test_csv_reader_hexadecimal_overflow(np_dtype, gdf_dtype):

gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=["hex_int"])

expected = np.array(values, dtype=np_dtype)
expected = np.array(values).astype(np_dtype)
actual = gdf["hex_int"].to_numpy()
np.testing.assert_array_equal(expected, actual)

Expand Down Expand Up @@ -2149,6 +2150,7 @@ def test_default_integer_bitwidth_partial(
)


@pytest.mark.filterwarnings("ignore:invalid value encountered in cast")
def test_default_integer_bitwidth_extremes(
cudf_extreme_numeric_dataframe, default_integer_bitwidth
):
Expand Down
12 changes: 4 additions & 8 deletions python/cudf/cudf/tests/test_feather.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,19 @@
@pytest.fixture(params=[0, 1, 10, 100])
def pdf(request):
types = NUMERIC_TYPES + ["bool"]
typer = {"col_" + val: val for val in types}
ncols = len(types)
nrows = request.param

# Create a pandas dataframe with random data of mixed types
test_pdf = pd.DataFrame(
[list(range(ncols * i, ncols * (i + 1))) for i in range(nrows)],
columns=pd.Index([f"col_{typ}" for typ in types], name="foo"),
{
f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ)
for typ in types
}
)
# Delete the name of the column index, and rename the row index
test_pdf.columns.name = None
test_pdf.index.name = "index"

# Cast all the column dtypes to objects, rename them, and then cast to
# appropriate types
test_pdf = test_pdf.astype("object").astype(typer)

# Create non-numeric categorical data otherwise may get typecasted
data = [ascii_letters[np.random.randint(0, 52)] for i in range(nrows)]
test_pdf["col_category"] = pd.Series(data, dtype="category")
Expand Down
12 changes: 4 additions & 8 deletions python/cudf/cudf/tests/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,23 +31,19 @@ def make_numeric_dataframe(nrows, dtype):
@pytest.fixture(params=[0, 1, 10, 100])
def pdf(request):
types = NUMERIC_TYPES + DATETIME_TYPES + ["bool"]
typer = {"col_" + val: val for val in types}
ncols = len(types)
nrows = request.param

# Create a pandas dataframe with random data of mixed types
test_pdf = pd.DataFrame(
[list(range(ncols * i, ncols * (i + 1))) for i in range(nrows)],
columns=pd.Index([f"col_{typ}" for typ in types], name="foo"),
{
f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ)
for typ in types
}
)
# Delete the name of the column index, and rename the row index
test_pdf.columns.name = None
test_pdf.index.name = "test_index"

# Cast all the column dtypes to objects, rename them, and then cast to
# appropriate types
test_pdf = test_pdf.astype("object").astype(typer)

return test_pdf


Expand Down
6 changes: 5 additions & 1 deletion python/cudf/cudf/tests/test_numerical.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
# Copyright (c) 2021-2023, NVIDIA CORPORATION.

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -194,6 +194,7 @@ def test_to_numeric_downcast_int(data, downcast):
assert_eq(expected, got)


@pytest.mark.filterwarnings("ignore:invalid value encountered in cast")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of applying this to the whole test, can we just wrap the pd.to_numeric call? This doesn't affect the cudf.to_numeric call, does it?

Also, should we be handling the warning conditionally? I.e., I'm assuming this happens when trying to downcast a signed type to an unsigned type or something?

@pytest.mark.parametrize(
"data",
[
Expand Down Expand Up @@ -223,6 +224,7 @@ def test_to_numeric_downcast_float(data, downcast):
assert_eq(expected, got)


@pytest.mark.filterwarnings("ignore:invalid value encountered in cast")
@pytest.mark.parametrize(
"data",
[
Expand All @@ -245,6 +247,7 @@ def test_to_numeric_downcast_large_float(data, downcast):
assert_eq(expected, got)


@pytest.mark.filterwarnings("ignore:overflow encountered in cast")
@pytest.mark.parametrize(
"data",
[
Expand Down Expand Up @@ -325,6 +328,7 @@ def test_to_numeric_downcast_string_float(data, downcast):
assert_eq(expected, got)


@pytest.mark.filterwarnings("ignore:overflow encountered in cast")
@pytest.mark.parametrize(
"data",
[
Expand Down
24 changes: 8 additions & 16 deletions python/cudf/cudf/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,14 @@ def simple_pdf(request):
"float32",
"float64",
]
typer = {"col_" + val: val for val in types}
ncols = len(types)
nrows = request.param

# Create a pandas dataframe with random data of mixed types
test_pdf = pd.DataFrame(
[list(range(ncols * i, ncols * (i + 1))) for i in range(nrows)],
columns=pd.Index([f"col_{typ}" for typ in types], name="foo"),
{
f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ)
for typ in types
},
# Need to ensure that this index is not a RangeIndex to get the
# expected round-tripping behavior from Parquet reader/writer.
index=pd.Index(list(range(nrows))),
Expand All @@ -85,10 +85,6 @@ def simple_pdf(request):
test_pdf.columns.name = None
test_pdf.index.name = "test_index"

# Cast all the column dtypes to objects, rename them, and then cast to
# appropriate types
test_pdf = test_pdf.astype("object").astype(typer)
wence- marked this conversation as resolved.
Show resolved Hide resolved

return test_pdf


Expand All @@ -115,14 +111,14 @@ def build_pdf(num_columns, day_resolution_timestamps):
"datetime64[us]",
"str",
]
typer = {"col_" + val: val for val in types}
ncols = len(types)
nrows = num_columns.param

# Create a pandas dataframe with random data of mixed types
test_pdf = pd.DataFrame(
[list(range(ncols * i, ncols * (i + 1))) for i in range(nrows)],
columns=pd.Index([f"col_{typ}" for typ in types], name="foo"),
{
f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ)
for typ in types
},
# Need to ensure that this index is not a RangeIndex to get the
# expected round-tripping behavior from Parquet reader/writer.
index=pd.Index(list(range(nrows))),
Expand All @@ -131,10 +127,6 @@ def build_pdf(num_columns, day_resolution_timestamps):
test_pdf.columns.name = None
test_pdf.index.name = "test_index"

# Cast all the column dtypes to objects, rename them, and then cast to
# appropriate types
test_pdf = test_pdf.astype(typer)
wence- marked this conversation as resolved.
Show resolved Hide resolved

# make datetime64's a little more interesting by increasing the range of
# dates note that pandas will convert these to ns timestamps, so care is
# taken to avoid overflowing a ns timestamp. There is also the ability to
Expand Down
12 changes: 6 additions & 6 deletions python/cudf/cudf/tests/test_rank.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

from itertools import chain, combinations_with_replacement, product

Expand Down Expand Up @@ -128,6 +128,7 @@ def test_rank_error_arguments(pdf):
sort_dtype_args = [np.int32, np.int64, np.float32, np.float64]


@pytest.mark.filterwarnings("ignore:invalid value encountered in cast")
@pytest.mark.parametrize(
"elem,dtype",
list(
Expand All @@ -139,13 +140,12 @@ def test_rank_error_arguments(pdf):
)
def test_series_rank_combinations(elem, dtype):
np.random.seed(0)
aa = np.fromiter(chain.from_iterable(elem), np.float64).astype(dtype)
gdf = DataFrame()
gdf["a"] = aa = np.fromiter(chain.from_iterable(elem), np.float64).astype(
dtype
)
ranked_gs = gdf["a"].rank(method="first")
df = pd.DataFrame()
gdf["a"] = aa
df["a"] = aa
ranked_gs = gdf["a"].rank(method="first")
ranked_ps = df["a"].rank(method="first")
# Check
assert_eq(ranked_ps, ranked_gs.to_pandas())
assert_eq(ranked_ps, ranked_gs)
13 changes: 10 additions & 3 deletions python/cudf/cudf/tests/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -944,8 +944,15 @@ def test_numeric_series_replace_dtype(series_dtype, replacement):
psr = pd.Series([0, 1, 2, 3, 4, 5], dtype=series_dtype)
sr = cudf.from_pandas(psr)

if sr.dtype.kind in "ui":
can_replace = np.array([replacement])[0].is_integer() and np.can_cast(
int(replacement), sr.dtype
)
else:
can_replace = np.can_cast(replacement, sr.dtype)

# Both Scalar
if sr.dtype.type(replacement) != replacement:
if not can_replace:
with pytest.raises(TypeError):
sr.replace(1, replacement)
else:
Expand All @@ -954,7 +961,7 @@ def test_numeric_series_replace_dtype(series_dtype, replacement):
assert_eq(expect, got)

# to_replace is a list, replacement is a scalar
if sr.dtype.type(replacement) != replacement:
if not can_replace:
with pytest.raises(TypeError):

sr.replace([2, 3], replacement)
Expand All @@ -974,7 +981,7 @@ def test_numeric_series_replace_dtype(series_dtype, replacement):
# Both lists of equal length
if (
np.dtype(type(replacement)).kind == "f" and sr.dtype.kind in {"i", "u"}
) or (sr.dtype.type(replacement) != replacement):
) or (not can_replace):
with pytest.raises(TypeError):
sr.replace([2, 3], [replacement, replacement])
else:
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/tests/test_sparse_df.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2022, NVIDIA CORPORATION.
# Copyright (c) 2018-2023, NVIDIA CORPORATION.

import numpy as np

Expand All @@ -7,7 +7,7 @@

def test_to_dense_array():
data = np.random.random(8)
mask = np.asarray([0b11010110], dtype=np.byte)
mask = np.asarray([0b11010110]).astype(np.byte)

sr = Series.from_masked_array(data=data, mask=mask, null_count=3)
assert sr.has_nulls
Expand Down
5 changes: 3 additions & 2 deletions python/cudf/cudf/tests/test_unaops.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
# Copyright (c) 2019-2023, NVIDIA CORPORATION.

import itertools
import operator
Expand Down Expand Up @@ -77,9 +77,10 @@ def generate_valid_scalar_unaop_combos():
return results


@pytest.mark.filterwarnings("ignore:overflow encountered in scalar negative")
@pytest.mark.parametrize("slr,dtype,op", generate_valid_scalar_unaop_combos())
def test_scalar_unary_operations(slr, dtype, op):
slr_host = cudf.dtype(dtype).type(slr)
slr_host = np.array([slr])[0].astype(cudf.dtype(dtype))
slr_device = cudf.Scalar(slr, dtype=dtype)

expect = op(slr_host)
Expand Down
3 changes: 2 additions & 1 deletion python/cudf/cudf/utils/queryutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,8 @@ def query_compile(expr):
key "args" is a sequence of name of the arguments.
"""

funcid = f"queryexpr_{np.uintp(hash(expr)):x}"
# hash returns in the semi-open interval [-2**63, 2**63)
funcid = f"queryexpr_{(hash(expr) + 2**63):x}"
# Load cache
compiled = _cache.get(funcid)
# Cache not found
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ requires = [
"cmake>=3.23.1,!=3.25.0",
"cython>=0.29,<0.30",
"ninja",
"numpy>=1.21,<1.24",
"numpy>=1.21",
"protoc-wheel",
"pyarrow==11.0.0.*",
"rmm==23.6.*",
Expand All @@ -32,7 +32,7 @@ dependencies = [
"cupy-cuda11x>=12.0.0",
"fsspec>=0.6.0",
"numba>=0.57",
"numpy>=1.21,<1.24",
"numpy>=1.21",
"nvtx>=0.2.1",
"packaging",
"pandas>=1.3,<1.6.0dev0",
Expand Down
2 changes: 1 addition & 1 deletion python/cudf_kafka/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

requires = [
"cython>=0.29,<0.30",
"numpy>=1.21,<1.24",
"numpy>=1.21",
"pyarrow==11.0.0.*",
"setuptools",
"wheel",
Expand Down
2 changes: 1 addition & 1 deletion python/dask_cudf/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ dependencies = [
"dask==2023.3.2",
"distributed==2023.3.2.1",
"fsspec>=0.6.0",
"numpy>=1.21,<1.24",
"numpy>=1.21",
"pandas>=1.3,<1.6.0dev0",
] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
classifiers = [
Expand Down