
Commit

Merge branch 'branch-23.06' into refactor-tests-directory-structure
shwina authored Apr 7, 2023
2 parents c4e7438 + f328b64 commit d77a58a
Showing 20 changed files with 916 additions and 297 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pr.yaml
@@ -134,5 +134,5 @@ jobs:
 build_type: pull-request
 package-name: dask_cudf
 # Install the cudf we just built, and also test against latest dask/distributed/dask-cuda.
-test-before: "RAPIDS_PY_WHEEL_NAME=cudf_cu11 rapids-download-wheels-from-s3 ./local-cudf-dep && python -m pip install --no-deps ./local-cudf-dep/cudf*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
+test-before: "RAPIDS_PY_WHEEL_NAME=cudf_cu11 rapids-download-wheels-from-s3 ./local-cudf-dep && python -m pip install --no-deps ./local-cudf-dep/cudf*.whl && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
 test-unittest: "python -m pytest -v -n 8 ./python/dask_cudf/dask_cudf/tests"
2 changes: 1 addition & 1 deletion .github/workflows/test.yaml
@@ -98,5 +98,5 @@ jobs:
 sha: ${{ inputs.sha }}
 package-name: dask_cudf
 # Test against latest dask/distributed/dask-cuda.
-test-before: "pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
+test-before: "pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
 test-unittest: "python -m pytest -v -n 8 ./python/dask_cudf/dask_cudf/tests"
5 changes: 3 additions & 2 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -21,9 +21,10 @@ dependencies:
 - cupy>=9.5.0,<12.0.0a0
 - cxx-compiler
 - cython>=0.29,<0.30
+- dask-core==2023.3.2
 - dask-cuda==23.6.*
-- dask>=2023.1.1
-- distributed>=2023.1.1
+- dask==2023.3.2
+- distributed==2023.3.2.1
 - dlpack>=0.5,<0.6.0a0
 - doxygen=1.8.20
 - fastavro>=0.22.9
5 changes: 3 additions & 2 deletions conda/recipes/custreamz/meta.yaml
@@ -41,8 +41,9 @@ requirements:
   - python
   - streamz
   - cudf ={{ version }}
-  - dask >=2023.1.1
-  - distributed >=2023.1.1
+  - dask ==2023.3.2
+  - dask-core ==2023.3.2
+  - distributed ==2023.3.2.1
   - python-confluent-kafka >=1.7.0,<1.8.0a0
   - cudf_kafka ={{ version }}

10 changes: 6 additions & 4 deletions conda/recipes/dask-cudf/meta.yaml
@@ -37,14 +37,16 @@ requirements:
 host:
   - python
   - cudf ={{ version }}
-  - dask >=2023.1.1
-  - distributed >=2023.1.1
+  - dask ==2023.3.2
+  - dask-core ==2023.3.2
+  - distributed ==2023.3.2.1
   - cudatoolkit ={{ cuda_version }}
 run:
   - python
   - cudf ={{ version }}
-  - dask >=2023.1.1
-  - distributed >=2023.1.1
+  - dask ==2023.3.2
+  - dask-core ==2023.3.2
+  - distributed ==2023.3.2.1
   - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }}

 test:
8 changes: 4 additions & 4 deletions conda/recipes/dask-cudf/run_test.sh
@@ -18,18 +18,18 @@ if [ "${ARCH}" = "aarch64" ]; then
 fi

 # Dask & Distributed option to install main(nightly) or `conda-forge` packages.
-export INSTALL_DASK_MAIN=1
+export INSTALL_DASK_MAIN=0

 # Dask version to install when `INSTALL_DASK_MAIN=0`
-export DASK_STABLE_VERSION="2023.1.1"
+export DASK_STABLE_VERSION="2023.3.2"

 # Install the conda-forge or nightly version of dask and distributed
 if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then
   rapids-logger "rapids-mamba-retry install -c dask/label/dev 'dask/label/dev::dask' 'dask/label/dev::distributed'"
   rapids-mamba-retry install -c dask/label/dev "dask/label/dev::dask" "dask/label/dev::distributed"
 else
-  rapids-logger "rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall"
-  rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall
+  rapids-logger "rapids-mamba-retry install conda-forge::dask==${DASK_STABLE_VERSION} conda-forge::distributed==2023.3.2.1 conda-forge::dask-core==2023.3.2 --force-reinstall"
+  rapids-mamba-retry install conda-forge::dask==${DASK_STABLE_VERSION} conda-forge::distributed=="2023.3.2.1" conda-forge::dask-core=="2023.3.2" --force-reinstall
 fi

 logger "python -c 'import dask_cudf'"
4 changes: 2 additions & 2 deletions cpp/benchmarks/io/orc/orc_reader_options.cpp
@@ -33,7 +33,7 @@ constexpr int64_t data_size = 512 << 20;
 // Each call reads roughly equal amounts of data
 constexpr int32_t chunked_read_num_chunks = 8;

-std::vector<std::string> get_col_names(cudf::io::source_info const& source)
+std::vector<std::string> get_top_level_col_names(cudf::io::source_info const& source)
 {
   auto const top_lvl_cols = cudf::io::read_orc_metadata(source).schema().root().children();
   std::vector<std::string> col_names;
@@ -79,7 +79,7 @@ void BM_orc_read_varying_options(nvbench::state& state,
   cudf::io::write_orc(options);

   auto const cols_to_read =
-    select_column_names(get_col_names(source_sink.make_source_info()), ColSelection);
+    select_column_names(get_top_level_col_names(source_sink.make_source_info()), ColSelection);
   cudf::io::orc_reader_options read_options =
     cudf::io::orc_reader_options::builder(source_sink.make_source_info())
       .columns(cols_to_read)
5 changes: 2 additions & 3 deletions cpp/benchmarks/io/parquet/parquet_reader_options.cpp
@@ -30,7 +30,7 @@
 constexpr std::size_t data_size = 512 << 20;
 constexpr std::size_t row_group_size = 128 << 20;

-std::vector<std::string> get_col_names(cudf::io::source_info const& source)
+std::vector<std::string> get_top_level_col_names(cudf::io::source_info const& source)
 {
   cudf::io::parquet_reader_options const read_options =
     cudf::io::parquet_reader_options::builder(source);
@@ -39,7 +39,6 @@ std::vector<std::string> get_col_names(cudf::io::source_info const& source)
   std::vector<std::string> names;
   names.reserve(schema.size());
   std::transform(schema.cbegin(), schema.cend(), std::back_inserter(names), [](auto const& c) {
-    CUDF_EXPECTS(c.children.empty(), "nested types are not supported");
     return c.name;
   });
   return names;
@@ -81,7 +80,7 @@ void BM_parquet_read_options(nvbench::state& state,
   cudf::io::write_parquet(options);

   auto const cols_to_read =
-    select_column_names(get_col_names(source_sink.make_source_info()), ColSelection);
+    select_column_names(get_top_level_col_names(source_sink.make_source_info()), ColSelection);
   cudf::io::parquet_reader_options read_options =
     cudf::io::parquet_reader_options::builder(source_sink.make_source_info())
       .columns(cols_to_read)
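
Both benchmark helpers are renamed to state their contract precisely: they return only the names of the columns directly under the schema root, and the Parquet helper no longer rejects files containing nested children. A minimal sketch of the resulting behavior, using a hypothetical column_info struct in place of the real cudf::io metadata types:

#include <algorithm>
#include <iterator>
#include <string>
#include <vector>

// Hypothetical stand-in for a reader schema node; the actual benchmarks walk
// cudf::io ORC/Parquet metadata instead.
struct column_info {
  std::string name;
  std::vector<column_info> children;  // non-empty for nested (struct/list) types
};

// Collect only the names of the root's immediate children. Nested children
// are ignored rather than rejected, matching the removal of the
// CUDF_EXPECTS(c.children.empty(), ...) check above.
std::vector<std::string> get_top_level_col_names(std::vector<column_info> const& schema)
{
  std::vector<std::string> names;
  names.reserve(schema.size());
  std::transform(schema.cbegin(), schema.cend(), std::back_inserter(names),
                 [](auto const& c) { return c.name; });
  return names;
}

int main()
{
  std::vector<column_info> schema{{"id", {}}, {"s", {{"a", {}}, {"b", {}}}}};
  auto const names = get_top_level_col_names(schema);  // {"id", "s"}
  return names.size() == 2 ? 0 : 1;
}
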
19 changes: 9 additions & 10 deletions cpp/include/cudf/detail/join.hpp
@@ -36,8 +36,8 @@
 template <typename T>
 class default_allocator;

-namespace cudf::structs::detail {
-class flattened_table;
+namespace cudf::experimental::row::equality {
+class preprocessed_table;
 }

 namespace cudf {
@@ -77,9 +77,9 @@ struct hash_join {
   rmm::device_buffer const _composite_bitmask;  ///< Bitmask to denote whether a row is valid
   cudf::null_equality const _nulls_equal;       ///< whether to consider nulls as equal
   cudf::table_view _build;                      ///< input table to build the hash map
-  std::unique_ptr<cudf::structs::detail::flattened_table>
-    _flattened_build_table;  ///< flattened data structures for `_build`
-  map_type _hash_table;      ///< hash table built on `_build`
+  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table>
+    _preprocessed_build;  ///< input table preprocessed for row operators
+  map_type _hash_table;   ///< hash table built on `_build`

 public:
   /**
@@ -152,21 +152,20 @@
    * i.e. if full join is specified as the join type then left join is called. Behavior
    * is undefined if the provided `output_size` is smaller than the actual output size.
    *
-   * @throw cudf::logic_error if build table is empty and `JoinKind == INNER_JOIN`.
-   *
-   * @tparam JoinKind The type of join to be performed.
+   * @throw cudf::logic_error if build table is empty and `join == INNER_JOIN`.
    *
    * @param probe_table Table of probe side columns to join.
+   * @param join The type of join to be performed.
    * @param output_size Optional value which allows users to specify the exact output size.
    * @param stream CUDA stream used for device memory operations and kernel launches.
    * @param mr Device memory resource used to allocate the returned vectors.
    *
    * @return Join output indices vector pair.
    */
-  template <cudf::detail::join_kind JoinKind>
   std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
             std::unique_ptr<rmm::device_uvector<size_type>>>
   probe_join_indices(cudf::table_view const& probe_table,
+                     join_kind join,
                      std::optional<std::size_t> output_size,
                      rmm::cuda_stream_view stream,
                      rmm::mr::device_memory_resource* mr) const;
@@ -179,10 +178,10 @@ struct hash_join {
    * @throw cudf::logic_error if the number of columns in build table and probe table do not match.
    * @throw cudf::logic_error if the column data types in build table and probe table do not match.
    */
-  template <cudf::detail::join_kind JoinKind>
   std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
             std::unique_ptr<rmm::device_uvector<size_type>>>
   compute_hash_join(cudf::table_view const& probe,
+                    join_kind join,
                     std::optional<std::size_t> output_size,
                     rmm::cuda_stream_view stream,
                     rmm::mr::device_memory_resource* mr) const;
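
This header changes two internals at once: the flattened-table machinery is replaced by the shared row-operator preprocessed_table, and the join kind moves from a non-type template parameter to an ordinary runtime argument (join_kind join), so the header no longer needs one instantiation per join type. A self-contained sketch of that compile-time-to-runtime refactor follows; the names are illustrative, not cudf's internals:

#include <cstdio>

enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN };

// Before: the join kind was a non-type template parameter, so each kind
// produced a separate instantiation and callers dispatched at compile time.
template <join_kind JoinKind>
void probe_compile_time()
{
  if constexpr (JoinKind == join_kind::INNER_JOIN) { std::puts("inner probe"); }
  else { std::puts("left/full probe"); }  // full join reuses the left-join probe path
}

// After: the kind is an ordinary runtime argument, so one function serves
// every kind and the header stops leaking template machinery.
void probe_run_time(join_kind join)
{
  if (join == join_kind::INNER_JOIN) { std::puts("inner probe"); }
  else { std::puts("left/full probe"); }
}

int main()
{
  probe_compile_time<join_kind::INNER_JOIN>();  // resolved at compile time
  probe_run_time(join_kind::FULL_JOIN);         // resolved at run time
}
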
2 changes: 1 addition & 1 deletion cpp/src/io/orc/reader_impl.cu
@@ -1015,7 +1015,7 @@ table_with_metadata reader::impl::read(size_type skip_rows,
       const auto num_columns = columns_level.size();
       cudf::detail::hostdevice_2dvector<gpu::ColumnDesc> chunks(
         total_num_stripes, num_columns, stream);
-      memset(chunks.base_host_ptr(), 0, chunks.memory_size());
+      memset(chunks.base_host_ptr(), 0, chunks.size_bytes());

       const bool use_index =
         _use_index &&
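
The memset change is a one-liner, but the distinction matters: assuming hostdevice_2dvector's memory_size() reports the full allocation while size_bytes() reports only the bytes backing the live rows x columns elements, zeroing with size_bytes() touches exactly the elements in use. A sketch of that distinction with a hypothetical buffer type (the real class is cudf::detail::hostdevice_2dvector):

#include <cstddef>
#include <cstring>
#include <vector>

// Hypothetical 2-D host buffer standing in for cudf::detail::hostdevice_2dvector.
template <typename T>
struct host_2d_buffer {
  std::vector<T> storage;
  std::size_t rows, cols;

  host_2d_buffer(std::size_t r, std::size_t c) : storage(r * c), rows(r), cols(c) {}

  T* base_host_ptr() { return storage.data(); }
  // Bytes backing the live rows x cols elements only.
  std::size_t size_bytes() const { return rows * cols * sizeof(T); }
};

int main()
{
  host_2d_buffer<int> chunks(4, 8);
  // Zero exactly the elements in use; sizing the memset from anything other
  // than the live-element byte count risks touching memory past them.
  std::memset(chunks.base_host_ptr(), 0, chunks.size_bytes());
}
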