Fix subword_tokenize error when input contains no tokens #13320

Merged: 10 commits, May 15, 2023
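For context, a minimal sketch of the case this PR fixes, mirroring the new NoTokens test below (the vocabulary file path and the tensor sizes here are illustrative assumptions, not taken from the PR): rows containing only whitespace normalize to zero tokens, which previously raised an error and now yields all-zero token-id and attention-mask tensors.

```cpp
// Hypothetical repro sketch; the nvtext calls match the tests in this PR,
// but the vocabulary file path and sizes are illustrative assumptions.
#include <cudf_test/column_wrapper.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <nvtext/subword_tokenize.hpp>

void tokenize_rows_with_no_tokens()
{
  cudf::test::strings_column_wrapper strings({"  ", "\n\r", "\t"});  // only whitespace
  auto input = cudf::strings_column_view{strings};

  auto vocab = nvtext::load_vocabulary_file("hashed_vocab.txt");  // assumed path

  // Before this change the call below errored once no tokens were found;
  // it now returns one all-zero tensor row per input string.
  auto result = nvtext::subword_tokenize(input,
                                         *vocab,
                                         16 /*max_sequence_length*/,
                                         16 /*stride*/,
                                         true /*do_lower_case*/,
                                         true /*do_truncate*/,
                                         2 /*max_rows_tensor*/);
}
```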
35 changes: 33 additions & 2 deletions cpp/src/text/subword/subword_tokenize.cu
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,10 +14,13 @@
* limitations under the License.
*/

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/sequence.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/scalar/scalar.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>

@@ -31,6 +34,7 @@
#include <thrust/for_each.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/tabulate.h>
#include <thrust/transform_scan.h>

namespace nvtext {
@@ -125,6 +129,28 @@ __global__ void kernel_compute_tensor_metadata(
}
}

// this happens if there are no tokens in the input
tokenizer_result build_empty_result(cudf::size_type size,
uint32_t max_sequence_length,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto zero = cudf::numeric_scalar<uint32_t>(0, true, stream);
auto ids = cudf::detail::sequence(size * max_sequence_length, zero, zero, stream, mr);
auto mask = cudf::detail::sequence(size * max_sequence_length, zero, zero, stream, mr);

auto metadata = cudf::make_numeric_column(
cudf::data_type{cudf::type_id::UINT32}, size * 3, cudf::mask_state::UNALLOCATED, stream, mr);
thrust::tabulate(rmm::exec_policy(stream),
metadata->mutable_view().begin<uint32_t>(),
metadata->mutable_view().end<uint32_t>(),
[] __device__(auto idx) { return ((idx % 3) == 0) ? idx / 3 : 0; });
metadata->set_null_count(0);

return tokenizer_result{
static_cast<uint32_t>(size), max_sequence_length, std::move(ids), std::move(mask), std::move(metadata)};
}

} // namespace

tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
@@ -143,12 +169,13 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
"max_sequence_length x max_rows_tensor is too large for cudf output column size");
auto const strings_count = strings.size();
if (strings_count == 0 || strings.chars_size() == 0)
if (strings_count == strings.null_count()) { // empty or all-null returns empty
return tokenizer_result{0,
max_sequence_length,
cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32}),
cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32}),
cudf::make_empty_column(cudf::data_type{cudf::type_id::UINT32})};
}

auto const offsets = strings.offsets();
auto const d_offsets = offsets.data<uint32_t>() + strings.offset();
@@ -187,6 +214,10 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
thrust::plus<uint32_t>());
// last element is the total number of output rows
uint32_t const nrows_tensor_token_ids = offsets_per_tensor.element(strings_count, stream);
// if there are no tokens at all, build a specific empty result
if (nrows_tensor_token_ids == 0) {
return build_empty_result(strings_count, max_sequence_length, stream, mr);
}

// compute global_row to tensor, and global_row to within_tensor_row correspondence
rmm::device_uvector<uint32_t> row2tensor(nrows_tensor_token_ids, stream);
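As a quick illustration of the shape of the result produced by build_empty_result (a host-side sketch under assumed sizes, not part of the change): the token ids and attention mask are all zeros, and the metadata column holds one [row-index, 0, 0] triplet per input row, which is exactly what the tabulate expression above generates.

```cpp
// Host-side sketch of the metadata pattern for `size` input rows with no tokens.
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
  int const size = 3;  // number of input strings (illustrative)
  std::vector<uint32_t> metadata(size * 3);
  for (std::size_t idx = 0; idx < metadata.size(); ++idx) {
    // same formula as the device lambda: row index at every third slot, zeros elsewhere
    metadata[idx] = (idx % 3 == 0) ? static_cast<uint32_t>(idx / 3) : 0;
  }
  for (auto v : metadata) { std::printf("%u ", v); }  // prints: 0 0 0 1 0 0 2 0 0
  std::printf("\n");
  return 0;
}
```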
14 changes: 9 additions & 5 deletions cpp/src/text/subword/wordpiece_tokenizer.cu
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -498,9 +498,12 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
// We need to change the end_word_indices pointer after the selection is complete
device_end_word_indices = device_start_word_indices + num_words;

cudf::detail::grid_1d const grid{static_cast<cudf::size_type>(num_words), THREADS_PER_BLOCK};
detail::
kernel_wordpiece_tokenizer<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
if (num_words > 0) {
cudf::detail::grid_1d const grid{static_cast<cudf::size_type>(num_words), THREADS_PER_BLOCK};
detail::kernel_wordpiece_tokenizer<<<grid.num_blocks,
grid.num_threads_per_block,
0,
stream.value()>>>(
device_code_points,
vocab_table.table->view().data<uint64_t>(),
vocab_table.bin_coefficients->view().data<uint64_t>(),
@@ -515,7 +518,8 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
num_words,
device_token_ids.data(),
device_tokens_per_word.data());
CUDF_CHECK_CUDA(stream.value());
CUDF_CHECK_CUDA(stream.value());
}

// Repurpose the input array for the token ids. In the worst case, each code point ends up being a
// token so this will always have enough memory to store the contiguous tokens.
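The new if (num_words > 0) guard above skips the launch entirely when there is no work; a standalone sketch of why (the kernel and sizes here are illustrative, not from the library): a CUDA kernel launched with a grid dimension of zero fails with an invalid-configuration error, so a zero-word input must bypass kernel_wordpiece_tokenizer.

```cpp
// Standalone CUDA sketch of guarding a zero-block launch (illustrative only).
#include <cstdio>
#include <cuda_runtime.h>

__global__ void noop_kernel(unsigned int* out)
{
  if (out != nullptr) { *out = 0; }
}

int main()
{
  int const num_words = 0;  // e.g. whitespace-only input produces no words
  int const threads   = 64;
  int const blocks    = (num_words + threads - 1) / threads;  // 0 blocks

  if (num_words > 0) {  // same shape of guard as the change above
    noop_kernel<<<blocks, threads>>>(nullptr);
  }
  // Without the guard, a <<<0, threads>>> launch reports an invalid configuration error.
  std::printf("launch status: %s\n", cudaGetErrorString(cudaGetLastError()));
  return 0;
}
```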
71 changes: 71 additions & 0 deletions cpp/tests/text/subword_tests.cpp
@@ -144,6 +144,49 @@ TEST(TextSubwordTest, TokenizeMultiRow)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata);
}

TEST(TextSubwordTest, TokenizeWithEmptyRow)
{
std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
create_hashed_vocab(hash_file);
auto vocab = nvtext::load_vocabulary_file(hash_file);

cudf::test::strings_column_wrapper strings{
"This is a test.", "", "This is a test. This is a tést."};
auto input = cudf::strings_column_view{strings};

uint32_t const max_seq = 8;
uint32_t const stride = 6;
bool const lower = true;
bool const truncate = false;

auto result =
nvtext::subword_tokenize(input, *vocab, max_seq, stride, lower, truncate, MAX_ROWS_TENSOR);

EXPECT_EQ(uint32_t{4}, result.nrows_tensor);

// clang-format off
auto expected_tokens = cudf::test::fixed_width_column_wrapper<uint32_t>(
{2023, 2003, 1037, 3231, 1012, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
2023, 2003, 1037, 3231, 1012, 2023, 2003, 1037, // third row spans two tensors
2003, 1037, 3231, 1012, 0, 0, 0, 0}); // and continues into this one
// clang-format on
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_token_ids->view(), expected_tokens);
// clang-format off
auto expected_attn = cudf::test::fixed_width_column_wrapper<uint32_t>(
{1, 1, 1, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 0, 0, 0, 0});
// clang-format on
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_attention_mask->view(), expected_attn);
// clang-format off
auto expected_metadata = cudf::test::fixed_width_column_wrapper<uint32_t>(
{0,0,4, 1,0,0, 2,0,6, 2,1,3}); // the third input row produces 2 tensors
// clang-format on
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata);
}

TEST(TextSubwordTest, TokenizeMaxEqualsTokens)
{
cudf::test::strings_column_wrapper strings({"This is a test."});
@@ -236,6 +279,34 @@ TEST(TextSubwordTest, AllNullStrings)
EXPECT_EQ(0, result.tensor_metadata->size());
}

TEST(TextSubwordTest, NoTokens)
{
std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
create_hashed_vocab(hash_file);
auto vocab = nvtext::load_vocabulary_file(hash_file);

cudf::test::strings_column_wrapper strings({" ", "\n\r", "\t"});
auto input = cudf::strings_column_view{strings};

uint32_t const max_seq = 16;
uint32_t const stride = 16;
bool const lower = true;
bool const truncate = true;

auto result = nvtext::subword_tokenize(input, *vocab, max_seq, stride, lower, truncate, 2);

std::vector<uint32_t> zeros(max_seq * input.size(), 0);

EXPECT_EQ(static_cast<uint32_t>(input.size()), result.nrows_tensor);

auto expected = cudf::test::fixed_width_column_wrapper<uint32_t>(zeros.begin(), zeros.end());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_token_ids->view(), expected);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_attention_mask->view(), expected);
auto expected_metadata =
cudf::test::fixed_width_column_wrapper<uint32_t>({0, 0, 0, 1, 0, 0, 2, 0, 0});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata);
}

TEST(TextSubwordTest, TokenizeFromVocabStruct)
{
std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");