Simplify TTL value fetch for ttl column
Upstream commit ID : fb-mysql-5.6.35/77032004ad23d21a4c386f8136ecfbb071ea42d6
PS-6865 : Merge fb-prod201903

Summary:
Currently, during the primary key's value encode, the TTL value can come from
one of these three cases:
1. TTL column in the primary key
2. no TTL column (implicit TTL)
   a. old record (update case)
   b. current timestamp
3. TTL column in a non-key field

Workflow for case #1: Rdb_key_def::pack_record() first finds and stores
pk_offset; then, during value encode, the key slice is parsed to fetch the TTL
value at pk_offset.

Workflow for case #3: fetch the TTL value directly from the TTL column.

This change merges cases #1 and #3 by always fetching the TTL value from the
TTL column, no matter whether the TTL column is part of the primary key or not.
pk_offset is removed, since it is no longer used.
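
For illustration only (not the actual MyRocks code), a minimal standalone C++
sketch of the merged path: read the 8-byte unsigned timestamp from the TTL
column's field buffer and store it big-endian at the front of the value record,
regardless of whether that column happens to be part of the primary key.
read_uint64_le and store_uint64_be are simplified stand-ins for the server's
uint8korr and rdb_netbuf_store_uint64 helpers; all names here are invented for
the sketch.

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Simplified stand-ins for uint8korr / rdb_netbuf_store_uint64:
// fields store the value little-endian, the record prefix is big-endian.
static uint64_t read_uint64_le(const unsigned char *p) {
  uint64_t v = 0;
  for (int i = 7; i >= 0; --i) v = (v << 8) | p[i];
  return v;
}
static void store_uint64_be(unsigned char *p, uint64_t v) {
  for (int i = 7; i >= 0; --i) {
    p[i] = static_cast<unsigned char>(v & 0xff);
    v >>= 8;
  }
}

constexpr size_t kTtlRecordSize = 8;  // ROCKSDB_SIZEOF_TTL_RECORD

// Prepend the TTL timestamp taken from the TTL column to the value record.
// ttl_field points at the column's 8-byte buffer; it no longer matters
// whether that column is part of the primary key.
static std::vector<unsigned char> encode_value_with_ttl(
    const unsigned char *ttl_field,
    const std::vector<unsigned char> &payload) {
  std::vector<unsigned char> record(kTtlRecordSize + payload.size());
  store_uint64_be(record.data(), read_uint64_le(ttl_field));
  std::memcpy(record.data() + kTtlRecordSize, payload.data(), payload.size());
  return record;
}

int main() {
  // Fill the column buffer little-endian, as a BIGINT UNSIGNED field would.
  const uint64_t now = 1700000000;
  unsigned char ttl_column[8];
  for (int i = 0; i < 8; ++i)
    ttl_column[i] = static_cast<unsigned char>((now >> (8 * i)) & 0xff);

  const std::vector<unsigned char> payload = {0x01, 0x02, 0x03};
  const auto record = encode_value_with_ttl(ttl_column, payload);
  std::cout << "record size: " << record.size() << "\n";  // 8 + 3 = 11
  return 0;
}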

For secondary keys, the TTL value always comes from m_ttl_bytes, which is
populated during primary key value encoding.
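
A hypothetical condensed sketch of why m_ttl_bytes matters (the struct below is
invented for illustration and only mirrors the roles of m_ttl_bytes and
m_ttl_bytes_updated): the primary-key value encode records whether the TTL
bytes changed, and update_write_sk() can then skip rewriting a secondary key
whose packed bytes are identical and whose TTL did not move.

#include <array>
#include <cstring>

constexpr size_t kTtlRecordSize = 8;  // ROCKSDB_SIZEOF_TTL_RECORD

struct SkUpdateDecision {
  std::array<unsigned char, kTtlRecordSize> ttl_bytes{};  // role of m_ttl_bytes
  bool ttl_bytes_updated = false;  // role of m_ttl_bytes_updated

  // Called from the primary-key value encode; is_update is true when an
  // old record exists (i.e. old_pk_slice is non-empty).
  void note_ttl(const unsigned char *new_ttl, bool is_update) {
    if (is_update)
      ttl_bytes_updated =
          std::memcmp(ttl_bytes.data(), new_ttl, kTtlRecordSize) != 0;
    std::memcpy(ttl_bytes.data(), new_ttl, kTtlRecordSize);
  }

  // A secondary key write can be skipped iff its packed bytes are unchanged
  // and the TTL bytes did not change either.
  bool can_skip_sk_write(bool sk_packed_bytes_equal) const {
    return sk_packed_bytes_equal && !ttl_bytes_updated;
  }
};

int main() {
  SkUpdateDecision d;
  unsigned char ttl[kTtlRecordSize] = {};
  ttl[7] = 1;                            // some non-zero timestamp byte
  d.note_ttl(ttl, /*is_update=*/true);   // TTL changed vs. the zeroed state
  const bool skip = d.can_skip_sk_write(/*sk_packed_bytes_equal=*/true);
  return skip ? 1 : 0;                   // here: 0, the SK must be rewritten
}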

Reviewed By: yizhang82

Differential Revision: D14662716

fbshipit-source-id: 6b4e5f044fd
luqun authored and inikep committed Sep 9, 2024
1 parent 487385a commit 49b9040
Showing 4 changed files with 52 additions and 101 deletions.
95 changes: 33 additions & 62 deletions storage/rocksdb/ha_rocksdb.cc
@@ -5649,48 +5649,47 @@ int ha_rocksdb::convert_record_to_storage_format(
Rdb_string_writer *const pk_unpack_info = row_info.new_pk_unpack_info;
bool has_ttl = m_pk_descr->has_ttl();
bool has_ttl_column = !m_pk_descr->m_ttl_column.empty();
bool ttl_in_pk = has_ttl_column && (row_info.ttl_pk_offset != UINT_MAX);
int null_bytes_in_record = m_converter->get_null_bytes_in_record();

m_storage_record.length(0);

if (has_ttl) {
/* If it's a TTL record, reserve space for 8 byte TTL value in front. */
m_storage_record.fill(ROCKSDB_SIZEOF_TTL_RECORD + null_bytes_in_record, 0);
// NOTE: m_ttl_bytes_updated is only used for update case
// During update, skip update sk key/values slice iff none of sk fields
// have changed and ttl bytes isn't changed. see
// ha_rocksdb::update_write_sk() for more info
m_ttl_bytes_updated = false;

/*
If the TTL is contained within the key, we use the offset to find the
TTL value and place it in the beginning of the value record.
If the TTL is contained within table columns, we use the field index to
find the TTL value and place it in the beginning of the value record.
*/
if (ttl_in_pk) {
Rdb_string_reader reader(&pk_packed_slice);
const char *ts;
if (!reader.read(row_info.ttl_pk_offset) ||
!(ts = reader.read(ROCKSDB_SIZEOF_TTL_RECORD))) {
std::string buf;
buf = rdb_hexdump(pk_packed_slice.data(), pk_packed_slice.size(),
RDB_MAX_HEXDUMP_LEN);
const GL_INDEX_ID gl_index_id = m_pk_descr->get_gl_index_id();
LogPluginErrMsg(ERROR_LEVEL, 0,
"Decoding ttl from PK failed during insert, for index "
"(%u,%u), key: %s",
gl_index_id.cf_id, gl_index_id.index_id, buf.c_str());
return HA_EXIT_FAILURE;
}
if (has_ttl_column) {
uint ttl_field_index = m_pk_descr->get_ttl_field_index();
DBUG_ASSERT(ttl_field_index != UINT_MAX);

Field *const field = table->field[ttl_field_index];
DBUG_ASSERT(field->pack_length_in_rec() == ROCKSDB_SIZEOF_TTL_RECORD);
DBUG_ASSERT(field->real_type() == MYSQL_TYPE_LONGLONG);

char *const data = const_cast<char *>(m_storage_record.ptr());
memcpy(data, ts, ROCKSDB_SIZEOF_TTL_RECORD);
uint64 ts = uint8korr(field->ptr);
#if !defined(DBUG_OFF)
// Adjust for test case if needed
rdb_netbuf_store_uint64(
reinterpret_cast<uchar *>(data),
rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(data)) +
rdb_dbug_set_ttl_rec_ts());
ts += rdb_dbug_set_ttl_rec_ts();
#endif // !defined(DBUG_OFF)
// Also store in m_ttl_bytes to propagate to update_write_sk
rdb_netbuf_store_uint64(reinterpret_cast<uchar *>(data), ts);

// If this is an update and the timestamp has been updated, take note
// so we can avoid updating SKs unnecessarily.
if (!row_info.old_pk_slice.empty()) {
m_ttl_bytes_updated =
memcmp(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD);
}
// Store timestamp in m_ttl_bytes to propagate to update_write_sk
memcpy(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD);
} else if (!has_ttl_column) {
} else {
/*
For implicitly generated TTL records we need to copy over the old
TTL value from the old record in the event of an update. It was stored
@@ -5773,35 +5772,8 @@ int ha_rocksdb::convert_record_to_storage_format(
field_var->length_bytes + data_len);
} else {
/* Copy the field data */
const uint len = field->pack_length_in_rec();
m_storage_record.append(reinterpret_cast<char *>(field->ptr), len);

/*
Check if this is the TTL field within the table, if so store the TTL
in the front of the record as well here.
*/
if (has_ttl && has_ttl_column &&
i == m_pk_descr->get_ttl_field_offset()) {
DBUG_ASSERT(len == ROCKSDB_SIZEOF_TTL_RECORD);
DBUG_ASSERT(field->real_type() == MYSQL_TYPE_LONGLONG);
DBUG_ASSERT(m_pk_descr->get_ttl_field_offset() != UINT_MAX);

char *const data = const_cast<char *>(m_storage_record.ptr());
uint64 ts = uint8korr(field->ptr);
#if !defined(DBUG_OFF)
ts += rdb_dbug_set_ttl_rec_ts();
#endif // !defined(DBUG_OFF)
rdb_netbuf_store_uint64(reinterpret_cast<uchar *>(data), ts);

// If this is an update and the timestamp has been updated, take note
// so we can avoid updating SKs unnecessarily.
if (!row_info.old_pk_slice.empty()) {
m_ttl_bytes_updated =
memcmp(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD);
}
// Store timestamp in m_ttl_bytes to propagate to update_write_sk
memcpy(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD);
}
m_storage_record.append(reinterpret_cast<char *>(field->ptr),
field->pack_length_in_rec());
}
}

@@ -8799,10 +8771,9 @@ int ha_rocksdb::get_pk_for_update(struct update_row_info *const row_info) {

row_info->new_pk_unpack_info = &m_pk_unpack_info;

size =
m_pk_descr->pack_record(table, m_pack_buffer, row_info->new_data,
m_pk_packed_tuple, row_info->new_pk_unpack_info,
false, 0, 0, nullptr, &row_info->ttl_pk_offset);
size = m_pk_descr->pack_record(
table, m_pack_buffer, row_info->new_data, m_pk_packed_tuple,
row_info->new_pk_unpack_info, false, 0, 0, nullptr);
} else if (row_info->old_data == nullptr) {
row_info->hidden_pk_id = update_hidden_pk_val();
size =
@@ -9314,14 +9285,14 @@ int ha_rocksdb::update_write_sk(const TABLE *const table_arg,
new_packed_size =
kd.pack_record(table_arg, m_pack_buffer, row_info.new_data,
m_sk_packed_tuple, &m_sk_tails, store_row_debug_checksums,
row_info.hidden_pk_id, 0, nullptr, nullptr, m_ttl_bytes);
row_info.hidden_pk_id, 0, nullptr, m_ttl_bytes);

if (row_info.old_data != nullptr) {
// The old value
old_packed_size = kd.pack_record(
table_arg, m_pack_buffer, row_info.old_data, m_sk_packed_tuple_old,
&m_sk_tails_old, store_row_debug_checksums, row_info.hidden_pk_id, 0,
nullptr, nullptr, m_ttl_bytes);
nullptr, m_ttl_bytes);

/*
Check if we are going to write the same value. This can happen when
@@ -11763,7 +11734,7 @@ int ha_rocksdb::inplace_populate_sk(
const int new_packed_size = index->pack_record(
new_table_arg, m_pack_buffer, table->record[0], m_sk_packed_tuple,
&m_sk_tails, should_store_row_debug_checksums(), hidden_pk_id, 0,
nullptr, nullptr, m_ttl_bytes);
nullptr, m_ttl_bytes);

const rocksdb::Slice key = rocksdb::Slice(
reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
10 changes: 0 additions & 10 deletions storage/rocksdb/ha_rocksdb.h
@@ -643,16 +643,6 @@ class ha_rocksdb : public my_core::handler {

longlong hidden_pk_id;
bool skip_unique_check;

// In certain cases, TTL is enabled on a table, as well as an explicit TTL
// column. The TTL column can be part of either the key or the value part
// of the record. If it is part of the key, we store the offset here.
//
// Later on, we use this offset to store the TTL in the value part of the
// record, which we can then access in the compaction filter.
//
// Set to UINT_MAX by default to indicate that the TTL is not in key.
uint ttl_pk_offset = UINT_MAX;
};

/*
41 changes: 16 additions & 25 deletions storage/rocksdb/rdb_datadic.cc
@@ -95,7 +95,7 @@ Rdb_key_def::Rdb_key_def(uint indexnr_arg, uint keyno_arg,
m_ttl_rec_offset(ttl_rec_offset), m_ttl_duration(ttl_duration),
m_ttl_column(""), m_pk_part_no(nullptr), m_pack_info(nullptr),
m_keyno(keyno_arg), m_key_parts(0), m_ttl_pk_key_part_offset(UINT_MAX),
m_ttl_field_offset(UINT_MAX), m_prefix_extractor(nullptr),
m_ttl_field_index(UINT_MAX), m_prefix_extractor(nullptr),
m_maxlength(0) // means 'not intialized'
{
mysql_mutex_init(0, &m_mutex, MY_MUTEX_INIT_FAST);
@@ -120,7 +120,7 @@ Rdb_key_def::Rdb_key_def(const Rdb_key_def &k)
m_ttl_column(k.m_ttl_column), m_pk_part_no(k.m_pk_part_no),
m_pack_info(nullptr), m_keyno(k.m_keyno), m_key_parts(k.m_key_parts),
m_ttl_pk_key_part_offset(k.m_ttl_pk_key_part_offset),
m_ttl_field_offset(UINT_MAX), m_prefix_extractor(k.m_prefix_extractor),
m_ttl_field_index(UINT_MAX), m_prefix_extractor(k.m_prefix_extractor),
m_maxlength(k.m_maxlength) {
mysql_mutex_init(0, &m_mutex, MY_MUTEX_INIT_FAST);
rdb_netbuf_store_index(m_index_number_storage_form, m_index_number);
@@ -250,7 +250,7 @@ void Rdb_key_def::setup(const TABLE *const tbl,
table creation.
*/
Rdb_key_def::extract_ttl_col(tbl, tbl_def, &m_ttl_column,
&m_ttl_field_offset, true);
&m_ttl_field_index, true);

size_t max_len = INDEX_NUMBER_SIZE;
int unpack_len = 0;
@@ -429,7 +429,7 @@ uint Rdb_key_def::extract_ttl_duration(const TABLE *const table_arg,
uint Rdb_key_def::extract_ttl_col(const TABLE *const table_arg,
const Rdb_tbl_def *const tbl_def_arg,
std::string *ttl_column,
uint *ttl_field_offset, bool skip_checks) {
uint *ttl_field_index, bool skip_checks) {
std::string table_comment(table_arg->s->comment.str,
table_arg->s->comment.length);
/*
@@ -448,7 +448,7 @@ uint Rdb_key_def::extract_ttl_col(const TABLE *const table_arg,
if (my_strcasecmp(system_charset_info, field->field_name,
ttl_col_str.c_str()) == 0) {
*ttl_column = ttl_col_str;
*ttl_field_offset = i;
*ttl_field_index = i;
}
}
return HA_EXIT_SUCCESS;
@@ -465,7 +465,7 @@ uint Rdb_key_def::extract_ttl_col(const TABLE *const table_arg,
field->key_type() == HA_KEYTYPE_ULONGLONG &&
!field->real_maybe_null()) {
*ttl_column = ttl_col_str;
*ttl_field_offset = i;
*ttl_field_index = i;
found = true;
break;
}
@@ -1050,8 +1050,8 @@ uchar *Rdb_key_def::pack_field(Field *const field, Rdb_field_packing *pack_info,
unpack_info_len OUT Unpack data length
n_key_parts Number of keyparts to process. 0 means all of them.
n_null_fields OUT Number of key fields with NULL value.
ttl_pk_offset OUT Offset of the ttl column if specified and in the key
ttl_bytes IN Previous ttl bytes from old record for update case or
current ttl bytes from just packed primary key/value
@detail
Some callers do not need the unpack information, they can pass
unpack_info=nullptr, unpack_info_len=nullptr.
@@ -1060,12 +1060,14 @@ uchar *Rdb_key_def::pack_field(Field *const field, Rdb_field_packing *pack_info,
Length of the packed tuple
*/

uint Rdb_key_def::pack_record(
const TABLE *const tbl, uchar *const pack_buffer, const uchar *const record,
uchar *const packed_tuple, Rdb_string_writer *const unpack_info,
const bool should_store_row_debug_checksums, const longlong hidden_pk_id,
uint n_key_parts, uint *const n_null_fields, uint *const ttl_pk_offset,
const char *const ttl_bytes) const {
uint Rdb_key_def::pack_record(const TABLE *const tbl, uchar *const pack_buffer,
const uchar *const record,
uchar *const packed_tuple,
Rdb_string_writer *const unpack_info,
const bool should_store_row_debug_checksums,
const longlong hidden_pk_id, uint n_key_parts,
uint *const n_null_fields,
const char *const ttl_bytes) const {
DBUG_ASSERT(tbl != nullptr);
DBUG_ASSERT(pack_buffer != nullptr);
DBUG_ASSERT(record != nullptr);
@@ -1166,17 +1168,6 @@ uint Rdb_key_def::pack_record(
uint null_offset = field->null_offset(tbl->record[0]);
bool maybe_null = field->real_maybe_null();

// Save the ttl duration offset in the key so we can store it in front of
// the record later.
if (ttl_pk_offset && m_ttl_duration > 0 && i == m_ttl_pk_key_part_offset) {
DBUG_ASSERT(my_strcasecmp(system_charset_info, field->field_name,
m_ttl_column.c_str()) == 0);
DBUG_ASSERT(field->real_type() == MYSQL_TYPE_LONGLONG);
DBUG_ASSERT(field->key_type() == HA_KEYTYPE_ULONGLONG);
DBUG_ASSERT(!field->real_maybe_null());
*ttl_pk_offset = tuple - packed_tuple;
}

field->move_field(const_cast<uchar *>(record) + field_offset,
maybe_null ? const_cast<uchar *>(record) + null_offset
: nullptr,
7 changes: 3 additions & 4 deletions storage/rocksdb/rdb_datadic.h
@@ -203,7 +203,6 @@ class Rdb_key_def {
const bool should_store_row_debug_checksums,
const longlong hidden_pk_id = 0, uint n_key_parts = 0,
uint *const n_null_fields = nullptr,
uint *const ttl_pk_offset = nullptr,
const char *const ttl_bytes = nullptr) const;
/* Pack the hidden primary key into mem-comparable form. */
uint pack_hidden_pk(const longlong hidden_pk_id,
@@ -371,7 +370,7 @@ class Rdb_key_def {

uint get_key_parts() const { return m_key_parts; }

uint get_ttl_field_offset() const { return m_ttl_field_offset; }
uint get_ttl_field_index() const { return m_ttl_field_index; }

/*
Get a field object for key part #part_no
@@ -537,7 +536,7 @@ class Rdb_key_def {
uint64 *ttl_duration);
static uint extract_ttl_col(const TABLE *const table_arg,
const Rdb_tbl_def *const tbl_def_arg,
std::string *ttl_column, uint *ttl_field_offset,
std::string *ttl_column, uint *ttl_field_index,
bool skip_checks = false);
inline bool has_ttl() const { return m_ttl_duration > 0; }

@@ -869,7 +868,7 @@ class Rdb_key_def {
Index of the TTL column in table->s->fields, if it exists.
Default is UINT_MAX to denote that it does not exist.
*/
uint m_ttl_field_offset;
uint m_ttl_field_index;

/* Prefix extractor for the column family of the key definiton */
std::shared_ptr<const rocksdb::SliceTransform> m_prefix_extractor;
