Skip to content

Commit

Permalink
Issue percona#67: Inefficient index condition pushdown
Browse files Browse the repository at this point in the history
Summary:
Inside index_next_same() call, we should
1. first check whether the record matches the index
   lookup prefix,
2. then check pushed index condition.

If we try to check #2 without checking #1 first, we may walk
off the index lookup prefix and scan till the end of the index.

Differential Revision: https://reviews.facebook.net/D38769

fbshipit-source-id: 6c34e3ca0d2
  • Loading branch information
spetrunia authored and facebook-github-bot committed Mar 8, 2021
1 parent 7267065 commit f67d6ab
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 2 deletions.
34 changes: 34 additions & 0 deletions mysql-test/suite/rocksdb/r/rocksdb_icp.result
Original file line number Diff line number Diff line change
Expand Up @@ -191,3 +191,37 @@ ROWS_INDEX_NEXT-@rin 0
drop table t4;
drop procedure save_read_stats;
drop procedure get_read_stats;
#
# Issue #67: Inefficient index condition pushdown
#
create table t0 (a int);
insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
create table t1 (
pk int not null primary key,
key1 bigint(20) unsigned,
col1 int,
key (key1)
) engine=rocksdb;
insert into t1
select
A.a+10*B.a+100*C.a,
A.a+10*B.a+100*C.a,
1234
from t0 A, t0 B, t0 C;
set @count=0;
explain
select * from t1 where key1=1;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 ref key1 key1 9 const 10 Using index condition
set @count_diff =(select (value - @count) from information_schema.rocksdb_perf_context
where table_schema=database() and table_name='t1' and stat_type='INTERNAL_KEY_SKIPPED_COUNT');
select * from t1 where key1=1;
pk key1 col1
1 1 1234
set @count_diff =(select (value - @count) from information_schema.rocksdb_perf_context
where table_schema=database() and table_name='t1' and stat_type='INTERNAL_KEY_SKIPPED_COUNT');
# The following must be =1, or in any case not 999:
select @count_diff as "INTERNAL_KEY_SKIPPED_COUNT increment";
INTERNAL_KEY_SKIPPED_COUNT increment
1
drop table t0,t1;
36 changes: 36 additions & 0 deletions mysql-test/suite/rocksdb/t/rocksdb_icp.test
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,39 @@ let $cf_name=cf1;

--source suite/rocksdb/t/rocksdb_icp.inc

--echo #
--echo # Issue #67: Inefficient index condition pushdown
--echo #
create table t0 (a int);
insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);

create table t1 (
pk int not null primary key,
key1 bigint(20) unsigned,
col1 int,
key (key1)
) engine=rocksdb;

insert into t1
select
A.a+10*B.a+100*C.a,
A.a+10*B.a+100*C.a,
1234
from t0 A, t0 B, t0 C;

set @count=0;
let $save_query=
set @count_diff =(select (value - @count) from information_schema.rocksdb_perf_context
where table_schema=database() and table_name='t1' and stat_type='INTERNAL_KEY_SKIPPED_COUNT');

explain
select * from t1 where key1=1;

eval $save_query;
select * from t1 where key1=1;
eval $save_query;
--echo # The following must be =1, or in any case not 999:
select @count_diff as "INTERNAL_KEY_SKIPPED_COUNT increment";

drop table t0,t1;

43 changes: 41 additions & 2 deletions storage/rocksdb/ha_rocksdb.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1924,6 +1924,7 @@ ha_rocksdb::ha_rocksdb(handlerton *hton, TABLE_SHARE *table_arg)
pk_can_be_decoded(false),
pk_tuple(NULL), pk_packed_tuple(NULL),
sec_key_packed_tuple(NULL), sec_key_tails(NULL),
sec_key_match_prefix(NULL), sec_key_match_prefix_buf(NULL),
sec_key_packed_tuple_old(NULL), sec_key_tails_old(NULL),
pack_buffer(NULL),
lock_rows(FALSE),
Expand Down Expand Up @@ -2343,6 +2344,8 @@ int ha_rocksdb::open(const char *name, int mode, uint test_if_locked)

if (!(sec_key_packed_tuple= (uchar*)my_malloc(max_packed_sec_key_len,
MYF(0))) ||
!(sec_key_match_prefix_buf= (uchar*)my_malloc(max_packed_sec_key_len,
MYF(0))) ||
!(sec_key_packed_tuple_old= (uchar*)my_malloc(max_packed_sec_key_len,
MYF(0))) ||
!((sec_key_tails= (uchar*)my_malloc(max_packed_sec_key_len, MYF(0)))) ||
Expand Down Expand Up @@ -2379,6 +2382,10 @@ int ha_rocksdb::close(void)
my_free(sec_key_packed_tuple);
sec_key_packed_tuple= NULL;

if (sec_key_match_prefix_buf)
my_free(sec_key_match_prefix_buf);
sec_key_match_prefix_buf= NULL;

if (sec_key_packed_tuple_old)
my_free(sec_key_packed_tuple_old);
sec_key_packed_tuple_old= NULL;
Expand Down Expand Up @@ -2844,6 +2851,8 @@ int ha_rocksdb::index_read_map(uchar *buf, const uchar *key,
(keypart_map == ((key_part_map(1) << actual_key_parts)
- 1));

/* By default, we don't need the retrieved records to match the prefix */
sec_key_match_prefix= NULL;
stats.rows_requested++;
io_perf_start();

Expand All @@ -2869,8 +2878,7 @@ int ha_rocksdb::index_read_map(uchar *buf, const uchar *key,
/*
Unique secondary index performs lookups without the extended key fields
*/
uint packed_size;
if (active_index != table->s->primary_key &&
uint packed_size; if (active_index != table->s->primary_key &&
table->key_info[active_index].flags & HA_NOSAME &&
find_flag == HA_READ_KEY_EXACT && using_full_key)
{
Expand All @@ -2886,6 +2894,25 @@ int ha_rocksdb::index_read_map(uchar *buf, const uchar *key,
packed_size= kd->pack_index_tuple(table, pack_buffer,
sec_key_packed_tuple, key,
keypart_map);
if ((pushed_idx_cond && pushed_idx_cond_keyno == active_index) &&
(find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_PREFIX_LAST))
{
/*
We are doing a point index lookup, and ICP is enabled. It is possible
that this call will be followed by ha_rocksdb->index_next_same() call.
Do what InnoDB does: save the lookup tuple now. We will need it in
index_next_same/find_icp_matching_index_rec in order to stop scanning
as soon as index record doesn't match the lookup tuple.
When not using ICP, handler::index_next_same() will make sure that rows
that don't match the lookup prefix are not returned.
row matches the lookup prefix.
*/
sec_key_match_prefix= sec_key_match_prefix_buf;
sec_key_match_length= packed_size;
memcpy(sec_key_match_prefix, sec_key_packed_tuple, packed_size);
}

if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
find_flag == HA_READ_PREFIX_LAST ||
Expand Down Expand Up @@ -3205,6 +3232,16 @@ int ha_rocksdb::find_icp_matching_index_rec(bool move_forward, uchar *buf)
return HA_ERR_END_OF_FILE;
}

if (sec_key_match_prefix)
{
rocksdb::Slice prefix((const char*)sec_key_match_prefix, sec_key_match_length);
if (!kd->value_matches_prefix(rkey, prefix))
{
table->status= STATUS_NOT_FOUND;
return HA_ERR_END_OF_FILE;
}
}

rocksdb::Slice value= scan_it->value();
if (kd->unpack_record(table, buf, &rkey, &value))
{
Expand Down Expand Up @@ -3543,6 +3580,7 @@ int ha_rocksdb::index_next_with_direction(uchar *buf, bool move_forward)
int ha_rocksdb::index_first(uchar *buf)
{
io_perf_start();
sec_key_match_prefix= NULL;
int rc= key_descr[active_index]->is_reverse_cf ? index_last_intern(buf):
index_first_intern(buf);
io_perf_end_and_record();
Expand All @@ -3553,6 +3591,7 @@ int ha_rocksdb::index_first(uchar *buf)
int ha_rocksdb::index_last(uchar *buf)
{
io_perf_start();
sec_key_match_prefix= NULL;
int rc= key_descr[active_index]->is_reverse_cf ? index_first_intern(buf):
index_last_intern(buf);
io_perf_end_and_record();
Expand Down
10 changes: 10 additions & 0 deletions storage/rocksdb/ha_rocksdb.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,16 @@ class ha_rocksdb: public handler
*/
uchar *sec_key_tails;

/*
ha_rockdb->index_read_map(.. HA_READ_KEY_EXACT or similar) will save here
mem-comparable form of the index lookup tuple.
*/
uchar *sec_key_match_prefix;
uint sec_key_match_length;

/* Buffer space for the above */
uchar *sec_key_match_prefix_buf;

/* Second buffers, used by UPDATE. */
uchar *sec_key_packed_tuple_old;
uchar *sec_key_tails_old;
Expand Down

0 comments on commit f67d6ab

Please sign in to comment.