Skip to content

Commit 85cc831

Browse files
committed
MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogram
Variant #2. When Histogram::point_selectivity() sees that the point value of interest falls into one bucket, it tries to guess whether the bucket has many different (unpopular) values or a few popular values. (The number of rows per bucket is fixed, as it's a height-balanced histogram.) The basis for this guess is the "width" of the value range the bucket covers. Buckets covering wider value ranges are assumed to contain values with proportionally lower frequencies. This is just [brave] guesswork. For a very narrow bucket, it may produce an estimate that's larger than the total number of rows in the bucket or even in the whole table. Remove the guesswork and replace it with basic logic: return either the per-table average selectivity of col=const, or the selectivity of one bucket, whichever is lower.
1 parent bc97057 commit 85cc831

File tree

5 files changed

+234
-53
lines changed

5 files changed

+234
-53
lines changed

mysql-test/main/selectivity.result

Lines changed: 73 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -834,7 +834,7 @@ flush table t1;
834834
set optimizer_use_condition_selectivity=4;
835835
explain extended select * from t1 where a=0;
836836
id select_type table type possible_keys key key_len ref rows filtered Extra
837-
1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 0.39 Using where
837+
1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 0.78 Using where
838838
Warnings:
839839
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = 0
840840
drop table t1;
@@ -1649,7 +1649,7 @@ test.t1 analyze status Table is already up to date
16491649
# Check what info the optimizer has about selectivities
16501650
explain extended select * from t1 use index () where a in (17,51,5);
16511651
id select_type table type possible_keys key key_len ref rows filtered Extra
1652-
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 3.90 Using where
1652+
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 3.91 Using where
16531653
Warnings:
16541654
Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b` from `test`.`t1` USE INDEX () where `test`.`t1`.`a` in (17,51,5)
16551655
explain extended select * from t1 use index () where b=2;
@@ -1935,9 +1935,78 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
19351935
1 SIMPLE t1 ALL NULL NULL NULL NULL 5 25.00 Using where
19361936
Warnings:
19371937
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = 2
1938+
DROP TABLE t1;
1939+
# End of 10.2 tests
1940+
#
1941+
# MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogram
1942+
#
1943+
create table t0(a int);
1944+
insert into t0 select 1 from seq_1_to_78;
1945+
create table t1(a int);
1946+
insert into t1 select 1 from seq_1_to_26;
1947+
create table t10 (a int);
1948+
insert into t10 select 0 from t0, seq_1_to_4;
1949+
insert into t10 select 8693 from t1;
1950+
insert into t10 select 8694 from t1;
1951+
insert into t10 select 8695 from t1;
1952+
insert into t10 select 34783 from t1;
1953+
insert into t10 select 34784 from t1;
1954+
insert into t10 select 34785 from t1;
1955+
insert into t10 select 34785 from t0, seq_1_to_8;
1956+
insert into t10 select 65214 from t1;
1957+
insert into t10 select 65215 from t1;
1958+
insert into t10 select 65216 from t1;
1959+
insert into t10 select 65216 from t0, seq_1_to_52;
1960+
insert into t10 select 65217 from t1;
1961+
insert into t10 select 65218 from t1;
1962+
insert into t10 select 65219 from t1;
1963+
insert into t10 select 65219 from t0;
1964+
insert into t10 select 73913 from t1;
1965+
insert into t10 select 73914 from t1;
1966+
insert into t10 select 73915 from t1;
1967+
insert into t10 select 73915 from t0, seq_1_to_40;
1968+
insert into t10 select 78257 from t1;
1969+
insert into t10 select 78258 from t1;
1970+
insert into t10 select 78259 from t1;
1971+
insert into t10 select 91300 from t1;
1972+
insert into t10 select 91301 from t1;
1973+
insert into t10 select 91302 from t1;
1974+
insert into t10 select 91302 from t0, seq_1_to_6;
1975+
insert into t10 select 91303 from t1;
1976+
insert into t10 select 91304 from t1;
1977+
insert into t10 select 91305 from t1;
1978+
insert into t10 select 91305 from t0, seq_1_to_8;
1979+
insert into t10 select 99998 from t1;
1980+
insert into t10 select 99999 from t1;
1981+
insert into t10 select 100000 from t1;
1982+
set use_stat_tables=preferably;
1983+
analyze table t10 persistent for all;
1984+
Table Op Msg_type Msg_text
1985+
test.t10 analyze status Engine-independent statistics collected
1986+
test.t10 analyze status OK
1987+
flush tables;
1988+
set @tmp=@@optimizer_trace;
1989+
set optimizer_trace=1;
1990+
explain select * from t10 where a in (91303);
1991+
id select_type table type possible_keys key key_len ref rows Extra
1992+
1 SIMPLE t10 ALL NULL NULL NULL NULL 9984 Using where
1993+
# Must have selectivity_from_histogram <= 1.0:
1994+
select json_detailed(json_extract(trace, '$**.selectivity_for_columns'))
1995+
from information_schema.optimizer_trace;
1996+
json_detailed(json_extract(trace, '$**.selectivity_for_columns'))
1997+
[
1998+
[
1999+
{
2000+
"column_name": "a",
2001+
"ranges":
2002+
["91303 <= a <= 91303"],
2003+
"selectivity_from_histogram": 0.0357
2004+
}
2005+
]
2006+
]
2007+
set optimizer_trace=@tmp;
2008+
drop table t0,t1,t10;
19382009
set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity;
19392010
set histogram_size=@save_histogram_size;
19402011
set use_stat_tables= @save_use_stat_tables;
1941-
DROP TABLE t1;
1942-
# End of 10.2 tests
19432012
set @@global.histogram_size=@save_histogram_size;

mysql-test/main/selectivity.test

Lines changed: 82 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1319,14 +1319,93 @@ EXPLAIN EXTENDED SELECT * FROM t1 WHERE a=2;
13191319
FLUSH TABLES;
13201320

13211321
EXPLAIN EXTENDED SELECT * FROM t1 WHERE a=2;
1322-
set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity;
1323-
set histogram_size=@save_histogram_size;
1324-
set use_stat_tables= @save_use_stat_tables;
13251322

13261323
DROP TABLE t1;
13271324

13281325
--echo # End of 10.2 tests
13291326

1327+
--echo #
1328+
--echo # MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogram
1329+
--echo #
1330+
create table t0(a int); # This holds how many rows we hold in a bucket.
1331+
insert into t0 select 1 from seq_1_to_78;
1332+
1333+
create table t1(a int); # one-third of a bucket
1334+
insert into t1 select 1 from seq_1_to_26;
1335+
1336+
create table t10 (a int);
1337+
insert into t10 select 0 from t0, seq_1_to_4;
1338+
1339+
insert into t10 select 8693 from t1;
1340+
insert into t10 select 8694 from t1;
1341+
insert into t10 select 8695 from t1;
1342+
1343+
1344+
insert into t10 select 34783 from t1;
1345+
insert into t10 select 34784 from t1;
1346+
insert into t10 select 34785 from t1;
1347+
1348+
1349+
insert into t10 select 34785 from t0, seq_1_to_8;
1350+
1351+
insert into t10 select 65214 from t1;
1352+
insert into t10 select 65215 from t1;
1353+
insert into t10 select 65216 from t1;
1354+
1355+
insert into t10 select 65216 from t0, seq_1_to_52;
1356+
1357+
insert into t10 select 65217 from t1;
1358+
insert into t10 select 65218 from t1;
1359+
insert into t10 select 65219 from t1;
1360+
1361+
insert into t10 select 65219 from t0;
1362+
1363+
1364+
insert into t10 select 73913 from t1;
1365+
insert into t10 select 73914 from t1;
1366+
insert into t10 select 73915 from t1;
1367+
1368+
insert into t10 select 73915 from t0, seq_1_to_40;
1369+
1370+
1371+
insert into t10 select 78257 from t1;
1372+
insert into t10 select 78258 from t1;
1373+
insert into t10 select 78259 from t1;
1374+
1375+
insert into t10 select 91300 from t1;
1376+
insert into t10 select 91301 from t1;
1377+
insert into t10 select 91302 from t1;
1378+
1379+
insert into t10 select 91302 from t0, seq_1_to_6;
1380+
1381+
insert into t10 select 91303 from t1; # Only 1/3rd of bucket matches the search tuple
1382+
insert into t10 select 91304 from t1;
1383+
insert into t10 select 91305 from t1;
1384+
1385+
insert into t10 select 91305 from t0, seq_1_to_8;
1386+
1387+
insert into t10 select 99998 from t1;
1388+
insert into t10 select 99999 from t1;
1389+
insert into t10 select 100000 from t1;
1390+
1391+
set use_stat_tables=preferably;
1392+
analyze table t10 persistent for all;
1393+
flush tables;
1394+
1395+
set @tmp=@@optimizer_trace;
1396+
set optimizer_trace=1;
1397+
explain select * from t10 where a in (91303);
1398+
1399+
--echo # Must have selectivity_from_histogram <= 1.0:
1400+
select json_detailed(json_extract(trace, '$**.selectivity_for_columns'))
1401+
from information_schema.optimizer_trace;
1402+
1403+
set optimizer_trace=@tmp;
1404+
drop table t0,t1,t10;
1405+
1406+
set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity;
1407+
set histogram_size=@save_histogram_size;
1408+
set use_stat_tables= @save_use_stat_tables;
13301409
#
13311410
# Clean up
13321411
#

mysql-test/main/selectivity_innodb.result

Lines changed: 71 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -843,7 +843,7 @@ flush table t1;
843843
set optimizer_use_condition_selectivity=4;
844844
explain extended select * from t1 where a=0;
845845
id select_type table type possible_keys key key_len ref rows filtered Extra
846-
1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 0.39 Using where
846+
1 SIMPLE t1 ALL NULL NULL NULL NULL 1025 0.78 Using where
847847
Warnings:
848848
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = 0
849849
drop table t1;
@@ -1659,7 +1659,7 @@ test.t1 analyze status OK
16591659
# Check what info the optimizer has about selectivities
16601660
explain extended select * from t1 use index () where a in (17,51,5);
16611661
id select_type table type possible_keys key key_len ref rows filtered Extra
1662-
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 3.90 Using where
1662+
1 SIMPLE t1 ALL NULL NULL NULL NULL 1000 3.91 Using where
16631663
Warnings:
16641664
Note 1003 select `test`.`t1`.`a` AS `a`,`test`.`t1`.`b` AS `b` from `test`.`t1` USE INDEX () where `test`.`t1`.`a` in (17,51,5)
16651665
explain extended select * from t1 use index () where b=2;
@@ -1945,11 +1945,78 @@ id select_type table type possible_keys key key_len ref rows filtered Extra
19451945
1 SIMPLE t1 ALL NULL NULL NULL NULL 5 25.00 Using where
19461946
Warnings:
19471947
Note 1003 select `test`.`t1`.`a` AS `a` from `test`.`t1` where `test`.`t1`.`a` = 2
1948+
DROP TABLE t1;
1949+
# End of 10.2 tests
1950+
#
1951+
# MDEV-31067: selectivity_from_histogram >1.0 for a DOUBLE_PREC_HB histogram
1952+
#
1953+
create table t0(a int);
1954+
insert into t0 select 1 from seq_1_to_78;
1955+
create table t1(a int);
1956+
insert into t1 select 1 from seq_1_to_26;
1957+
create table t10 (a int);
1958+
insert into t10 select 0 from t0, seq_1_to_4;
1959+
insert into t10 select 8693 from t1;
1960+
insert into t10 select 8694 from t1;
1961+
insert into t10 select 8695 from t1;
1962+
insert into t10 select 34783 from t1;
1963+
insert into t10 select 34784 from t1;
1964+
insert into t10 select 34785 from t1;
1965+
insert into t10 select 34785 from t0, seq_1_to_8;
1966+
insert into t10 select 65214 from t1;
1967+
insert into t10 select 65215 from t1;
1968+
insert into t10 select 65216 from t1;
1969+
insert into t10 select 65216 from t0, seq_1_to_52;
1970+
insert into t10 select 65217 from t1;
1971+
insert into t10 select 65218 from t1;
1972+
insert into t10 select 65219 from t1;
1973+
insert into t10 select 65219 from t0;
1974+
insert into t10 select 73913 from t1;
1975+
insert into t10 select 73914 from t1;
1976+
insert into t10 select 73915 from t1;
1977+
insert into t10 select 73915 from t0, seq_1_to_40;
1978+
insert into t10 select 78257 from t1;
1979+
insert into t10 select 78258 from t1;
1980+
insert into t10 select 78259 from t1;
1981+
insert into t10 select 91300 from t1;
1982+
insert into t10 select 91301 from t1;
1983+
insert into t10 select 91302 from t1;
1984+
insert into t10 select 91302 from t0, seq_1_to_6;
1985+
insert into t10 select 91303 from t1;
1986+
insert into t10 select 91304 from t1;
1987+
insert into t10 select 91305 from t1;
1988+
insert into t10 select 91305 from t0, seq_1_to_8;
1989+
insert into t10 select 99998 from t1;
1990+
insert into t10 select 99999 from t1;
1991+
insert into t10 select 100000 from t1;
1992+
set use_stat_tables=preferably;
1993+
analyze table t10 persistent for all;
1994+
Table Op Msg_type Msg_text
1995+
test.t10 analyze status Engine-independent statistics collected
1996+
test.t10 analyze status OK
1997+
flush tables;
1998+
set statement optimizer_trace=1 for
1999+
explain select * from t10 where a in (91303);
2000+
id select_type table type possible_keys key key_len ref rows Extra
2001+
1 SIMPLE t10 ALL NULL NULL NULL NULL 9984 Using where
2002+
# Must have selectivity_from_histogram <= 1.0:
2003+
select json_detailed(json_extract(trace, '$**.selectivity_for_columns'))
2004+
from information_schema.optimizer_trace;
2005+
json_detailed(json_extract(trace, '$**.selectivity_for_columns'))
2006+
[
2007+
[
2008+
{
2009+
"column_name": "a",
2010+
"ranges":
2011+
["91303 <= a <= 91303"],
2012+
"selectivity_from_histogram": 0.035714283
2013+
}
2014+
]
2015+
]
2016+
drop table t0,t1,t10;
19482017
set optimizer_use_condition_selectivity= @save_optimizer_use_condition_selectivity;
19492018
set histogram_size=@save_histogram_size;
19502019
set use_stat_tables= @save_use_stat_tables;
1951-
DROP TABLE t1;
1952-
# End of 10.2 tests
19532020
set @@global.histogram_size=@save_histogram_size;
19542021
set optimizer_switch=@save_optimizer_switch_for_selectivity_test;
19552022
set @tmp_ust= @@use_stat_tables;

mysql-test/main/selectivity_no_engine.result

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,12 @@ test.t2 analyze status OK
3636
# The following two must have the same in 'Extra' column:
3737
explain extended select * from t2 where col1 IN (20, 180);
3838
id select_type table type possible_keys key key_len ref rows filtered Extra
39-
1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.35 Using where
39+
1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.00 Using where
4040
Warnings:
4141
Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where `test`.`t2`.`col1` in (20,180)
4242
explain extended select * from t2 where col1 IN (180, 20);
4343
id select_type table type possible_keys key key_len ref rows filtered Extra
44-
1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.35 Using where
44+
1 SIMPLE t2 ALL NULL NULL NULL NULL 1100 1.00 Using where
4545
Warnings:
4646
Note 1003 select `test`.`t2`.`col1` AS `col1` from `test`.`t2` where `test`.`t2`.`col1` in (180,20)
4747
drop table t1, t2;
@@ -102,7 +102,7 @@ test.t1 analyze status Engine-independent statistics collected
102102
test.t1 analyze status OK
103103
explain extended select * from t1 where col1 in (1,2,3);
104104
id select_type table type possible_keys key key_len ref rows filtered Extra
105-
1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 3.37 Using where
105+
1 SIMPLE t1 ALL NULL NULL NULL NULL 10000 2.97 Using where
106106
Warnings:
107107
Note 1003 select `test`.`t1`.`col1` AS `col1` from `test`.`t1` where `test`.`t1`.`col1` in (1,2,3)
108108
# Must not cause fp division by zero, or produce nonsense numbers:

sql/sql_statistics.cc

Lines changed: 5 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -3902,50 +3902,16 @@ double Histogram::point_selectivity(double pos, double avg_sel)
39023902
}
39033903
else
39043904
{
3905-
/*
3905+
/*
39063906
The value 'pos' fits within one single histogram bucket.
39073907
3908-
Histogram buckets have the same numbers of rows, but they cover
3909-
different ranges of values.
3910-
3911-
We assume that values are uniformly distributed across the [0..1] value
3912-
range.
3913-
*/
3914-
3915-
/*
3916-
If all buckets covered value ranges of the same size, the width of
3917-
value range would be:
3908+
We also have avg_sel which is per-table average selectivity of col=const.
3909+
If there are popular values, this may be larger than one bucket, so
3910+
cap the returned number by the selectivity of one bucket.
39183911
*/
39193912
double avg_bucket_width= 1.0 / (get_width() + 1);
3920-
3921-
/*
3922-
Let's see what is the width of value range that our bucket is covering.
3923-
(min==max currently. they are kept in the formula just in case we
3924-
will want to extend it to handle multi-bucket case)
3925-
*/
3926-
double inv_prec_factor= (double) 1.0 / prec_factor();
3927-
double current_bucket_width=
3928-
(max + 1 == get_width() ? 1.0 : (get_value(max) * inv_prec_factor)) -
3929-
(min == 0 ? 0.0 : (get_value(min-1) * inv_prec_factor));
3930-
3931-
DBUG_ASSERT(current_bucket_width); /* We shouldn't get a one zero-width bucket */
3932-
3933-
/*
3934-
So:
3935-
- each bucket has the same #rows
3936-
- values are uniformly distributed across the [min_value,max_value] domain.
39373913

3938-
If a bucket has a value range that's N times bigger than average, then
3939-
each value will have to have N times fewer rows than average.
3940-
*/
3941-
sel= avg_sel * avg_bucket_width / current_bucket_width;
3942-
3943-
/*
3944-
(Q: if we just follow this proportion we may end up in a situation
3945-
where number of different values we expect to find in this bucket
3946-
exceeds the number of rows that this histogram has in a bucket. Are
3947-
we ok with this or we would want to have certain caps?)
3948-
*/
3914+
sel= MY_MIN(avg_bucket_width, avg_sel);
39493915
}
39503916
return sel;
39513917
}

0 commit comments

Comments
 (0)