Skip to content

Commit e91468b

Browse files
[enhance](runtime filter) impl partition pruning in runtime filer (#47025)
This PR implements partition pruning through runtime filters. When executing a SQL query like: ```sql SELECT count(*) FROM int_partition_table WHERE partition_col = ( SELECT partition_col FROM int_partition_table GROUP BY partition_col HAVING count(*) > 0 ORDER BY partition_col DESC LIMIT 1 ) ``` During execution, the backend (BE) will receive a dynamic runtime filter condition `partition_col = xxx`. Since partition_col is a partitioning column, we can use its value to determine if the partition can be pruned. Additionally, this mechanism also supports filtering queries like: ```sql SELECT count(*) FROM int_partition_table WHERE func(partition_col) = xxx ``` If func cannot be evaluated at the frontend (FE), the frontend will not perform partition pruning. However, since the backend can compute func, this mechanism allows us to handle pruning scenarios that are not possible at the frontend, providing a more efficient pruning process on the backend side.
1 parent 74e82fd commit e91468b

File tree

21 files changed

+398
-11
lines changed

21 files changed

+398
-11
lines changed

be/src/vec/core/block.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ void Block::initialize_index_by_name() {
147147
void Block::insert(size_t position, const ColumnWithTypeAndName& elem) {
148148
if (position > data.size()) {
149149
throw Exception(ErrorCode::INTERNAL_ERROR,
150-
"invalid input position, position={}, data.size{}, names={}", position,
150+
"invalid input position, position={}, data.size={}, names={}", position,
151151
data.size(), dump_names());
152152
}
153153

@@ -164,7 +164,7 @@ void Block::insert(size_t position, const ColumnWithTypeAndName& elem) {
164164
void Block::insert(size_t position, ColumnWithTypeAndName&& elem) {
165165
if (position > data.size()) {
166166
throw Exception(ErrorCode::INTERNAL_ERROR,
167-
"invalid input position, position={}, data.size{}, names={}", position,
167+
"invalid input position, position={}, data.size={}, names={}", position,
168168
data.size(), dump_names());
169169
}
170170

be/src/vec/exec/scan/vfile_scanner.cpp

+156-6
Original file line numberDiff line numberDiff line change
@@ -20,22 +20,29 @@
2020
#include <fmt/format.h>
2121
#include <gen_cpp/Exprs_types.h>
2222
#include <gen_cpp/Metrics_types.h>
23+
#include <gen_cpp/Opcodes_types.h>
2324
#include <gen_cpp/PaloInternalService_types.h>
2425
#include <gen_cpp/PlanNodes_types.h>
26+
#include <glog/logging.h>
2527

28+
#include <algorithm>
2629
#include <boost/iterator/iterator_facade.hpp>
2730
#include <iterator>
2831
#include <map>
32+
#include <ranges>
2933
#include <tuple>
34+
#include <unordered_map>
3035
#include <utility>
3136

3237
#include "common/compiler_util.h" // IWYU pragma: keep
3338
#include "common/config.h"
3439
#include "common/logging.h"
40+
#include "common/status.h"
3541
#include "io/cache/block_file_cache_profile.h"
3642
#include "runtime/descriptors.h"
3743
#include "runtime/runtime_state.h"
3844
#include "runtime/types.h"
45+
#include "util/runtime_profile.h"
3946
#include "vec/aggregate_functions/aggregate_function.h"
4047
#include "vec/columns/column.h"
4148
#include "vec/columns/column_nullable.h"
@@ -67,6 +74,7 @@
6774
#include "vec/exec/scan/vscan_node.h"
6875
#include "vec/exprs/vexpr.h"
6976
#include "vec/exprs/vexpr_context.h"
77+
#include "vec/exprs/vexpr_fwd.h"
7078
#include "vec/exprs/vslot_ref.h"
7179
#include "vec/functions/function.h"
7280
#include "vec/functions/function_string.h"
@@ -130,12 +138,17 @@ Status VFileScanner::prepare(RuntimeState* state, const VExprContextSPtrs& conju
130138
ADD_TIMER_WITH_LEVEL(_local_state->scanner_profile(), "FileScannerPreFilterTimer", 1);
131139
_convert_to_output_block_timer = ADD_TIMER_WITH_LEVEL(_local_state->scanner_profile(),
132140
"FileScannerConvertOuputBlockTime", 1);
141+
_runtime_filter_partition_prune_timer = ADD_TIMER_WITH_LEVEL(
142+
_local_state->scanner_profile(), "FileScannerRuntimeFilterPartitionPruningTime", 1);
133143
_empty_file_counter =
134144
ADD_COUNTER_WITH_LEVEL(_local_state->scanner_profile(), "EmptyFileNum", TUnit::UNIT, 1);
135145
_not_found_file_counter = ADD_COUNTER_WITH_LEVEL(_local_state->scanner_profile(),
136146
"NotFoundFileNum", TUnit::UNIT, 1);
137147
_file_counter =
138148
ADD_COUNTER_WITH_LEVEL(_local_state->scanner_profile(), "FileNumber", TUnit::UNIT, 1);
149+
_runtime_filter_partition_pruned_range_counter =
150+
ADD_COUNTER_WITH_LEVEL(_local_state->scanner_profile(),
151+
"RuntimeFilterPartitionPrunedRangeNum", TUnit::UNIT, 1);
139152

140153
_file_cache_statistics.reset(new io::FileCacheStatistics());
141154
_io_ctx.reset(new io::IOContext());
@@ -174,6 +187,113 @@ Status VFileScanner::prepare(RuntimeState* state, const VExprContextSPtrs& conju
174187
return Status::OK();
175188
}
176189

190+
// check if the expr is a partition pruning expr
191+
bool VFileScanner::_check_partition_prune_expr(const VExprSPtr& expr) {
192+
if (expr->is_slot_ref()) {
193+
auto* slot_ref = static_cast<VSlotRef*>(expr.get());
194+
return _partition_slot_index_map.find(slot_ref->slot_id()) !=
195+
_partition_slot_index_map.end();
196+
}
197+
if (expr->is_literal()) {
198+
return true;
199+
}
200+
return std::ranges::all_of(expr->children(), [this](const auto& child) {
201+
return _check_partition_prune_expr(child);
202+
});
203+
}
204+
205+
void VFileScanner::_init_runtime_filter_partition_prune_ctxs() {
206+
_runtime_filter_partition_prune_ctxs.clear();
207+
for (auto& conjunct : _conjuncts) {
208+
auto impl = conjunct->root()->get_impl();
209+
// If impl is not null, which means this a conjuncts from runtime filter.
210+
auto expr = impl ? impl : conjunct->root();
211+
if (_check_partition_prune_expr(expr)) {
212+
_runtime_filter_partition_prune_ctxs.emplace_back(conjunct);
213+
}
214+
}
215+
}
216+
217+
void VFileScanner::_init_runtime_filter_partition_prune_block() {
218+
// init block with empty column
219+
for (auto const* slot_desc : _real_tuple_desc->slots()) {
220+
if (!slot_desc->need_materialize()) {
221+
// should be ignored from reading
222+
continue;
223+
}
224+
_runtime_filter_partition_prune_block.insert(
225+
ColumnWithTypeAndName(slot_desc->get_empty_mutable_column(),
226+
slot_desc->get_data_type_ptr(), slot_desc->col_name()));
227+
}
228+
}
229+
230+
Status VFileScanner::_process_runtime_filters_partition_prune(bool& can_filter_all) {
231+
SCOPED_TIMER(_runtime_filter_partition_prune_timer);
232+
if (_runtime_filter_partition_prune_ctxs.empty() || _partition_col_descs.empty()) {
233+
return Status::OK();
234+
}
235+
size_t partition_value_column_size = 1;
236+
237+
// 1. Get partition key values to string columns.
238+
std::unordered_map<SlotId, MutableColumnPtr> parititon_slot_id_to_column;
239+
for (auto const& partition_col_desc : _partition_col_descs) {
240+
const auto& [partition_value, partition_slot_desc] = partition_col_desc.second;
241+
auto test_serde = partition_slot_desc->get_data_type_ptr()->get_serde();
242+
auto partition_value_column = partition_slot_desc->get_data_type_ptr()->create_column();
243+
auto* col_ptr = static_cast<IColumn*>(partition_value_column.get());
244+
Slice slice(partition_value.data(), partition_value.size());
245+
int num_deserialized = 0;
246+
RETURN_IF_ERROR(test_serde->deserialize_column_from_fixed_json(
247+
*col_ptr, slice, partition_value_column_size, &num_deserialized, {}));
248+
parititon_slot_id_to_column[partition_slot_desc->id()] = std::move(partition_value_column);
249+
}
250+
251+
// 2. Fill _runtime_filter_partition_prune_block from the partition column, then execute conjuncts and filter block.
252+
// 2.1 Fill _runtime_filter_partition_prune_block from the partition column to match the conjuncts executing.
253+
size_t index = 0;
254+
bool first_column_filled = false;
255+
for (auto const* slot_desc : _real_tuple_desc->slots()) {
256+
if (!slot_desc->need_materialize()) {
257+
// should be ignored from reading
258+
continue;
259+
}
260+
if (parititon_slot_id_to_column.find(slot_desc->id()) !=
261+
parititon_slot_id_to_column.end()) {
262+
auto data_type = slot_desc->get_data_type_ptr();
263+
auto partition_value_column = std::move(parititon_slot_id_to_column[slot_desc->id()]);
264+
if (data_type->is_nullable()) {
265+
_runtime_filter_partition_prune_block.insert(
266+
index, ColumnWithTypeAndName(
267+
ColumnNullable::create(
268+
std::move(partition_value_column),
269+
ColumnUInt8::create(partition_value_column_size, 0)),
270+
data_type, slot_desc->col_name()));
271+
} else {
272+
_runtime_filter_partition_prune_block.insert(
273+
index, ColumnWithTypeAndName(std::move(partition_value_column), data_type,
274+
slot_desc->col_name()));
275+
}
276+
if (index == 0) {
277+
first_column_filled = true;
278+
}
279+
}
280+
index++;
281+
}
282+
283+
// 2.2 Execute conjuncts.
284+
if (!first_column_filled) {
285+
// VExprContext.execute has an optimization, the filtering is executed when block->rows() > 0
286+
// The following process may be tricky and time-consuming, but we have no other way.
287+
_runtime_filter_partition_prune_block.get_by_position(0).column->assume_mutable()->resize(
288+
partition_value_column_size);
289+
}
290+
IColumn::Filter result_filter(_runtime_filter_partition_prune_block.rows(), 1);
291+
RETURN_IF_ERROR(VExprContext::execute_conjuncts(_runtime_filter_partition_prune_ctxs, nullptr,
292+
&_runtime_filter_partition_prune_block,
293+
&result_filter, &can_filter_all));
294+
return Status::OK();
295+
}
296+
177297
Status VFileScanner::_process_conjuncts_for_dict_filter() {
178298
_slot_id_to_filter_conjuncts.clear();
179299
_not_single_slot_filter_conjuncts.clear();
@@ -237,6 +357,11 @@ Status VFileScanner::open(RuntimeState* state) {
237357
RETURN_IF_ERROR(_split_source->get_next(&_first_scan_range, &_current_range));
238358
if (_first_scan_range) {
239359
RETURN_IF_ERROR(_init_expr_ctxes());
360+
if (_state->query_options().enable_runtime_filter_partition_prune &&
361+
!_partition_slot_index_map.empty()) {
362+
_init_runtime_filter_partition_prune_ctxs();
363+
_init_runtime_filter_partition_prune_block();
364+
}
240365
} else {
241366
// there's no scan range in split source. stop scanner directly.
242367
_scanner_eof = true;
@@ -752,6 +877,29 @@ Status VFileScanner::_get_next_reader() {
752877
const TFileRangeDesc& range = _current_range;
753878
_current_range_path = range.path;
754879

880+
if (!_partition_slot_descs.empty()) {
881+
// we need get partition columns first for runtime filter partition pruning
882+
RETURN_IF_ERROR(_generate_parititon_columns());
883+
884+
if (_state->query_options().enable_runtime_filter_partition_prune) {
885+
// if enable_runtime_filter_partition_prune is true, we need to check whether this range can be filtered out
886+
// by runtime filter partition prune
887+
if (_push_down_conjuncts.size() < _conjuncts.size()) {
888+
// there are new runtime filters, need to re-init runtime filter partition pruning ctxs
889+
_init_runtime_filter_partition_prune_ctxs();
890+
}
891+
892+
bool can_filter_all = false;
893+
RETURN_IF_ERROR(_process_runtime_filters_partition_prune(can_filter_all));
894+
if (can_filter_all) {
895+
// this range can be filtered out by runtime filter partition pruning
896+
// so we need to skip this range
897+
COUNTER_UPDATE(_runtime_filter_partition_pruned_range_counter, 1);
898+
continue;
899+
}
900+
}
901+
}
902+
755903
// create reader for specific format
756904
Status init_status;
757905
// for compatibility, if format_type is not set in range, use the format type of params
@@ -1012,7 +1160,8 @@ Status VFileScanner::_get_next_reader() {
10121160
_missing_cols.clear();
10131161
RETURN_IF_ERROR(_cur_reader->get_columns(&_name_to_col_type, &_missing_cols));
10141162
_cur_reader->set_push_down_agg_type(_get_push_down_agg_type());
1015-
RETURN_IF_ERROR(_generate_fill_columns());
1163+
RETURN_IF_ERROR(_generate_missing_columns());
1164+
RETURN_IF_ERROR(_cur_reader->set_fill_columns(_partition_col_descs, _missing_col_descs));
10161165
if (VLOG_NOTICE_IS_ON && !_missing_cols.empty() && _is_load) {
10171166
fmt::memory_buffer col_buf;
10181167
for (auto& col : _missing_cols) {
@@ -1042,10 +1191,8 @@ Status VFileScanner::_get_next_reader() {
10421191
return Status::OK();
10431192
}
10441193

1045-
Status VFileScanner::_generate_fill_columns() {
1194+
Status VFileScanner::_generate_parititon_columns() {
10461195
_partition_col_descs.clear();
1047-
_missing_col_descs.clear();
1048-
10491196
const TFileRangeDesc& range = _current_range;
10501197
if (range.__isset.columns_from_path && !_partition_slot_descs.empty()) {
10511198
for (const auto& slot_desc : _partition_slot_descs) {
@@ -1066,7 +1213,11 @@ Status VFileScanner::_generate_fill_columns() {
10661213
}
10671214
}
10681215
}
1216+
return Status::OK();
1217+
}
10691218

1219+
Status VFileScanner::_generate_missing_columns() {
1220+
_missing_col_descs.clear();
10701221
if (!_missing_cols.empty()) {
10711222
for (auto slot_desc : _real_tuple_desc->slots()) {
10721223
if (!slot_desc->is_materialized()) {
@@ -1084,8 +1235,7 @@ Status VFileScanner::_generate_fill_columns() {
10841235
_missing_col_descs.emplace(slot_desc->col_name(), it->second);
10851236
}
10861237
}
1087-
1088-
return _cur_reader->set_fill_columns(_partition_col_descs, _missing_col_descs);
1238+
return Status::OK();
10891239
}
10901240

10911241
Status VFileScanner::_init_expr_ctxes() {

be/src/vec/exec/scan/vfile_scanner.h

+11-1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#include "vec/core/block.h"
3939
#include "vec/exec/format/generic_reader.h"
4040
#include "vec/exec/scan/vscanner.h"
41+
#include "vec/exprs/vexpr_fwd.h"
4142

4243
namespace doris {
4344
class RuntimeState;
@@ -163,6 +164,8 @@ class VFileScanner : public VScanner {
163164
Block _src_block;
164165

165166
VExprContextSPtrs _push_down_conjuncts;
167+
VExprContextSPtrs _runtime_filter_partition_prune_ctxs;
168+
Block _runtime_filter_partition_prune_block;
166169

167170
std::unique_ptr<io::FileCacheStatistics> _file_cache_statistics;
168171
std::unique_ptr<io::IOContext> _io_ctx;
@@ -183,9 +186,11 @@ class VFileScanner : public VScanner {
183186
RuntimeProfile::Counter* _fill_missing_columns_timer = nullptr;
184187
RuntimeProfile::Counter* _pre_filter_timer = nullptr;
185188
RuntimeProfile::Counter* _convert_to_output_block_timer = nullptr;
189+
RuntimeProfile::Counter* _runtime_filter_partition_prune_timer = nullptr;
186190
RuntimeProfile::Counter* _empty_file_counter = nullptr;
187191
RuntimeProfile::Counter* _not_found_file_counter = nullptr;
188192
RuntimeProfile::Counter* _file_counter = nullptr;
193+
RuntimeProfile::Counter* _runtime_filter_partition_pruned_range_counter = nullptr;
189194

190195
const std::unordered_map<std::string, int>* _col_name_to_slot_id = nullptr;
191196
// single slot filter conjuncts
@@ -213,7 +218,12 @@ class VFileScanner : public VScanner {
213218
Status _convert_to_output_block(Block* block);
214219
Status _truncate_char_or_varchar_columns(Block* block);
215220
void _truncate_char_or_varchar_column(Block* block, int idx, int len);
216-
Status _generate_fill_columns();
221+
Status _generate_parititon_columns();
222+
Status _generate_missing_columns();
223+
bool _check_partition_prune_expr(const VExprSPtr& expr);
224+
void _init_runtime_filter_partition_prune_ctxs();
225+
void _init_runtime_filter_partition_prune_block();
226+
Status _process_runtime_filters_partition_prune(bool& is_partition_pruned);
217227
Status _process_conjuncts_for_dict_filter();
218228
Status _process_late_arrival_conjuncts();
219229
void _get_slot_ids(VExpr* expr, std::vector<int>* slot_ids);

docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run73.hql

-2
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,3 @@ INSERT INTO employees VALUES
1717

1818

1919
msck repair table employees;
20-
21-
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
create database if not exists partition_tables;
2+
use partition_tables;
3+
4+
CREATE TABLE decimal_partition_table (
5+
id INT,
6+
name STRING,
7+
value FLOAT
8+
)
9+
PARTITIONED BY (partition_col DECIMAL(10, 2))
10+
STORED AS PARQUET
11+
LOCATION '/user/doris/preinstalled_data/partition_tables/decimal_partition_table';
12+
13+
CREATE TABLE int_partition_table (
14+
id INT,
15+
name STRING,
16+
value FLOAT
17+
)
18+
PARTITIONED BY (partition_col INT)
19+
STORED AS PARQUET
20+
LOCATION '/user/doris/preinstalled_data/partition_tables/int_partition_table';
21+
22+
CREATE TABLE string_partition_table (
23+
id INT,
24+
name STRING,
25+
value FLOAT
26+
)
27+
PARTITIONED BY (partition_col STRING)
28+
STORED AS PARQUET
29+
LOCATION '/user/doris/preinstalled_data/partition_tables/string_partition_table';
30+
31+
CREATE TABLE date_partition_table (
32+
id INT,
33+
name STRING,
34+
value FLOAT
35+
)
36+
PARTITIONED BY (partition_col DATE)
37+
STORED AS PARQUET
38+
LOCATION '/user/doris/preinstalled_data/partition_tables/date_partition_table';
39+
40+
41+
msck repair table decimal_partition_table;
42+
msck repair table int_partition_table;
43+
msck repair table string_partition_table;
44+
msck repair table date_partition_table;

0 commit comments

Comments
 (0)