Skip to content

Commit be21e62

Browse files
committed
[fix](parquet) Fix mismatch between data column and null map column when reading cross-page data of Parquet complex types
1 parent 51a6b14 commit be21e62

File tree

2 files changed

+30
-8
lines changed

2 files changed

+30
-8
lines changed

be/src/vec/exec/format/parquet/vparquet_column_reader.cpp

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,18 @@ Status ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataType
329329
// just read the remaining values of the last row in previous page,
330330
// so no new row should be read.
331331
batch_size = 0;
332+
/*
333+
* This function is called repeatedly until the batch size is reached, and each
334+
* call may run `_rep_levels.resize(0); _def_levels.resize(0);`. As a result, the
335+
* definition and repetition levels kept by the reader cover only the latter
336+
* part of the batch (the earlier part is lost). Therefore, when the
337+
* definition and repetition levels are used to fill the null_map of structs and maps,
338+
* this function must not be called more than once before the null_map is filled.
339+
* TODO:
340+
* Consider reading the entire batch of data in a single call, which
341+
* would make this function easier to use correctly. Note, however, that if the
342+
* data spans multiple pages, memory usage may increase significantly.
343+
*/
332344
} else {
333345
_rep_levels.resize(0);
334346
_def_levels.resize(0);
@@ -835,7 +847,7 @@ Status StructColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr
835847
continue;
836848
}
837849

838-
_read_column_names.insert(doris_name);
850+
_read_column_names.emplace_back(doris_name);
839851

840852
// select_vector.reset();
841853
size_t field_rows = 0;
@@ -847,6 +859,15 @@ Status StructColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr
847859
is_dict_filter));
848860
*read_rows = field_rows;
849861
*eof = field_eof;
862+
/*
863+
* When data spans pages, repeated calls to `_read_nested_column` can leave a reader with
864+
* only part of the definition and repetition levels. The null_map of the struct is filled later, so it is
865+
* crucial to use the definition and repetition levels from the first read column
866+
* (since `_read_nested_column` is not called repeatedly for it).
867+
*
868+
* Note that, in theory, any sub-column can be chosen to fill the null_map,
869+
* and selecting the shortest one would offer better performance.
870+
*/
850871
} else {
851872
while (field_rows < *read_rows && !field_eof) {
852873
size_t loop_rows = 0;

be/src/vec/exec/format/parquet/vparquet_column_reader.h

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -298,24 +298,25 @@ class StructColumnReader : public ParquetColumnReader {
298298
if (!_read_column_names.empty()) {
299299
// can't use _child_readers[_read_column_names.front()]
300300
// because the operator[] of std::unordered_map is not const :(
301-
return _child_readers.find(*_read_column_names.begin())->second->get_rep_level();
301+
return _child_readers.find(_read_column_names.front())->second->get_rep_level();
302302
}
303303
return _child_readers.begin()->second->get_rep_level();
304304
}
305305

306306
const std::vector<level_t>& get_def_level() const override {
307307
if (!_read_column_names.empty()) {
308-
return _child_readers.find(*_read_column_names.begin())->second->get_def_level();
308+
// Use the first read column; see the cross-page note in `_read_nested_column`.
309+
return _child_readers.find(_read_column_names.front())->second->get_def_level();
309310
}
310311
return _child_readers.begin()->second->get_def_level();
311312
}
312313

313314
Statistics statistics() override {
314315
Statistics st;
315-
for (const auto& reader : _child_readers) {
316-
// make sure the field is read
317-
if (_read_column_names.find(reader.first) != _read_column_names.end()) {
318-
Statistics cst = reader.second->statistics();
316+
for (const auto& column_name : _read_column_names) {
317+
auto reader = _child_readers.find(column_name);
318+
if (reader != _child_readers.end()) {
319+
Statistics cst = reader->second->statistics();
319320
st.merge(cst);
320321
}
321322
}
@@ -332,7 +333,7 @@ class StructColumnReader : public ParquetColumnReader {
332333

333334
private:
334335
std::unordered_map<std::string, std::unique_ptr<ParquetColumnReader>> _child_readers;
335-
std::set<std::string> _read_column_names;
336+
std::vector<std::string> _read_column_names;
336337
};
337338

338339
}; // namespace doris::vectorized

0 commit comments

Comments
 (0)