From be21e62c09e3e5c96dd4138e2dba6f64c1f20592 Mon Sep 17 00:00:00 2001 From: daidai Date: Tue, 11 Feb 2025 11:56:58 +0800 Subject: [PATCH] [fix](parquet)Fix data column and null map column not equal when reading Parquet complex type cross-page data --- .../format/parquet/vparquet_column_reader.cpp | 23 ++++++++++++++++++- .../format/parquet/vparquet_column_reader.h | 15 ++++++------ 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp index ade98ada48bd1f..c36fda023a2c28 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp @@ -329,6 +329,18 @@ Status ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataType // just read the remaining values of the last row in previous page, // so there's no a new row should be read. batch_size = 0; + /* + * Since the function is repeatedly called to fetch data for the batch size, + * it causes `_rep_levels.resize(0); _def_levels.resize(0);`, resulting in the + * definition and repetition levels of the reader only containing the latter + * part of the batch (i.e., missing some parts). Therefore, when using the + * definition and repetition levels to fill the null_map for structs and maps, + * the function should not be called multiple times before filling. + * todo: + * We may need to consider reading the entire batch of data at once, as this approach + * would be more user-friendly in terms of function usage. However, we must consider that if the + * data spans multiple pages, memory usage may increase significantly. + */ } else { _rep_levels.resize(0); _def_levels.resize(0); @@ -835,7 +847,7 @@ Status StructColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr continue; } - _read_column_names.insert(doris_name); + _read_column_names.emplace_back(doris_name); // select_vector.reset(); size_t field_rows = 0; @@ -847,6 +859,15 @@ Status StructColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr is_dict_filter)); *read_rows = field_rows; *eof = field_eof; + /* + * Considering the issue in the `_read_nested_column` function where data may span across pages, leading + * to missing definition and repetition levels, when filling the null_map of the struct later, it is + * crucial to use the definition and repetition levels from the first read column + * (since `_read_nested_column` is not called repeatedly). + * + * It is worth mentioning that, theoretically, any sub-column can be chosen to fill the null_map, + * and selecting the shortest one will offer better performance + */ } else { while (field_rows < *read_rows && !field_eof) { size_t loop_rows = 0; diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.h b/be/src/vec/exec/format/parquet/vparquet_column_reader.h index 5ced83a498e258..ec466fd31f0119 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.h @@ -298,24 +298,25 @@ class StructColumnReader : public ParquetColumnReader { if (!_read_column_names.empty()) { // can't use _child_readers[*_read_column_names.begin()] // because the operator[] of std::unordered_map is not const :( - return _child_readers.find(*_read_column_names.begin())->second->get_rep_level(); + return _child_readers.find(_read_column_names.front())->second->get_rep_level(); } return _child_readers.begin()->second->get_rep_level(); } const std::vector& get_def_level() const override { if (!_read_column_names.empty()) { - return _child_readers.find(*_read_column_names.begin())->second->get_def_level(); + //_read_nested_column + return _child_readers.find(_read_column_names.front())->second->get_def_level(); } return _child_readers.begin()->second->get_def_level(); } Statistics statistics() override { Statistics st; - for (const auto& reader : _child_readers) { - // make sure the field is read - if (_read_column_names.find(reader.first) != _read_column_names.end()) { - Statistics cst = reader.second->statistics(); + for (const auto& column_name : _read_column_names) { + auto reader = _child_readers.find(column_name); + if (reader != _child_readers.end()) { + Statistics cst = reader->second->statistics(); st.merge(cst); } } @@ -332,7 +333,7 @@ class StructColumnReader : public ParquetColumnReader { private: std::unordered_map> _child_readers; - std::set _read_column_names; + std::vector _read_column_names; }; }; // namespace doris::vectorized