Skip to content

Commit

Permalink
[fix](parquet)Fix data column and null map column not equal when read…
Browse files Browse the repository at this point in the history
…ing Parquet complex type cross-page data
  • Loading branch information
hubgeter committed Feb 11, 2025
1 parent 51a6b14 commit be21e62
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 8 deletions.
23 changes: 22 additions & 1 deletion be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,18 @@ Status ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataType
// just read the remaining values of the last row in previous page,
// so there is no new row to read.
batch_size = 0;
/*
* This function is called repeatedly until the requested batch size has been fetched,
* and those calls can execute `_rep_levels.resize(0); _def_levels.resize(0);`. As a
* result, the reader's definition and repetition levels may contain only the latter
* part of the batch (i.e., the earlier parts are missing). Therefore, when the
* definition and repetition levels are used to fill the null_map for structs and maps,
* this function should not be called multiple times before the null_map is filled.
* TODO:
* We may need to consider reading the entire batch of data at once, as this approach
* would be more user-friendly in terms of function usage. However, we must consider that if the
* data spans multiple pages, memory usage may increase significantly.
*/
} else {
_rep_levels.resize(0);
_def_levels.resize(0);
Expand Down Expand Up @@ -835,7 +847,7 @@ Status StructColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr
continue;
}

_read_column_names.insert(doris_name);
_read_column_names.emplace_back(doris_name);

// select_vector.reset();
size_t field_rows = 0;
Expand All @@ -847,6 +859,15 @@ Status StructColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr
is_dict_filter));
*read_rows = field_rows;
*eof = field_eof;
/*
* In `_read_nested_column`, data may span multiple pages, which can leave the reader with
* incomplete definition and repetition levels. When the null_map of the struct is filled
* later, it is therefore crucial to use the definition and repetition levels from the
* first column that was read (since `_read_nested_column` is not called repeatedly for it).
*
* It is worth mentioning that, theoretically, any sub-column can be chosen to fill the null_map,
* and selecting the shortest one will offer better performance.
*/
} else {
while (field_rows < *read_rows && !field_eof) {
size_t loop_rows = 0;
Expand Down
15 changes: 8 additions & 7 deletions be/src/vec/exec/format/parquet/vparquet_column_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -298,24 +298,25 @@ class StructColumnReader : public ParquetColumnReader {
if (!_read_column_names.empty()) {
// can't use _child_readers[*_read_column_names.begin()]
// because the operator[] of std::unordered_map is not const :(
return _child_readers.find(*_read_column_names.begin())->second->get_rep_level();
return _child_readers.find(_read_column_names.front())->second->get_rep_level();
}
return _child_readers.begin()->second->get_rep_level();
}

const std::vector<level_t>& get_def_level() const override {
if (!_read_column_names.empty()) {
return _child_readers.find(*_read_column_names.begin())->second->get_def_level();
//_read_nested_column
return _child_readers.find(_read_column_names.front())->second->get_def_level();
}
return _child_readers.begin()->second->get_def_level();
}

Statistics statistics() override {
Statistics st;
for (const auto& reader : _child_readers) {
// make sure the field is read
if (_read_column_names.find(reader.first) != _read_column_names.end()) {
Statistics cst = reader.second->statistics();
for (const auto& column_name : _read_column_names) {
auto reader = _child_readers.find(column_name);
if (reader != _child_readers.end()) {
Statistics cst = reader->second->statistics();
st.merge(cst);
}
}
Expand All @@ -332,7 +333,7 @@ class StructColumnReader : public ParquetColumnReader {

private:
std::unordered_map<std::string, std::unique_ptr<ParquetColumnReader>> _child_readers;
std::set<std::string> _read_column_names;
std::vector<std::string> _read_column_names;
};

}; // namespace doris::vectorized

0 comments on commit be21e62

Please sign in to comment.