[fix](parquet)Fix data column and null map column not equal when reading Parquet complex type cross-page data

hubgeter · hubgeter · commit be21e62c09e3 · 2025-02-11T14:18:40.000+08:00
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp
@@ -329,6 +329,18 @@ Status ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataType
         // just read the remaining values of the last row in previous page,
         // so there's no a new row should be read.
         batch_size = 0;
+        /*
+         * Since the function is repeatedly called to fetch data for the batch size,
+         * it causes `_rep_levels.resize(0); _def_levels.resize(0);`, resulting in the
+         * definition and repetition levels of the reader only containing the latter
+         * part of the batch (i.e., missing some parts). Therefore, when using the
+         * definition and repetition levels to fill the null_map for structs and maps,
+         * the function should not be called multiple times before filling.
+        * todo:
+         * We may need to consider reading the entire batch of data at once, as this approach
+         * would be more user-friendly in terms of function usage. However, we must consider that if the
+         * data spans multiple pages, memory usage may increase significantly.
+         */
     } else {
         _rep_levels.resize(0);
         _def_levels.resize(0);
@@ -835,7 +847,7 @@ Status StructColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr
             continue;
         }
 
-        _read_column_names.insert(doris_name);
+        _read_column_names.emplace_back(doris_name);
 
         //        select_vector.reset();
         size_t field_rows = 0;
@@ -847,6 +859,15 @@ Status StructColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr
                     is_dict_filter));
             *read_rows = field_rows;
             *eof = field_eof;
+            /*
+             * Considering the issue in the `_read_nested_column` function where data may span across pages, leading
+             * to missing definition and repetition levels, when filling the null_map of the struct later, it is
+             * crucial to use the definition and repetition levels from the first read column
+             * (since `_read_nested_column` is not called repeatedly).
+             *
+             *  It is worth mentioning that, theoretically, any sub-column can be chosen to fill the null_map,
+             *  and selecting the shortest one will offer better performance
+             */
         } else {
             while (field_rows < *read_rows && !field_eof) {
                 size_t loop_rows = 0;
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.h b/be/src/vec/exec/format/parquet/vparquet_column_reader.h
@@ -298,24 +298,25 @@ class StructColumnReader : public ParquetColumnReader {
         if (!_read_column_names.empty()) {
             // can't use _child_readers[*_read_column_names.begin()]
             // because the operator[] of std::unordered_map is not const :(
-            return _child_readers.find(*_read_column_names.begin())->second->get_rep_level();
+            return _child_readers.find(_read_column_names.front())->second->get_rep_level();
         }
         return _child_readers.begin()->second->get_rep_level();
     }
 
     const std::vector<level_t>& get_def_level() const override {
         if (!_read_column_names.empty()) {
-            return _child_readers.find(*_read_column_names.begin())->second->get_def_level();
+            //_read_nested_column
+            return _child_readers.find(_read_column_names.front())->second->get_def_level();
         }
         return _child_readers.begin()->second->get_def_level();
     }
 
     Statistics statistics() override {
         Statistics st;
-        for (const auto& reader : _child_readers) {
-            // make sure the field is read
-            if (_read_column_names.find(reader.first) != _read_column_names.end()) {
-                Statistics cst = reader.second->statistics();
+        for (const auto& column_name : _read_column_names) {
+            auto reader = _child_readers.find(column_name);
+            if (reader != _child_readers.end()) {
+                Statistics cst = reader->second->statistics();
                 st.merge(cst);
             }
         }
@@ -332,7 +333,7 @@ class StructColumnReader : public ParquetColumnReader {
 
 private:
     std::unordered_map<std::string, std::unique_ptr<ParquetColumnReader>> _child_readers;
-    std::set<std::string> _read_column_names;
+    std::vector<std::string> _read_column_names;
 };
 
 }; // namespace doris::vectorized