@@ -329,6 +329,18 @@ Status ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataType
         // just read the remaining values of the last row in previous page,
         // so no new row should be read.
         batch_size = 0;
+        /*
+         * This function is called repeatedly until the batch is filled. Each call
+         * executes `_rep_levels.resize(0); _def_levels.resize(0);`, so the reader's
+         * definition and repetition levels only hold the latter part of the batch
+         * (the earlier parts are lost). Therefore, when the definition and repetition
+         * levels are used to fill the null_map of structs and maps, this function
+         * must not be called multiple times before the null_map is filled.
+         * TODO:
+         * Consider reading the entire batch in a single call, which would make the
+         * function easier to use. However, if the data spans multiple pages, memory
+         * usage may increase significantly.
+         */
     } else {
         _rep_levels.resize(0);
         _def_levels.resize(0);
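
The comment added above describes the failure mode in prose. Below is a minimal, self-contained C++ sketch of that behaviour, not the Doris reader itself: `LevelBuffers` and `read_chunk` are hypothetical stand-ins that only mimic the `resize(0)` pattern, to show why only the levels of the last partial read survive.

```cpp
#include <cstdint>
#include <vector>

// Hypothetical stand-in for the reader's level buffers (not a Doris type).
struct LevelBuffers {
    std::vector<int16_t> rep_levels;
    std::vector<int16_t> def_levels;
};

// Mimics one partial read: previous levels are cleared, then refilled with
// only the levels of the current chunk.
void read_chunk(LevelBuffers& buf, const std::vector<int16_t>& chunk_def) {
    buf.rep_levels.resize(0);
    buf.def_levels.resize(0);
    buf.def_levels.insert(buf.def_levels.end(), chunk_def.begin(), chunk_def.end());
}

int main() {
    LevelBuffers buf;
    read_chunk(buf, {1, 1, 0}); // first part of the batch
    read_chunk(buf, {1});       // second call discards the first part
    // buf.def_levels now holds only {1}; a null_map filled from it would miss
    // the first three rows, which is why the caller must not invoke the nested
    // read repeatedly before filling the null_map of a struct or map.
    return buf.def_levels.size() == 1 ? 0 : 1;
}
```
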
@@ -835,7 +847,7 @@ Status StructColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr
             continue;
         }

-        _read_column_names.insert(doris_name);
+        _read_column_names.emplace_back(doris_name);

         // select_vector.reset();
         size_t field_rows = 0;
@@ -847,6 +859,15 @@ Status StructColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr
                     is_dict_filter));
             *read_rows = field_rows;
             *eof = field_eof;
+            /*
+             * Because `_read_nested_column` may lose definition and repetition levels
+             * when the data spans multiple pages, the null_map of the struct must later
+             * be filled using the definition and repetition levels of the first column
+             * that was read (for which `_read_nested_column` is not called repeatedly).
+             *
+             * In theory, any sub-column can be chosen to fill the null_map; selecting
+             * the shortest one offers the best performance.
+             */
         } else {
             while (field_rows < *read_rows && !field_eof) {
                 size_t loop_rows = 0;
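
To make the "fill the struct's null_map from one sub-column's levels" step concrete, here is a hedged sketch under the usual Parquet rule that a struct row is NULL when its definition level is below the struct's own definition level. `fill_struct_null_map` and `struct_def_level` are illustrative names, not the reader's actual interface; it assumes one definition level per row (no repeated fields inside the struct) and that a null_map byte of 1 marks NULL, as in Doris' nullable columns.

```cpp
#include <cstdint>
#include <vector>

// Illustrative only: derives a struct's null_map from one sub-column's
// definition levels. Every sub-column carries enough level information to
// decide whether the enclosing struct is defined, which is why the comment
// above notes that any sub-column (preferably the shortest) can be used.
std::vector<uint8_t> fill_struct_null_map(const std::vector<int16_t>& def_levels,
                                          int16_t struct_def_level) {
    std::vector<uint8_t> null_map;
    null_map.reserve(def_levels.size());
    for (int16_t level : def_levels) {
        // Assumed convention: 1 marks a NULL struct value, 0 marks a present one.
        null_map.push_back(level < struct_def_level ? 1 : 0);
    }
    return null_map;
}
```
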