@@ -329,6 +329,18 @@ Status ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataType
         // just read the remaining values of the last row in previous page,
         // so there's no new row to be read.
         batch_size = 0;
+        /*
+         * Since this function is called repeatedly to fetch data for the batch size,
+         * each call executes `_rep_levels.resize(0); _def_levels.resize(0);`, so the
+         * definition and repetition levels held by the reader only contain the latter
+         * part of the batch (i.e., some parts are missing). Therefore, when using the
+         * definition and repetition levels to fill the null_map for structs and maps,
+         * the function should not be called multiple times before filling.
+         * TODO:
+         * We may need to consider reading the entire batch of data at once, as this
+         * would be more user-friendly in terms of function usage. However, if the
+         * data spans multiple pages, memory usage may increase significantly.
+         */
     } else {
         _rep_levels.resize(0);
         _def_levels.resize(0);
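
To illustrate the behaviour the new comment warns about, here is a minimal, self-contained C++ sketch (the `ToyNestedReader` type is hypothetical, not the actual `ScalarColumnReader`): because the level buffers are cleared at the start of every call, only the levels produced by the most recent call survive, so a caller that splits one batch across several calls and only then fills the null_map sees an incomplete level sequence.

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical stand-in that only mimics the level-buffer reset.
struct ToyNestedReader {
    std::vector<int16_t> def_levels;

    // Each call clears the buffer, just like `_rep_levels.resize(0);
    // _def_levels.resize(0);` at the start of `_read_nested_column`.
    void read_chunk(size_t num_values, int16_t level) {
        def_levels.resize(0);
        for (size_t i = 0; i < num_values; ++i) {
            def_levels.push_back(level);
        }
    }
};

int main() {
    ToyNestedReader reader;
    reader.read_chunk(3, /*level=*/1); // first part of a 5-value batch
    reader.read_chunk(2, /*level=*/0); // remainder of the same batch
    // Only the levels of the second call survive: 2 instead of 5, so a
    // null_map filled at this point would be built from incomplete levels.
    std::cout << "levels kept: " << reader.def_levels.size() << "\n";
    return 0;
}
```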
@@ -835,7 +847,7 @@ Status StructColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr
            continue;
        }

-       _read_column_names.insert(doris_name);
+       _read_column_names.emplace_back(doris_name);

        // select_vector.reset();
        size_t field_rows = 0;
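
The switch from `insert` to `emplace_back` suggests that `_read_column_names` is now an order-preserving sequence (presumably a `std::vector<std::string>`; the exact container type is not shown in this hunk) rather than a set, so its front element is the sub-column that was read first, which is what the comment in the next hunk relies on. A small sketch of the difference:

```cpp
#include <iostream>
#include <set>
#include <string>
#include <vector>

int main() {
    // A set orders its keys (here lexicographically) and forgets read order.
    std::set<std::string> as_set;
    as_set.insert("z_col"); // read first
    as_set.insert("a_col"); // read second
    std::cout << "set begin:    " << *as_set.begin() << "\n"; // prints a_col

    // A vector preserves insertion order, so front() is the first column read.
    std::vector<std::string> as_vector;
    as_vector.emplace_back("z_col"); // read first
    as_vector.emplace_back("a_col"); // read second
    std::cout << "vector front: " << as_vector.front() << "\n"; // prints z_col
    return 0;
}
```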
@@ -847,6 +859,15 @@ Status StructColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr
                    is_dict_filter));
            *read_rows = field_rows;
            *eof = field_eof;
+           /*
+            * Considering the issue in the `_read_nested_column` function where data may span across pages,
+            * leading to missing definition and repetition levels, when filling the null_map of the struct
+            * later, it is crucial to use the definition and repetition levels from the first read column
+            * (since `_read_nested_column` is not called repeatedly for it).
+            *
+            * It is worth mentioning that, theoretically, any sub-column can be chosen to fill the null_map,
+            * and selecting the shortest one will offer better performance.
+            */
        } else {
            while (field_rows < *read_rows && !field_eof) {
                size_t loop_rows = 0;
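
The comment above rests on the usual Dremel-style rule: a struct row is null exactly when a sub-column's definition level falls below the struct's own definition level, and every non-filtered sub-column encodes the same struct-level nulls, which is why any of them (ideally the cheapest) can be used. Below is a simplified sketch of that rule, assuming one definition level per output row (no repeated values); `fill_struct_null_map` and `struct_def_level` are illustrative names, not the actual Doris API.

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative helper: a row's struct is null when the definition level of
// any of its sub-columns is below the level at which the struct itself is
// fully defined. (Repetition levels are ignored here for simplicity.)
std::vector<uint8_t> fill_struct_null_map(const std::vector<int16_t>& def_levels,
                                          int16_t struct_def_level) {
    std::vector<uint8_t> null_map(def_levels.size(), 0);
    for (size_t i = 0; i < def_levels.size(); ++i) {
        null_map[i] = def_levels[i] < struct_def_level ? 1 : 0;
    }
    return null_map;
}

int main() {
    // Definition levels taken from one already-read sub-column of the struct.
    // Level 2: sub-column value present, 1: sub-column null but struct present,
    // 0: the struct itself is null.
    std::vector<int16_t> def_levels = {2, 1, 0, 2};
    for (uint8_t is_null : fill_struct_null_map(def_levels, /*struct_def_level=*/1)) {
        std::cout << int(is_null) << ' ';
    }
    std::cout << '\n'; // prints: 0 0 1 0
    return 0;
}
```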