Skip to content

Commit 4b1902c

Browse files
parthchandra
authored and sudheeshkatkam committed
DRILL-5009: Skip reading of empty row groups while reading Parquet metadata
+ We will no longer attempt to scan such row groups. closes #651
1 parent e03507a commit 4b1902c

File tree

2 files changed

+26
-0
lines changed

2 files changed

+26
-0
lines changed

contrib/storage-hive/core/src/main/java/org/apache/drill/exec/store/hive/HiveDrillNativeScanBatchCreator.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,10 @@ public ScanBatch getBatch(FragmentContext context, HiveDrillNativeParquetSubScan
119119
final List<Integer> rowGroupNums = getRowGroupNumbersFromFileSplit(fileSplit, parquetMetadata);
120120

121121
for(int rowGroupNum : rowGroupNums) {
122+
//DRILL-5009 : Skip the row group if the row count is zero
123+
if (parquetMetadata.getBlocks().get(rowGroupNum).getRowCount() == 0) {
124+
continue;
125+
}
122126
// Drill has only ever written a single row group per file, only detect corruption
123127
// in the first row group
124128
ParquetReaderUtility.DateCorruptionStatus containsCorruptDates =

exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
import java.util.Arrays;
2323
import java.util.List;
2424
import java.util.Map;
25+
import java.util.Iterator;
26+
2527
import java.util.concurrent.ConcurrentHashMap;
2628
import java.util.concurrent.TimeUnit;
2729

@@ -64,10 +66,12 @@
6466
import com.fasterxml.jackson.core.JsonProcessingException;
6567
import com.fasterxml.jackson.databind.DeserializationContext;
6668
import com.fasterxml.jackson.databind.DeserializationFeature;
69+
import com.fasterxml.jackson.databind.JsonNode;
6770
import com.fasterxml.jackson.databind.JsonDeserializer;
6871
import com.fasterxml.jackson.databind.JsonSerializer;
6972
import com.fasterxml.jackson.databind.KeyDeserializer;
7073
import com.fasterxml.jackson.databind.ObjectMapper;
74+
import com.fasterxml.jackson.databind.deser.std.StdDeserializer;
7175
import com.fasterxml.jackson.databind.SerializerProvider;
7276
import com.fasterxml.jackson.databind.module.SimpleModule;
7377
import com.fasterxml.jackson.module.afterburner.AfterburnerModule;
@@ -437,6 +441,11 @@ private ParquetFileMetadata_v3 getParquetFileMetadata_v3(ParquetTableMetadata_v3
437441
length += col.getTotalSize();
438442
}
439443

444+
// DRILL-5009: Skip the RowGroup if it is empty
445+
// Note we still read the schema even if there are no values in the RowGroup
446+
if (rowGroup.getRowCount() == 0) {
447+
continue;
448+
}
440449
RowGroupMetadata_v3 rowGroupMeta =
441450
new RowGroupMetadata_v3(rowGroup.getStartingPos(), length, rowGroup.getRowCount(),
442451
getHostAffinity(file, rowGroup.getStartingPos(), length), columnMetadataList);
@@ -566,6 +575,19 @@ private void readBlockMeta(String path,
566575
(createMetaFilesRecursively(Path.getPathWithoutSchemeAndAuthority(p.getParent()).toString())).getLeft();
567576
newMetadata = true;
568577
}
578+
579+
// DRILL-5009: Remove the RowGroup if it is empty
580+
List<? extends ParquetFileMetadata> files = parquetTableMetadata.getFiles();
581+
for (ParquetFileMetadata file : files) {
582+
List<? extends RowGroupMetadata> rowGroups = file.getRowGroups();
583+
for (Iterator<? extends RowGroupMetadata> iter = rowGroups.iterator(); iter.hasNext(); ) {
584+
RowGroupMetadata r = iter.next();
585+
if (r.getRowCount() == 0) {
586+
iter.remove();
587+
}
588+
}
589+
}
590+
569591
}
570592

571593
if (newMetadata && metaContext != null) {

0 commit comments

Comments
 (0)