Skip to content

Commit b479d93

Browse files
committed
support partial update
2 parents 49a0ea9 + 1eb23de commit b479d93

File tree

34 files changed

+1256
-15
lines changed

34 files changed

+1256
-15
lines changed

.palantir/revapi.yml

+10
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,16 @@ acceptedBreaks:
6565
- code: "java.method.removed"
6666
old: "method void org.apache.iceberg.io.DataWriter<T>::add(T)"
6767
justification: "Removing deprecated method"
68+
"1.1.0":
69+
org.apache.iceberg:iceberg-api:
70+
- code: "java.method.addedToInterface"
71+
new: "method java.util.List<java.lang.Integer> org.apache.iceberg.ContentFile<F>::partialFieldIds()"
72+
justification: "Adding ContentFile.partialFieldIds() to support partial updates"
73+
org.apache.iceberg:iceberg-data:
74+
- code: "java.method.abstractMethodAdded"
75+
new: "method T org.apache.iceberg.data.DeleteFilter<T>::combineRecord(T, org.apache.iceberg.StructLike,\
76+
\ org.apache.iceberg.Schema, org.apache.iceberg.Schema)"
77+
justification: "Adding abstract DeleteFilter.combineRecord(...) to support partial updates"
6878
apache-iceberg-0.14.0:
6979
org.apache.iceberg:iceberg-api:
7080
- code: "java.class.defaultSerializationChanged"

api/src/main/java/org/apache/iceberg/ContentFile.java

+2
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,8 @@ public interface ContentFile<F> {
104104
*/
105105
List<Integer> equalityFieldIds();
106106

107+
/**
 * Returns the partial field IDs recorded for this file (the "partial_ids" manifest field).
 *
 * <p>NOTE(review): the exact meaning of these IDs for partial-update files is not documented in
 * this change; the field doc only says "partial comparison field IDs" - confirm and expand.
 */
List<Integer> partialFieldIds();
108+
107109
/**
108110
* Returns the sort order id of this file, which describes how the file is ordered. This
109111
* information will be useful for merging data and equality delete files more efficiently when

api/src/main/java/org/apache/iceberg/DataFile.java

+15-2
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,14 @@ public interface DataFile extends ContentFile<DataFile> {
102102
int PARTITION_ID = 102;
103103
String PARTITION_NAME = "partition";
104104
String PARTITION_DOC = "Partition data tuple, schema based on the partition spec";
105-
// NEXT ID TO ASSIGN: 142
105+
106+
// Optional field with ID 142 named "partial_ids": a list of required ints whose element ID is
// 143. These two IDs are why the "next ID to assign" marker moves from 142 to 144.
Types.NestedField PARTIAL_IDS =
107+
optional(
108+
142,
109+
"partial_ids",
110+
ListType.ofRequired(143, IntegerType.get()),
111+
"partial comparison field IDs");
112+
// NEXT ID TO ASSIGN: 144
106113

107114
static StructType getType(StructType partitionType) {
108115
// IDs start at 100 to leave room for changes to ManifestEntry
@@ -123,7 +130,8 @@ static StructType getType(StructType partitionType) {
123130
KEY_METADATA,
124131
SPLIT_OFFSETS,
125132
EQUALITY_IDS,
126-
SORT_ORDER_ID);
133+
SORT_ORDER_ID,
134+
PARTIAL_IDS);
127135
}
128136

129137
/** @return the content stored in the file; one of DATA, POSITION_DELETES, or EQUALITY_DELETES */
@@ -136,4 +144,9 @@ default FileContent content() {
136144
default List<Integer> equalityFieldIds() {
137145
return null;
138146
}
147+
148+
/**
 * Default implementation for data files: always returns {@code null}, the same default this
 * interface uses for {@code equalityFieldIds()}.
 */
@Override
149+
default List<Integer> partialFieldIds() {
150+
return null;
151+
}
139152
}

api/src/main/java/org/apache/iceberg/FileContent.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@
2222
public enum FileContent {
2323
DATA(0),
2424
POSITION_DELETES(1),
25-
EQUALITY_DELETES(2);
25+
EQUALITY_DELETES(2),
26+
PARTIAL_UPDATE(3);
2627

2728
private final int id;
2829

core/src/main/java/org/apache/iceberg/BaseFile.java

+18
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ public PartitionData copy() {
7373
private Map<Integer, ByteBuffer> upperBounds = null;
7474
private long[] splitOffsets = null;
7575
private int[] equalityIds = null;
76+
private int[] partialIds = null;
7677
private byte[] keyMetadata = null;
7778
private Integer sortOrderId;
7879

@@ -132,6 +133,7 @@ public PartitionData copy() {
132133
Map<Integer, ByteBuffer> upperBounds,
133134
List<Long> splitOffsets,
134135
int[] equalityFieldIds,
136+
int[] partialFieldIds,
135137
Integer sortOrderId,
136138
ByteBuffer keyMetadata) {
137139
this.partitionSpecId = specId;
@@ -159,6 +161,7 @@ public PartitionData copy() {
159161
this.upperBounds = SerializableByteBufferMap.wrap(upperBounds);
160162
this.splitOffsets = ArrayUtil.toLongArray(splitOffsets);
161163
this.equalityIds = equalityFieldIds;
164+
this.partialIds = partialFieldIds;
162165
this.sortOrderId = sortOrderId;
163166
this.keyMetadata = ByteBuffers.toByteArray(keyMetadata);
164167
}
@@ -207,6 +210,10 @@ public PartitionData copy() {
207210
toCopy.equalityIds != null
208211
? Arrays.copyOf(toCopy.equalityIds, toCopy.equalityIds.length)
209212
: null;
213+
this.partialIds =
214+
toCopy.partialIds != null
215+
? Arrays.copyOf(toCopy.partialIds, toCopy.partialIds.length)
216+
: null;
210217
this.sortOrderId = toCopy.sortOrderId;
211218
}
212219

@@ -294,6 +301,9 @@ public void put(int i, Object value) {
294301
this.sortOrderId = (Integer) value;
295302
return;
296303
case 17:
304+
this.partialIds = ArrayUtil.toIntArray((List<Integer>) value);
305+
return;
306+
case 18:
297307
this.fileOrdinal = (long) value;
298308
return;
299309
default:
@@ -349,6 +359,8 @@ public Object get(int i) {
349359
case 16:
350360
return sortOrderId;
351361
case 17:
362+
return partialFieldIds();
363+
case 18:
352364
return fileOrdinal;
353365
default:
354366
throw new UnsupportedOperationException("Unknown field ordinal: " + pos);
@@ -445,6 +457,11 @@ public List<Integer> equalityFieldIds() {
445457
return ArrayUtil.toIntList(equalityIds);
446458
}
447459

460+
/**
 * Returns the partial field IDs stored in the {@code partialIds} int array, converted to a list
 * by {@code ArrayUtil.toIntList}.
 */
@Override
461+
public List<Integer> partialFieldIds() {
462+
return ArrayUtil.toIntList(partialIds);
463+
}
464+
448465
@Override
449466
public Integer sortOrderId() {
450467
return sortOrderId;
@@ -478,6 +495,7 @@ public String toString() {
478495
.add("split_offsets", splitOffsets == null ? "null" : splitOffsets())
479496
.add("equality_ids", equalityIds == null ? "null" : equalityFieldIds())
480497
.add("sort_order_id", sortOrderId)
498+
.add("partial_ids", partialIds == null ? "null" : partialFieldIds())
481499
.toString();
482500
}
483501
}

core/src/main/java/org/apache/iceberg/DeleteFileIndex.java

+47
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,37 @@ private static boolean canContainDeletesForFile(
156156

157157
case EQUALITY_DELETES:
158158
return canContainEqDeletesForFile(dataFile, deleteFile, schema);
159+
160+
case PARTIAL_UPDATE:
161+
return canContainPartialDeletesForFile(dataFile, deleteFile, schema);
162+
}
163+
164+
return true;
165+
}
166+
167+
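// TODO: placeholder implementation - the body below only prunes by the delete file's file_path
// lower/upper bounds; replace with pruning logic specific to partial-update files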
168+
private static boolean canContainPartialDeletesForFile(
169+
DataFile dataFile, DeleteFile deleteFile, Schema schema) {
170+
// check that the delete file can contain the data file's file_path
171+
Map<Integer, ByteBuffer> lowers = deleteFile.lowerBounds();
172+
Map<Integer, ByteBuffer> uppers = deleteFile.upperBounds();
173+
if (lowers == null || uppers == null) {
174+
return true;
175+
}
176+
177+
Type pathType = MetadataColumns.DELETE_FILE_PATH.type();
178+
int pathId = MetadataColumns.DELETE_FILE_PATH.fieldId();
179+
Comparator<CharSequence> comparator = Comparators.charSequences();
180+
ByteBuffer lower = lowers.get(pathId);
181+
if (lower != null
182+
&& comparator.compare(dataFile.path(), Conversions.fromByteBuffer(pathType, lower)) < 0) {
183+
return false;
184+
}
185+
186+
ByteBuffer upper = uppers.get(pathId);
187+
if (upper != null
188+
&& comparator.compare(dataFile.path(), Conversions.fromByteBuffer(pathType, upper)) > 0) {
189+
return false;
159190
}
160191

161192
return true;
@@ -474,6 +505,22 @@ DeleteFileIndex build() {
474505
globalApplySeqs = eqFilesSortedBySeq.stream().mapToLong(Pair::first).toArray();
475506
globalDeletes = eqFilesSortedBySeq.stream().map(Pair::second).toArray(DeleteFile[]::new);
476507

508+
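// FIXME: when any PARTIAL_UPDATE files exist, the assignments below overwrite the
// globalApplySeqs/globalDeletes just built from the equality deletes above, so those equality
// deletes are dropped; the two sets need to be merged or kept in separate arrays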
509+
List<Pair<Long, DeleteFile>> partialDeleteSortedBySeq =
510+
deleteFilesByPartition.get(partition).stream()
511+
.filter(entry -> entry.file().content() == FileContent.PARTIAL_UPDATE)
512+
.map(
513+
entry ->
514+
// a delete file is indexed by the sequence number it should be applied to
515+
Pair.of(entry.dataSequenceNumber(), entry.file()))
516+
.sorted(Comparator.comparingLong(Pair::first))
517+
.collect(Collectors.toList());
518+
if (partialDeleteSortedBySeq.size() > 0) {
519+
globalApplySeqs = partialDeleteSortedBySeq.stream().mapToLong(Pair::first).toArray();
520+
globalDeletes =
521+
partialDeleteSortedBySeq.stream().map(Pair::second).toArray(DeleteFile[]::new);
522+
}
523+
477524
List<Pair<Long, DeleteFile>> posFilesSortedBySeq =
478525
deleteFilesByPartition.get(partition).stream()
479526
.filter(entry -> entry.file().content() == FileContent.POSITION_DELETES)

core/src/main/java/org/apache/iceberg/FileMetadata.java

+11
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ public static class Builder {
4141
private final int specId;
4242
private FileContent content = null;
4343
private int[] equalityFieldIds = null;
44+
private int[] partialFieldIds = null;
4445
private PartitionData partitionData;
4546
private String filePath = null;
4647
private FileFormat format = null;
@@ -116,6 +117,13 @@ public Builder ofEqualityDeletes(int... fieldIds) {
116117
return this;
117118
}
118119

120+
/**
 * Configures this builder to produce a file whose content is {@code FileContent.PARTIAL_UPDATE}.
 *
 * <p>Both arrays are stored by reference; no defensive copy is made here.
 *
 * @param newEqualityFieldIds value assigned to the builder's equality field IDs
 * @param newPartialFieldIds value assigned to the builder's partial field IDs
 * @return this builder, for chaining
 */
public Builder ofPartialDeletes(int[] newEqualityFieldIds, int[] newPartialFieldIds) {
121+
this.content = FileContent.PARTIAL_UPDATE;
122+
this.equalityFieldIds = newEqualityFieldIds;
123+
this.partialFieldIds = newPartialFieldIds;
124+
return this;
125+
}
126+
119127
public Builder withStatus(FileStatus stat) {
120128
this.filePath = stat.getPath().toString();
121129
this.fileSizeInBytes = stat.getLen();
@@ -222,6 +230,8 @@ public DeleteFile build() {
222230
sortOrderId == null, "Position delete file should not have sort order");
223231
break;
224232
case EQUALITY_DELETES:
233+
234+
case PARTIAL_UPDATE:
225235
if (sortOrderId == null) {
226236
sortOrderId = SortOrder.unsorted().orderId();
227237
}
@@ -246,6 +256,7 @@ public DeleteFile build() {
246256
lowerBounds,
247257
upperBounds),
248258
equalityFieldIds,
259+
partialFieldIds,
249260
sortOrderId,
250261
keyMetadata);
251262
}

core/src/main/java/org/apache/iceberg/GenericDataFile.java

+1
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ class GenericDataFile extends BaseFile<DataFile> implements DataFile {
5757
metrics.upperBounds(),
5858
splitOffsets,
5959
null,
60+
null,
6061
sortOrderId,
6162
keyMetadata);
6263
}

core/src/main/java/org/apache/iceberg/GenericDeleteFile.java

+2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ class GenericDeleteFile extends BaseFile<DeleteFile> implements DeleteFile {
3939
long fileSizeInBytes,
4040
Metrics metrics,
4141
int[] equalityFieldIds,
42+
int[] partialFieldIds,
4243
Integer sortOrderId,
4344
ByteBuffer keyMetadata) {
4445
super(
@@ -57,6 +58,7 @@ class GenericDeleteFile extends BaseFile<DeleteFile> implements DeleteFile {
5758
metrics.upperBounds(),
5859
null,
5960
equalityFieldIds,
61+
partialFieldIds,
6062
sortOrderId,
6163
keyMetadata);
6264
}

core/src/main/java/org/apache/iceberg/SnapshotSummary.java

+8
Original file line numberDiff line numberDiff line change
@@ -221,12 +221,14 @@ private static class UpdateMetrics {
221221
private int addedPosDeleteFiles = 0;
222222
private int removedPosDeleteFiles = 0;
223223
private int addedDeleteFiles = 0;
224+
private int addedPartialFiles = 0;
224225
private int removedDeleteFiles = 0;
225226
private long addedRecords = 0L;
226227
private long deletedRecords = 0L;
227228
private long addedPosDeletes = 0L;
228229
private long removedPosDeletes = 0L;
229230
private long addedEqDeletes = 0L;
231+
private long addedPartialUpdates = 0L;
230232
private long removedEqDeletes = 0L;
231233
private boolean trustSizeAndDeleteCounts = true;
232234

@@ -290,6 +292,12 @@ void addedFile(ContentFile<?> file) {
290292
this.addedEqDeleteFiles += 1;
291293
this.addedEqDeletes += file.recordCount();
292294
break;
295+
case PARTIAL_UPDATE:
296+
this.addedDeleteFiles += 1;
297+
this.addedPartialFiles += 1;
298+
this.addedPartialUpdates += file.recordCount();
299+
break;
300+
293301
default:
294302
throw new UnsupportedOperationException(
295303
"Unsupported file content type: " + file.content());

core/src/main/java/org/apache/iceberg/V2Metadata.java

+9-1
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,8 @@ static Types.StructType fileType(Types.StructType partitionType) {
272272
DataFile.KEY_METADATA,
273273
DataFile.SPLIT_OFFSETS,
274274
DataFile.EQUALITY_IDS,
275-
DataFile.SORT_ORDER_ID);
275+
DataFile.SORT_ORDER_ID,
276+
DataFile.PARTIAL_IDS);
276277
}
277278

278279
static class IndexedManifestEntry<F extends ContentFile<F>>
@@ -456,6 +457,8 @@ public Object get(int pos) {
456457
return wrapped.equalityFieldIds();
457458
case 15:
458459
return wrapped.sortOrderId();
460+
case 16:
461+
return wrapped.partialFieldIds();
459462
}
460463
throw new IllegalArgumentException("Unknown field ordinal: " + pos);
461464
}
@@ -550,6 +553,11 @@ public List<Integer> equalityFieldIds() {
550553
return wrapped.equalityFieldIds();
551554
}
552555

556+
/** Delegates to the wrapped file's {@code partialFieldIds()} and returns its result unchanged. */
@Override
557+
public List<Integer> partialFieldIds() {
558+
return wrapped.partialFieldIds();
559+
}
560+
553561
@Override
554562
public Integer sortOrderId() {
555563
return wrapped.sortOrderId();

0 commit comments

Comments
 (0)