Skip to content

[Kernel]Remove two file IO in CRC loading by reusing log listing #4112

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 64 commits into from
May 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
64 commits
Select commit Hold shift + click to select a range
dd75310
remove additional listing
huan233usc Feb 1, 2025
866c4f7
fix case log/checkpoint get deleted in iterator
huan233usc Feb 1, 2025
3dcd3c9
fix bug of case
huan233usc Feb 1, 2025
b0b3b45
merge conflict
huan233usc Feb 12, 2025
b9df3f1
fix test
huan233usc Feb 12, 2025
5823109
fix test
huan233usc Feb 12, 2025
9c13890
Merge branch 'master' into remove-listing
huan233usc Feb 13, 2025
96c4ff6
flat map
huan233usc Feb 13, 2025
aef6f9b
rename some vars
huan233usc Feb 14, 2025
59327e0
resolve
huan233usc Feb 14, 2025
c70c436
merge
huan233usc Mar 24, 2025
18ad7f9
scala
huan233usc Mar 27, 2025
3a64ab8
fix
huan233usc Mar 27, 2025
d6bb0ae
fix
huan233usc Mar 27, 2025
ae5dad7
fix
huan233usc Mar 27, 2025
13dc17c
fix
huan233usc Mar 27, 2025
ab2eabb
fix
huan233usc Mar 27, 2025
a7b9ae8
fix
huan233usc Mar 27, 2025
a1893c2
fix
huan233usc Mar 27, 2025
a421655
fix
huan233usc Mar 27, 2025
1ceda70
fix
huan233usc Mar 27, 2025
403c762
fix
huan233usc Mar 27, 2025
db36277
merge
huan233usc Apr 17, 2025
6cf3387
fix
huan233usc Apr 18, 2025
bfb62e2
fix
huan233usc Apr 18, 2025
f746c33
fix
huan233usc Apr 18, 2025
cdc33a2
fix
huan233usc Apr 18, 2025
f2eab3c
fix
huan233usc Apr 18, 2025
58c0548
fix doc
huan233usc Apr 21, 2025
bcc45c0
fix java
huan233usc Apr 22, 2025
fb9cffb
merge
huan233usc Apr 23, 2025
f06b098
update tests
huan233usc Apr 23, 2025
bd374ca
add docs
huan233usc Apr 23, 2025
ae47dc6
address comments
huan233usc Apr 24, 2025
8d312e5
address comments
huan233usc Apr 24, 2025
1333013
fix comments
huan233usc Apr 24, 2025
96e3348
fix doc
huan233usc Apr 24, 2025
1c290d6
fix doc
huan233usc Apr 24, 2025
9062b9f
fix
huan233usc Apr 25, 2025
30c9219
fix
huan233usc Apr 25, 2025
f46492f
fix
huan233usc Apr 25, 2025
b78f849
address comments
huan233usc Apr 28, 2025
c7f17dd
fix doc
huan233usc Apr 28, 2025
e0e0752
fix doc
huan233usc Apr 28, 2025
3bb4b3f
private class
huan233usc Apr 28, 2025
f9e6568
merge
huan233usc Apr 28, 2025
7e199a4
merge
huan233usc Apr 28, 2025
90b69ac
doc
huan233usc Apr 28, 2025
fc19e9f
fix
huan233usc Apr 29, 2025
f217fd7
fix
huan233usc Apr 29, 2025
5234983
fix
huan233usc Apr 29, 2025
e482f4b
fix
huan233usc Apr 29, 2025
4f90d83
merge
huan233usc May 1, 2025
ba5e9a5
fix
huan233usc May 1, 2025
57ad1a4
fix
huan233usc May 1, 2025
2f4c231
merge
huan233usc May 1, 2025
3e1bdd7
fix
huan233usc May 1, 2025
ef92e34
fix
huan233usc May 1, 2025
4e2a011
fix comments
huan233usc May 1, 2025
f692b6a
fix typo
huan233usc May 1, 2025
ab53195
fmt
huan233usc May 1, 2025
580a744
fmt
huan233usc May 1, 2025
86f4211
fmt
huan233usc May 1, 2025
b2efd40
update test
huan233usc May 1, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -219,8 +219,10 @@ public static CloseableIterator<FileStatus> listDeltaLogFilesAsIter(
startVersion,
endVersionOpt);

// This variable is used to help determine if we should throw an error if the table history is
// not reconstructable. Only commit and checkpoint files are applicable.
// Must be final to be used in lambda
final AtomicBoolean hasReturnedAnElement = new AtomicBoolean(false);
final AtomicBoolean hasReturnedCommitOrCheckpoint = new AtomicBoolean(false);

return listLogDir(engine, tablePath, startVersion)
.breakableFilter(
Expand All @@ -237,6 +239,9 @@ public static CloseableIterator<FileStatus> listDeltaLogFilesAsIter(
// Checkpoint files of 0 size are invalid but may be ignored silently when read,
// hence we ignore them so that we never pick up such checkpoints.
// Here, we do nothing (we will consume this file).
} else if (fileTypes.contains(DeltaLogFileType.CHECKSUM)
&& FileNames.isChecksumFile(fileName)) {
// Here, we do nothing (we will consume this file).
} else {
logger.debug("Ignoring file {} as it is not of the desired type", fs.getPath());
return BreakableFilterResult.EXCLUDE; // Here, we exclude and filter out this file.
Expand Down Expand Up @@ -277,7 +282,7 @@ public static CloseableIterator<FileStatus> listDeltaLogFilesAsIter(
final long endVersion = endVersionOpt.get();

if (fileVersion > endVersion) {
if (mustBeRecreatable && !hasReturnedAnElement.get()) {
if (mustBeRecreatable && !hasReturnedCommitOrCheckpoint.get()) {
final long earliestVersion =
DeltaHistoryManager.getEarliestRecreatableCommit(engine, logPath);
throw DeltaErrors.versionBeforeFirstAvailableCommit(
Expand All @@ -292,7 +297,11 @@ public static CloseableIterator<FileStatus> listDeltaLogFilesAsIter(
}
}

hasReturnedAnElement.set(true);
if (FileNames.isCommitFile(fileName)
|| FileNames.isCheckpointFile(fileName)
|| FileNames.isLogCompactionFile(fileName)) {
hasReturnedCommitOrCheckpoint.set(true);
}

return BreakableFilterResult.INCLUDE;
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,13 @@

import static io.delta.kernel.internal.util.FileNames.*;
import static io.delta.kernel.internal.util.Utils.singletonCloseableIterator;
import static java.lang.Math.min;

import io.delta.kernel.data.ColumnarBatch;
import io.delta.kernel.engine.Engine;
import io.delta.kernel.internal.fs.Path;
import io.delta.kernel.internal.util.FileNames;
import io.delta.kernel.utils.CloseableIterator;
import io.delta.kernel.utils.FileStatus;
import java.io.IOException;
import java.util.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -35,89 +33,41 @@ public class ChecksumReader {
private static final Logger logger = LoggerFactory.getLogger(ChecksumReader.class);

/**
* Load the CRCInfo from the checksum file at the given version. If the checksum file is not found
* at the given version, it will try to find the latest checksum file that is created at or after
* the lower bound version.
* Load the CRCInfo from the provided checksum file.
*
* @param engine the engine to use for reading the checksum file
* @param logPath the path to the Delta log
* @param targetedVersion the target version to read the checksum file from
* @param lowerBound the inclusive lower bound version to search for the checksum file
* @return Optional {@link CRCInfo} containing the protocol and metadata, and the version of the
* checksum file. If the checksum file is not found, it will return an empty
* @param checkSumFile the file status of the checksum file to read
* @return Optional {@link CRCInfo} containing the information included in the checksum file, such
* as the protocol and metadata.
*/
public static Optional<CRCInfo> getCRCInfo(
Engine engine, Path logPath, long targetedVersion, long lowerBound) {
// lower bound should always smaller than the targetedVersion.
lowerBound = min(lowerBound, targetedVersion);
logger.info("Loading CRC file for version {} with lower bound {}", targetedVersion, lowerBound);
// First try to load the CRC at given version. If not found or failed to read then try to
// find the latest CRC file that is created at or after the lower bound version.
Path crcFilePath = checksumFile(logPath, targetedVersion);
Optional<CRCInfo> crcInfoOpt = readChecksumFile(engine, crcFilePath);
if (crcInfoOpt.isPresent()
||
// we don't expect any more checksum files as it is the first version
targetedVersion == 0
|| targetedVersion == lowerBound) {
return crcInfoOpt;
}
logger.info(
"CRC file for version {} not found, listing CRC files from version {}",
targetedVersion,
lowerBound);

Path lowerBoundFilePath = checksumFile(logPath, lowerBound);
try (CloseableIterator<FileStatus> crcFiles =
engine.getFileSystemClient().listFrom(lowerBoundFilePath.toString())) {
List<FileStatus> crcFilesList =
crcFiles
.filter(file -> isChecksumFile(file.getPath()))
.takeWhile(file -> checksumVersion(new Path(file.getPath())) <= targetedVersion)
.toInMemoryList();

// pick the last file which is the latest version that has the CRC file
if (crcFilesList.isEmpty()) {
logger.warn("No checksum files found in the range {} to {}", lowerBound, targetedVersion);
return Optional.empty();
}

FileStatus latestCRCFile = crcFilesList.get(crcFilesList.size() - 1);
return readChecksumFile(engine, new Path(latestCRCFile.getPath()));
} catch (IOException e) {
logger.warn("Failed to list checksum files from {}", lowerBoundFilePath, e);
return Optional.empty();
}
}

private static Optional<CRCInfo> readChecksumFile(Engine engine, Path filePath) {
public static Optional<CRCInfo> getCRCInfo(Engine engine, FileStatus checkSumFile) {
try (CloseableIterator<ColumnarBatch> iter =
engine
.getJsonHandler()
.readJsonFiles(
singletonCloseableIterator(FileStatus.of(filePath.toString())),
singletonCloseableIterator(checkSumFile),
CRCInfo.CRC_FILE_SCHEMA,
Optional.empty())) {
// We do this instead of iterating through the rows or using `getSingularRow` so we
// can use the existing fromColumnVector methods in Protocol, Metadata, Format etc
if (!iter.hasNext()) {
logger.warn("Checksum file is empty: {}", filePath);
logger.warn("Checksum file is empty: {}", checkSumFile.getPath());
return Optional.empty();
}

ColumnarBatch batch = iter.next();
if (batch.getSize() != 1) {
String msg = "Expected exactly one row in the checksum file {}, found {} rows";
logger.warn(msg, filePath, batch.getSize());
logger.warn(msg, checkSumFile.getPath(), batch.getSize());
return Optional.empty();
}

long crcVersion = FileNames.checksumVersion(filePath);
long crcVersion = FileNames.checksumVersion(new Path(checkSumFile.getPath()));

return CRCInfo.fromColumnarBatch(crcVersion, batch, 0 /* rowId */, filePath.toString());
return CRCInfo.fromColumnarBatch(crcVersion, batch, 0 /* rowId */, checkSumFile.getPath());
} catch (Exception e) {
// This can happen when the version does not have a checksum file
logger.warn("Failed to read checksum file {}", filePath, e);
logger.warn("Failed to read checksum file {}", checkSumFile.getPath(), e);
return Optional.empty();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ public void writeLogCompactionFile(Engine engine) throws IOException {
deltas,
Collections.emptyList(),
Collections.emptyList(),
Optional.empty(),
lastCommitTimestamp);
CreateCheckpointIterator checkpointIterator =
new CreateCheckpointIterator(engine, segment, minFileRetentionTimestampMillis);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import static io.delta.kernel.internal.DeltaErrors.wrapEngineExceptionThrowsIO;
import static io.delta.kernel.internal.TableConfig.IN_COMMIT_TIMESTAMPS_ENABLED;
import static io.delta.kernel.internal.actions.SingleAction.*;
import static io.delta.kernel.internal.util.FileNames.checksumFile;
import static io.delta.kernel.internal.util.FileNames.deltaFile;
import static io.delta.kernel.internal.util.Preconditions.checkArgument;
import static io.delta.kernel.internal.util.Preconditions.checkState;
Expand Down Expand Up @@ -175,7 +176,8 @@ public TransactionRebaseState resolveConflicts(Engine engine) throws ConcurrentW

Optional<CRCInfo> updatedCrcInfo =
ChecksumReader.getCRCInfo(
engine, snapshot.getLogPath(), lastWinningVersion, lastWinningVersion);
engine,
FileStatus.of(checksumFile(snapshot.getLogPath(), lastWinningVersion).toString()));

// if we get here, we have successfully rebased (i.e no logical conflicts)
// against the winning transactions
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@

import static io.delta.kernel.internal.replay.LogReplayUtils.assertLogFilesBelongToTable;
import static io.delta.kernel.internal.util.Preconditions.checkArgument;
import static java.util.Arrays.asList;
import static java.util.Collections.max;
import static java.util.Objects.requireNonNull;

import io.delta.kernel.data.ColumnVector;
import io.delta.kernel.data.ColumnarBatch;
Expand All @@ -38,6 +37,7 @@
import io.delta.kernel.internal.snapshot.SnapshotHint;
import io.delta.kernel.internal.tablefeatures.TableFeatures;
import io.delta.kernel.internal.util.DomainMetadataUtils;
import io.delta.kernel.internal.util.FileNames;
import io.delta.kernel.internal.util.Tuple2;
import io.delta.kernel.types.StringType;
import io.delta.kernel.types.StructType;
Expand Down Expand Up @@ -135,7 +135,7 @@ public static StructType getAddReadSchema(boolean shouldReadStats) {
private final LogSegment logSegment;
private final Tuple2<Protocol, Metadata> protocolAndMetadata;
private final Lazy<Map<String, DomainMetadata>> domainMetadataMap;
private final Optional<CRCInfo> currentCrcInfo;
private final CrcInfoContext crcInfoContext;

public LogReplay(
Path logPath,
Expand All @@ -147,17 +147,23 @@ public LogReplay(
SnapshotMetrics snapshotMetrics) {

assertLogFilesBelongToTable(logPath, logSegment.allLogFilesUnsorted());
Tuple2<Optional<SnapshotHint>, Optional<CRCInfo>> newerSnapshotHintAndCurrentCrcInfo =
maybeGetNewerSnapshotHintAndCurrentCrcInfo(
engine, logSegment, snapshotHint, snapshotVersion);
this.currentCrcInfo = newerSnapshotHintAndCurrentCrcInfo._2;

// Ignore the snapshot hint whose version is larger than the snapshot version.
if (snapshotHint.isPresent() && snapshotHint.get().getVersion() > snapshotVersion) {
snapshotHint = Optional.empty();
}

this.crcInfoContext = new CrcInfoContext(engine);
this.dataPath = dataPath;
this.logSegment = logSegment;
Optional<SnapshotHint> newerSnapshotHint =
crcInfoContext.maybeGetNewerSnapshotHintAndUpdateCache(
engine, logSegment, snapshotHint, snapshotVersion);
this.protocolAndMetadata =
snapshotMetrics.loadInitialDeltaActionsTimer.time(
() ->
loadTableProtocolAndMetadata(
engine, logSegment, newerSnapshotHintAndCurrentCrcInfo._1, snapshotVersion));
engine, logSegment, newerSnapshotHint, snapshotVersion));
// Lazy loading of domain metadata only when needed
this.domainMetadataMap = new Lazy<>(() -> loadDomainMetadataMap(engine));
}
Expand Down Expand Up @@ -186,9 +192,11 @@ public long getVersion() {
return logSegment.getVersion();
}

/** Returns the crc info for the current snapshot if the checksum file is read */
/** Returns the crc info for the current snapshot if it is cached */
public Optional<CRCInfo> getCurrentCrcInfo() {
return currentCrcInfo;
return crcInfoContext
.getLastSeenCrcInfo()
.filter(crcInfo -> crcInfo.getVersion() == getVersion());
}

/**
Expand Down Expand Up @@ -367,6 +375,7 @@ private Optional<Long> loadLatestTransactionVersion(Engine engine, String applic
*/
private Map<String, DomainMetadata> loadDomainMetadataMap(Engine engine) {
// First try to load from CRC info if available
Optional<CRCInfo> currentCrcInfo = getCurrentCrcInfo();
if (currentCrcInfo.isPresent() && currentCrcInfo.get().getDomainMetadata().isPresent()) {
return currentCrcInfo.get().getDomainMetadata().get().stream()
.collect(Collectors.toMap(DomainMetadata::getDomain, Function.identity()));
Expand Down Expand Up @@ -415,50 +424,87 @@ private Map<String, DomainMetadata> loadDomainMetadataMapFromLog(Engine engine)
}

/**
* Calculates the latest snapshot hint before or at the current snapshot version, returns the
* CRCInfo if checksum file at the current version is read
* Encapsulates CRC-related functionality and state for the LogReplay. This includes caching CRC
* info and extracting snapshot hints from CRC files.
*
* <p>This class uses {@code maybeGetNewerSnapshotHintAndUpdateCache} to calculate a {@code
* SnapshotHint} and also exposes a {@code getLastSeenCrcInfo} method. Their relationship is:
*
* <ul>
* <li>We want to find the latest {@code SnapshotHint} to use during log replay for Protocol and
* Metadata loading
* <li>If we are not provided a SnapshotHint for this version, or are provided a stale hint, we
* will try to read the latest seen (by file listing) CRC file (if it exists). If so, we
* read it, cache it, and create a newer hint.
* <li>Then, when {@code getLastSeenCrcInfo} is called, we either return the CRCInfo that was
* already read, parsed, and cached; or, if it was never cached (because the hint was
* sufficiently new), we read it, parse it, and cache it for the first time
* </ul>
*/
private Tuple2<Optional<SnapshotHint>, Optional<CRCInfo>>
maybeGetNewerSnapshotHintAndCurrentCrcInfo(
Engine engine,
LogSegment logSegment,
Optional<SnapshotHint> snapshotHint,
long snapshotVersion) {

// Snapshot hint's version is current.
if (snapshotHint.isPresent() && snapshotHint.get().getVersion() == snapshotVersion) {
return new Tuple2<>(snapshotHint, Optional.empty());
private class CrcInfoContext {
private final Engine engine;
private Optional<CRCInfo> cachedLastSeenCrcInfo;

CrcInfoContext(Engine engine) {
this.engine = requireNonNull(engine);
this.cachedLastSeenCrcInfo = Optional.empty();
}

// Ignore the snapshot hint whose version is larger.
if (snapshotHint.isPresent() && snapshotHint.get().getVersion() > snapshotVersion) {
snapshotHint = Optional.empty();
/**
 * Returns the CRC info read from the log segment's last seen checksum file, reading and caching
 * it on first use.
 *
 * <p>An empty result is not retained: if the log segment has no last seen checksum file, or
 * {@code ChecksumReader.getCRCInfo} yields nothing for it, the next call attempts the lookup
 * again.
 *
 * @return the cached or freshly read {@link CRCInfo}, or empty if none could be obtained
 */
public Optional<CRCInfo> getLastSeenCrcInfo() {
  // Fast path: a CRC has already been read and stored.
  if (cachedLastSeenCrcInfo.isPresent()) {
    return cachedLastSeenCrcInfo;
  }

  // Nothing stored yet: resolve the last seen checksum file (if any) and try to read it.
  cachedLastSeenCrcInfo =
      logSegment
          .getLastSeenChecksum()
          .flatMap(lastSeenFile -> ChecksumReader.getCRCInfo(engine, lastSeenFile));
  return cachedLastSeenCrcInfo;
}

long crcSearchLowerBound =
max(
asList(
// Prefer reading hint over CRC, so start listing from hint's version + 1,
// if hint is not present, list from version 0.
snapshotHint.map(SnapshotHint::getVersion).orElse(-1L) + 1,
logSegment.getCheckpointVersionOpt().orElse(0L),
// Only find the CRC within 100 versions.
snapshotVersion - 100,
0L));
Optional<CRCInfo> crcInfoOpt =
ChecksumReader.getCRCInfo(
engine, logSegment.getLogPath(), snapshotVersion, crcSearchLowerBound);
if (!crcInfoOpt.isPresent()) {
return new Tuple2<>(snapshotHint, Optional.empty());
/**
 * Tries to derive a snapshot hint from the log segment's last seen checksum file that is newer
 * than the supplied hint, storing the CRC info in the internal cache whenever one is read.
 *
 * <p>The supplied hint is returned unchanged when it is already at {@code snapshotVersion}, when
 * there is no last seen checksum file, when that file's version is not newer than the hint, or
 * when no CRC info could be obtained from the file.
 *
 * @param engine The engine used to read CRC files
 * @param logSegment The log segment whose last seen checksum file is consulted
 * @param snapshotHint Existing snapshot hint, if any
 * @param snapshotVersion Target snapshot version
 * @return A hint built from the CRC info if one newer than {@code snapshotHint} was read,
 *     otherwise the original hint
 */
public Optional<SnapshotHint> maybeGetNewerSnapshotHintAndUpdateCache(
    Engine engine,
    LogSegment logSegment,
    Optional<SnapshotHint> snapshotHint,
    long snapshotVersion) {

  // A hint that is already at the target version can be used as-is for loading P&M, so the CRC
  // read is skipped entirely.
  final boolean hintIsCurrent =
      snapshotHint.filter(hint -> hint.getVersion() == snapshotVersion).isPresent();
  if (hintIsCurrent) {
    return snapshotHint;
  }

  // Using the hint saves one IO compared to reading a CRC, so a CRC is only worth reading when
  // its version is strictly greater than the hint's (any version >= 0 when there is no hint).
  final long minCrcVersion = snapshotHint.map(SnapshotHint::getVersion).orElse(-1L) + 1;

  final Optional<CRCInfo> freshCrc =
      logSegment
          .getLastSeenChecksum()
          .filter(
              crcFile -> FileNames.getFileVersion(new Path(crcFile.getPath())) >= minCrcVersion)
          .flatMap(crcFile -> ChecksumReader.getCRCInfo(engine, crcFile));

  if (freshCrc.isPresent()) {
    final CRCInfo crcInfo = freshCrc.get();
    // Remember what was read so later CRC lookups do not have to hit storage again.
    this.cachedLastSeenCrcInfo = Optional.of(crcInfo);
    // The CRC must be newer than the hint and no newer than the version being loaded.
    checkArgument(
        crcInfo.getVersion() >= minCrcVersion && crcInfo.getVersion() <= snapshotVersion);
    return Optional.of(SnapshotHint.fromCrcInfo(crcInfo));
  }

  // No usable CRC: fall back to whatever hint the caller supplied.
  return snapshotHint;
}
CRCInfo crcInfo = crcInfoOpt.get();
checkArgument(
crcInfo.getVersion() >= crcSearchLowerBound && crcInfo.getVersion() <= snapshotVersion);
// We found a CRCInfo of a version (a) older than the one we are looking for (snapshotVersion)
// but (b) newer than the current hint. Use this CRCInfo to create a new hint, and return this
// crc info if it matches the current version.
return new Tuple2<>(
Optional.of(SnapshotHint.fromCrcInfo(crcInfo)),
crcInfo.getVersion() == snapshotVersion ? crcInfoOpt : Optional.empty());
}
}
Loading
Loading