Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cloud/filestore/config/filesystem.proto
Original file line number Diff line number Diff line change
Expand Up @@ -97,4 +97,7 @@ message TFileSystemConfig

// Enable FUSE_HANDLE_KILLPRIV_V2 on guest (FUSE client)
optional bool GuestHandleKillPrivV2Enabled = 29;

// Enable zero-copy for read operations in the VFS FUSE layer
optional bool ZeroCopyReadEnabled = 30;
}
3 changes: 3 additions & 0 deletions cloud/filestore/config/server.proto
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,9 @@ message TLocalServiceConfig

// Enable FUSE_HANDLE_KILLPRIV_V2 on guest (FUSE client)
optional bool GuestHandleKillPrivV2Enabled = 36;

// Enable zero-copy for read operations in the VFS FUSE layer
optional bool ZeroCopyReadEnabled = 37;
}

////////////////////////////////////////////////////////////////////////////////
Expand Down
3 changes: 3 additions & 0 deletions cloud/filestore/config/storage.proto
Original file line number Diff line number Diff line change
Expand Up @@ -656,6 +656,9 @@ message TStorageConfig
// prototxt.
optional bool UseBinaryFormatForTabletBootInfoBackup = 442;

// Enable zero-copy for read operations in the VFS FUSE layer
optional bool ZeroCopyReadEnabled = 443;

// Allows setting the node's priority for system (tenant) tablets (hive,
// schemeshard, etc.). A lower value means a lower priority.
// NOTE: can be negative.
Expand Down
1 change: 0 additions & 1 deletion cloud/filestore/libs/service_local/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ namespace {
xxx(MaxResponseEntries, ui32, 10000 )\
xxx(MaxBackground, ui32, 0 )\
xxx(MaxFuseLoopThreads, ui32, 1 )\
xxx(ZeroCopyWriteEnabled, bool, false )\
xxx(FSyncQueueDisabled, bool, false )\
xxx(EntryTimeout, TDuration, TDuration::Seconds(15) )\
xxx(NegativeEntryTimeout, TDuration, TDuration::Zero() )\
Expand Down
2 changes: 0 additions & 2 deletions cloud/filestore/libs/service_local/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,6 @@ class TLocalFileStoreConfig

ui32 GetMaxFuseLoopThreads() const;

bool GetZeroCopyWriteEnabled() const;

bool GetFSyncQueueDisabled() const;

TDuration GetEntryTimeout() const;
Expand Down
2 changes: 2 additions & 0 deletions cloud/filestore/libs/storage/core/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,8 @@ using TAliases = NProto::TStorageConfig::TFilestoreAliases;
xxx(DirectoryHandlesTableSize, ui64, 100'000 )\
xxx(GuestHandleKillPrivV2Enabled, bool, false )\
xxx(AllowAdditionalSystemTablets, bool, false )\
\
xxx(ZeroCopyReadEnabled, bool, false )\
// FILESTORE_STORAGE_CONFIG

#define FILESTORE_STORAGE_CONFIG_REF(xxx) \
Expand Down
2 changes: 2 additions & 0 deletions cloud/filestore/libs/storage/core/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,8 @@ class TStorageConfig
bool GetGuestHandleKillPrivV2Enabled() const;

[[nodiscard]] bool GetAllowAdditionalSystemTablets() const;

bool GetZeroCopyReadEnabled() const;
};

} // namespace NCloud::NFileStore::NStorage
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ void FillFeatures(

features->SetGuestHandleKillPrivV2Enabled(
config.GetGuestHandleKillPrivV2Enabled());

features->SetZeroCopyReadEnabled(config.GetZeroCopyReadEnabled());
}

////////////////////////////////////////////////////////////////////////////////
Expand Down
2 changes: 2 additions & 0 deletions cloud/filestore/libs/storage/tablet/tablet_ut_sessions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -850,6 +850,7 @@ Y_UNIT_TEST_SUITE(TIndexTabletTest_Sessions)
config.SetAllowHandlelessIO(true);
config.SetZeroCopyWriteEnabled(true);
config.SetGuestHandleKillPrivV2Enabled(true);
config.SetZeroCopyReadEnabled(true);

features.SetTwoStageReadEnabled(true);
features.SetEntryTimeout(TDuration::Seconds(10).MilliSeconds());
Expand All @@ -868,6 +869,7 @@ Y_UNIT_TEST_SUITE(TIndexTabletTest_Sessions)
features.SetAllowHandlelessIO(true);
features.SetZeroCopyWriteEnabled(true);
features.SetGuestHandleKillPrivV2Enabled(true);
features.SetZeroCopyReadEnabled(true);

DoTestShouldReturnFeaturesInCreateSessionResponse(config, features);
}
Expand Down
1 change: 1 addition & 0 deletions cloud/filestore/libs/vfs_fuse/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ namespace {
xxx(ZeroCopyWriteEnabled, bool, false )\
xxx(FSyncQueueDisabled, bool, false )\
xxx(GuestHandleKillPrivV2Enabled, bool, false )\
xxx(ZeroCopyReadEnabled, bool, false )\
// FILESTORE_FUSE_CONFIG

#define FILESTORE_FILESYSTEM_DECLARE_CONFIG(name, type, value) \
Expand Down
1 change: 1 addition & 0 deletions cloud/filestore/libs/vfs_fuse/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ struct TFileSystemConfig
ui32 GetMaxFuseLoopThreads() const;

bool GetZeroCopyWriteEnabled() const;
bool GetZeroCopyReadEnabled() const;

bool GetFSyncQueueDisabled() const;

Expand Down
76 changes: 69 additions & 7 deletions cloud/filestore/libs/vfs_fuse/fs_impl_data.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,56 @@ void TFileSystem::Read(
request->SetOffset(offset);
request->SetLength(size);

if (Config->GetZeroCopyReadEnabled()) {
// TODO(issue-4800): Support ZeroCopyReadEnabled for local filestore
struct iovec* iov = nullptr;
int count = 0;
int ret = fuse_out_buf(req, &iov, &count);
if (ret == -1 || count <= 1) {
STORAGE_ERROR(
"Invalid fuse out buffers, ret=%d, count=%d",
ret,
count);
ReplyError(
*callContext,
MakeError(E_FS_INVAL, "Invalid fuse out buffers"),
req,
EINVAL);
return;
}

auto* iovecs = request->MutableIovecs();
iovecs->Reserve(count);

size_t remainingSize = request->GetLength();
// Skip the first fuse out iovec, where headers are kept; the rest of the
// iovecs contain pointers to data buffers.
for (int index = 1; index < count; index++) {
if (remainingSize == 0) {
break;
}
auto dataSize = std::min(remainingSize, iov[index].iov_len);
auto* iovec = iovecs->Add();
iovec->SetBase(reinterpret_cast<ui64>(iov[index].iov_base));
iovec->SetLength(dataSize);
remainingSize -= dataSize;
}

if (remainingSize != 0) {
STORAGE_WARN(
"Read request length exceeds fuse buffer space, remainingSize="
<< remainingSize);
ReplyError(
*callContext,
MakeError(
E_FS_INVAL,
"request length exceeds fuse buffer space"),
req,
EINVAL);
return;
}
}

TFuture<NProto::TReadDataResponse> future;
if (WriteBackCache) {
future = WriteBackCache.ReadData(callContext, std::move(request));
Expand All @@ -368,14 +418,26 @@ void TFileSystem::Read(

const auto& response = future.GetValue();
if (CheckResponse(self, *callContext, req, response)) {
// Depending on the configuration of the filestore, data may still
// be returned as a Buffer even when I/O vectors are provided in the
// request.
const auto& buffer = response.GetBuffer();
ui32 bufferOffset = response.GetBufferOffset();
self->ReplyBuf(
*callContext,
response.GetError(),
req,
buffer.data() + bufferOffset,
buffer.size() - bufferOffset);
if (buffer.empty()) {
self->ReplyBuf(
*callContext,
response.GetError(),
req,
nullptr,
response.GetLength());
} else {
ui32 bufferOffset = response.GetBufferOffset();
self->ReplyBuf(
*callContext,
response.GetError(),
req,
buffer.data() + bufferOffset,
buffer.size() - bufferOffset);
}
}
});
}
Expand Down
105 changes: 105 additions & 0 deletions cloud/filestore/libs/vfs_fuse/fs_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,15 @@ TString CreateBuffer(size_t len, char fill = 0)
return TString(len, fill);
}

// Produces a deterministic payload of `size` characters for data-validation
// checks: the character at index i is the uppercase letter at alphabet
// position (i + seed) % 26, so different seeds yield shifted patterns.
TString GenerateValidateData(ui32 size, ui32 seed = 0)
{
    constexpr ui32 alphabetLength = 'Z' - 'A' + 1;

    TString pattern(size, 0);
    for (ui32 pos = 0; pos != size; ++pos) {
        const ui32 letterIndex = (pos + seed) % alphabetLength;
        pattern[pos] = 'A' + letterIndex;
    }
    return pattern;
}

template <class F>
bool WaitForCondition(TDuration timeout, F&& predicate)
{
Expand Down Expand Up @@ -2953,6 +2962,102 @@ Y_UNIT_TEST_SUITE(TFileSystemTest)
UNIT_ASSERT_VALUES_EQUAL(requestCount, requestCountSensor->GetAtomic());
UNIT_ASSERT(!path.Exists());
}

// Zero-copy read, iovec path: with ZeroCopyReadEnabled set in the features,
// the ReadData request seen by the service must carry exactly one iovec whose
// length equals the requested length. The handler writes the payload straight
// into the memory that iovec points at and returns only the length (no
// Buffer); the test then checks that the read reply body holds those bytes.
Y_UNIT_TEST(ShouldHandleZeroCopyReadRequest)
{
NProto::TFileStoreFeatures features;
features.SetZeroCopyReadEnabled(true);

TBootstrap bootstrap(
CreateWallClockTimer(),
CreateScheduler(),
features);

const ui64 nodeId = 123;
const ui64 handleId = 456;
const ui64 size = 30;
// Recognizable letter pattern, so a wrong or missing copy is detectable.
const auto data = GenerateValidateData(size, 2);

// The handler captures the locals above by reference; it is installed before
// bootstrap.Start().
bootstrap.Service->ReadDataHandler = [&](auto callContext, auto request)
{
UNIT_ASSERT_VALUES_EQUAL(FileSystemId, callContext->FileSystemId);
UNIT_ASSERT_VALUES_EQUAL(request->GetHandle(), handleId);
auto& iovecs = request->GetIovecs();
// A single iovec covering the whole requested length is expected.
UNIT_ASSERT_EQUAL(1, iovecs.size());
UNIT_ASSERT_EQUAL(request->GetLength(), iovecs[0].GetLength());

NProto::TReadDataResponse result;
// Emulate a zero-copy backend: write the payload directly to the address
// passed in the iovec and report only the number of bytes produced.
memcpy(
reinterpret_cast<void*>(iovecs[0].GetBase()),
data.data(),
iovecs[0].GetLength());
result.SetLength(iovecs[0].GetLength());

return MakeFuture(result);
};

bootstrap.Start();
// Stop the bootstrap when the test scope is left.
Y_DEFER
{
bootstrap.Stop();
};

// Read `size` bytes at offset 0.
auto request =
std::make_shared<TReadRequest>(nodeId, handleId, 0, size);
auto read = bootstrap.Fuse->SendRequest<TReadRequest>(request);

UNIT_ASSERT(read.Wait(WaitTimeout));
UNIT_ASSERT_VALUES_EQUAL(read.GetValue(), size);
// The reply body must contain exactly the bytes the handler wrote via the
// iovec.
UNIT_ASSERT_VALUES_EQUAL(
data,
TString(reinterpret_cast<char*>(&request->Out->Body), size));
}

// Zero-copy read, fallback path: ZeroCopyReadEnabled is set and the request
// still carries a single iovec of the requested length, but the handler
// ignores the iovec and returns the payload in the response Buffer instead.
// The test checks that the read reply body still holds the expected bytes.
Y_UNIT_TEST(ShouldHandleZeroCopyReadRequestFallbackToBuffer)
{
NProto::TFileStoreFeatures features;
features.SetZeroCopyReadEnabled(true);

TBootstrap bootstrap(
CreateWallClockTimer(),
CreateScheduler(),
features);

const ui64 nodeId = 123;
const ui64 handleId = 456;
const ui64 size = 105;
// Recognizable letter pattern, so a wrong or missing copy is detectable.
const auto data = GenerateValidateData(size, 1);

// The handler captures the locals above by reference; it is installed before
// bootstrap.Start().
bootstrap.Service->ReadDataHandler = [&](auto callContext, auto request)
{
UNIT_ASSERT_VALUES_EQUAL(FileSystemId, callContext->FileSystemId);
UNIT_ASSERT_VALUES_EQUAL(request->GetHandle(), handleId);
auto& iovecs = request->GetIovecs();
// The request is still expected to describe one iovec of the full length.
UNIT_ASSERT_EQUAL(1, iovecs.size());
UNIT_ASSERT_EQUAL(request->GetLength(), iovecs[0].GetLength());

NProto::TReadDataResponse result;
// Nothing is written through the iovec here: the data is returned in the
// response Buffer, and Length is left unset.
result.MutableBuffer()->assign(data);

return MakeFuture(result);
};

bootstrap.Start();
// Stop the bootstrap when the test scope is left.
Y_DEFER
{
bootstrap.Stop();
};

// Read `size` bytes at offset 0.
auto request =
std::make_shared<TReadRequest>(nodeId, handleId, 0, size);
auto read = bootstrap.Fuse->SendRequest<TReadRequest>(request);

UNIT_ASSERT(read.Wait(WaitTimeout));
UNIT_ASSERT_VALUES_EQUAL(read.GetValue(), size);
// The reply body must contain the bytes that were returned in the Buffer.
UNIT_ASSERT_VALUES_EQUAL(
data,
TString(reinterpret_cast<char*>(&request->Out->Body), size));
}
}

} // namespace NCloud::NFileStore::NFuse
1 change: 1 addition & 0 deletions cloud/filestore/libs/vfs_fuse/loop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1124,6 +1124,7 @@ class TFileSystemLoop final
config.SetMaxFuseLoopThreads(features.GetMaxFuseLoopThreads());

config.SetZeroCopyWriteEnabled(features.GetZeroCopyWriteEnabled());
config.SetZeroCopyReadEnabled(features.GetZeroCopyReadEnabled());

config.SetFSyncQueueDisabled(features.GetFSyncQueueDisabled());

Expand Down
20 changes: 16 additions & 4 deletions cloud/filestore/libs/vhost/client.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,13 +86,25 @@ class TFuseVirtioClient
template <typename T>
void SendRequestImpl(std::shared_ptr<T> request)
{
TVector<char> inData(request->In->Header.len);
memcpy(inData.data(), reinterpret_cast<char*>(&*request->In), inData.size());
TVector<char> outData(request->Out->Header.len);
TVector<TVector<char>> inData{TVector<char>(request->In->Header.len)};
memcpy(
inData[0].data(),
reinterpret_cast<char*>(&*request->In),
inData[0].size());
constexpr auto headerSize = sizeof(request->Out->Header);
TVector<TVector<char>> outData{TVector<char>(headerSize)};

if (request->Out->Header.len > headerSize) {
outData.emplace_back(request->Out->Header.len - headerSize);
}

bool result = Client->Write(inData, outData);
if (result) {
memcpy(reinterpret_cast<char*>(&*request->Out), outData.data(), outData.size());
char* dest = reinterpret_cast<char*>(&*request->Out);
for (const auto& data: outData) {
memcpy(dest, data.data(), data.size());
dest += data.size();
}
}
request->OnCompletion();
}
Expand Down
3 changes: 3 additions & 0 deletions cloud/filestore/public/api/protos/data.proto
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,9 @@ message TReadDataRequest
// Vector I/O data structure
// Data can be passed via Iovecs instead of Buffer to avoid excess copy
// Caller should ensure that total Length of Iovecs is at least Length

// Depending on the configuration of the filestore, data may still be
// returned as a Buffer even when I/O vectors are provided in the request.
repeated TIovec Iovecs = 7;

// For vector I/O, the caller can pass an identifier. Its interpretation
Expand Down
1 change: 1 addition & 0 deletions cloud/filestore/public/api/protos/fs.proto
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ message TFileStoreFeatures
bool DirectoryHandlesStorageEnabled = 29;
uint64 DirectoryHandlesTableSize = 30;
bool GuestHandleKillPrivV2Enabled = 31;
bool ZeroCopyReadEnabled = 32;
}

message TFileStore
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ MaxBackground: 16384
MaxFuseLoopThreads: 4
StrictFileSystemSizeEnforcementEnabled: true
ZeroCopyWriteEnabled: true
ZeroCopyReadEnabled: true
GuestKeepCacheAllowed: true
SessionHandleOffloadedStatsCapacity: 100
GuestCachingType: GCT_ANY_READ
Expand Down
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
ZeroCopyWriteEnabled: true
ZeroCopyReadEnabled: true
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ TEST_SRCS(

SET(
NFS_STORAGE_CONFIG_PATCH
cloud/filestore/tests/fio/qemu-kikimr-zero-copy-write-fallback-test/nfs-patch.txt
cloud/filestore/tests/fio/qemu-kikimr-zero-copy-fallback-test/nfs-patch.txt
)

SET(QEMU_VIRTIO fs)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@ ZeroCopyWriteEnabled: true
ThreeStageWriteEnabled: true
UnalignedThreeStageWriteEnabled: true
MultiTabletForwardingEnabled: true
ZeroCopyReadEnabled: true
TwoStageReadEnabled: true
Loading
Loading