Revamp cache_bench to resemble a real workload (facebook#6629)
Summary:
I suspect LRUCache could use some optimization, and to support
such an effort, a good benchmarking tool is needed. The existing
cache_bench was heavily skewed toward insertion and lookup misses, and
did not saturate memory with other work. This change should improve
those things to better resemble a real workload.

(All results below were gathered with the clang compiler, for some
consistency, but not necessarily the same version and settings.)

The real workload is from production MySQL on RocksDB, filtering stacks
containing "LRU", "ShardedCache", or "CacheShard":
Lookup inclusive: 66%
Insert inclusive: 17%
Release inclusive: 15%

An alternate simulated workload is MySQL running a LinkBench read test:
Lookup inclusive: 54%
Insert inclusive: 24%
Release inclusive: 21%

cache_bench default settings, prior to this change:
Lookup inclusive: 35.8%
Insert inclusive: 63.6%
Release inclusive: 0%

cache_bench after this change (intended as a somewhat "tighter" workload
than average production, more like LinkBench):
Lookup inclusive: 52%
Insert inclusive: 20%
Release inclusive: 26%
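
For reference, here is a minimal sketch (an illustration under the new default
percentages, not the benchmark code itself) of how such an op mix can be chosen
from a single random draw: each percentage is scaled into a cumulative
threshold over the full uint64_t range, and one uniform 64-bit value then
selects the operation without modulo bias. This mirrors the cumulative-threshold
approach in the new CacheBench constructor below.

#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
  // One hundredth of the uint64_t range; percentages become cumulative
  // thresholds in that space.
  constexpr uint64_t kHundredth = std::numeric_limits<uint64_t>::max() / 100U;
  constexpr uint32_t lookup_insert_pct = 87, insert_pct = 2, lookup_pct = 10,
                     erase_pct = 1;  // new cache_bench defaults
  constexpr uint64_t lookup_insert_threshold = kHundredth * lookup_insert_pct;
  constexpr uint64_t insert_threshold =
      lookup_insert_threshold + kHundredth * insert_pct;
  constexpr uint64_t lookup_threshold =
      insert_threshold + kHundredth * lookup_pct;
  constexpr uint64_t erase_threshold = lookup_threshold + kHundredth * erase_pct;
  // A uniformly random uint64_t picks the operation by comparing against the
  // thresholds in order; a fixed value stands in for rnd.Next() here.
  uint64_t r = 0x123456789abcdef0ULL;
  if (r < lookup_insert_threshold) {
    printf("lookup (+ insert on miss)\n");
  } else if (r < insert_threshold) {
    printf("insert\n");
  } else if (r < lookup_threshold) {
    printf("lookup\n");
  } else if (r < erase_threshold) {
    printf("erase\n");
  }  // else: the tiny leftover above erase_threshold is treated as a noop
  return 0;
}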

And top exclusive stacks (portion of stack samples as filtered above):
Production MySQL:
LRUHandleTable::FindPointer: 25.3%
rocksdb::operator==: 15.1%  <-- Slice ==
LRUCacheShard::LRU_Remove: 13.8%
ShardedCache::Lookup: 8.9%
__pthread_mutex_lock: 7.1%
LRUCacheShard::LRU_Insert: 6.3%
MurmurHash64A: 4.8%  <-- Since upgraded to XXH3p
...

Old cache_bench:
LRUHandleTable::FindPointer: 23.6%
__pthread_mutex_lock: 15.0%
__pthread_mutex_unlock_usercnt: 11.7%
__lll_lock_wait: 8.6%
__lll_unlock_wake: 6.8%
LRUCacheShard::LRU_Insert: 6.0%
ShardedCache::Lookup: 4.4%
LRUCacheShard::LRU_Remove: 2.8%
...
rocksdb::operator==: 0.2%  <-- Slice ==
...

New cache_bench:
LRUHandleTable::FindPointer: 22.8%
__pthread_mutex_unlock_usercnt: 14.3%
rocksdb::operator==: 10.5%  <-- Slice ==
LRUCacheShard::LRU_Insert: 9.0%
__pthread_mutex_lock: 5.9%
LRUCacheShard::LRU_Remove: 5.0%
...
ShardedCache::Lookup: 2.9%
...

So there's a bit more lock contention in the benchmark than in
production, but otherwise it looks similar enough to me. At least it's a
big improvement over the existing code.
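
Part of what makes the new workload resemble production is the skewed key
selection. Below is a minimal sketch of it (a standalone illustration:
std::mt19937_64 stands in for the benchmark's Random64, and the multiply-shift
is assumed to match what fastrange64 does on clang/GCC builds that provide
unsigned __int128): take the minimum of several uniform draws, which biases
the raw value toward the low end, then scale it into [0, max_key).

#include <algorithm>
#include <cstdint>
#include <random>

// Returns a key in [0, max_key), biased toward small keys; a larger `skew`
// (the number of extra draws) gives a stronger bias.
uint64_t SkewedKey(std::mt19937_64& rnd, uint32_t skew, uint64_t max_key) {
  uint64_t raw = rnd();
  for (uint32_t i = 0; i < skew; ++i) {
    raw = std::min<uint64_t>(raw, rnd());
  }
  // Multiply-shift range reduction: scales the 64-bit value into
  // [0, max_key) without the bias of a plain modulo.
  return static_cast<uint64_t>(
      (static_cast<unsigned __int128>(raw) * max_key) >> 64);
}

With the new default skew of 5, the raw value is the minimum of six uniform
draws, so on average it lands around 1/7 of the range, concentrating accesses
on a small, hot portion of the keyspace, which is what drives the lookup-hit
and Release activity shown above.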
Pull Request resolved: facebook#6629

Test Plan: No production code changes, ran cache_bench with ASAN

Reviewed By: ltamasi

Differential Revision: D20824318

Pulled By: pdillinger

fbshipit-source-id: 6f8dc5891ead0f87edbed3a615ecd5289d9abe12
pdillinger authored and facebook-github-bot committed Apr 3, 2020
1 parent df62cd5 commit 079e77f
Showing 3 changed files with 169 additions and 70 deletions.
cache/cache_bench.cc (165 additions, 65 deletions)
@@ -14,56 +14,63 @@ int main() {
#include <stdio.h>
#include <sys/types.h>
#include <cinttypes>
#include <limits>

#include "port/port.h"
#include "rocksdb/cache.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "util/coding.h"
#include "util/gflags_compat.h"
#include "util/hash.h"
#include "util/mutexlock.h"
#include "util/random.h"

using GFLAGS_NAMESPACE::ParseCommandLineFlags;

static const uint32_t KB = 1024;

DEFINE_int32(threads, 16, "Number of concurrent threads to run.");
DEFINE_int64(cache_size, 8 * KB * KB,
"Number of bytes to use as a cache of uncompressed data.");
DEFINE_int32(num_shard_bits, 4, "shard_bits.");

DEFINE_int64(max_key, 1 * KB * KB * KB, "Max number of key to place in cache");
DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread.");

DEFINE_bool(populate_cache, false, "Populate cache before operations");
DEFINE_int32(insert_percent, 40,
"Ratio of insert to total workload (expressed as a percentage)");
DEFINE_int32(lookup_percent, 50,
"Ratio of lookup to total workload (expressed as a percentage)");
DEFINE_int32(erase_percent, 10,
"Ratio of erase to total workload (expressed as a percentage)");
static constexpr uint32_t KiB = uint32_t{1} << 10;
static constexpr uint32_t MiB = KiB << 10;
static constexpr uint64_t GiB = MiB << 10;

DEFINE_uint32(threads, 16, "Number of concurrent threads to run.");
DEFINE_uint64(cache_size, 1 * GiB,
"Number of bytes to use as a cache of uncompressed data.");
DEFINE_uint32(num_shard_bits, 6, "shard_bits.");

DEFINE_double(resident_ratio, 0.25,
"Ratio of keys fitting in cache to keyspace.");
DEFINE_uint64(ops_per_thread, 0,
"Number of operations per thread. (Default: 5 * keyspace size)");
DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added.");

DEFINE_uint32(skew, 5, "Degree of skew in key selection");
DEFINE_bool(populate_cache, true, "Populate cache before operations");

DEFINE_uint32(lookup_insert_percent, 87,
"Ratio of lookup (+ insert on not found) to total workload "
"(expressed as a percentage)");
DEFINE_uint32(insert_percent, 2,
"Ratio of insert to total workload (expressed as a percentage)");
DEFINE_uint32(lookup_percent, 10,
"Ratio of lookup to total workload (expressed as a percentage)");
DEFINE_uint32(erase_percent, 1,
"Ratio of erase to total workload (expressed as a percentage)");

DEFINE_bool(use_clock_cache, false, "");

namespace ROCKSDB_NAMESPACE {

class CacheBench;
namespace {
void deleter(const Slice& /*key*/, void* value) {
delete reinterpret_cast<char *>(value);
}

// State shared by all concurrent executions of the same benchmark.
class SharedState {
public:
explicit SharedState(CacheBench* cache_bench)
: cv_(&mu_),
num_threads_(FLAGS_threads),
num_initialized_(0),
start_(false),
num_done_(0),
cache_bench_(cache_bench) {
}
cache_bench_(cache_bench) {}

~SharedState() {}

@@ -87,13 +94,9 @@ class SharedState {
num_done_++;
}

bool AllInitialized() const {
return num_initialized_ >= num_threads_;
}
bool AllInitialized() const { return num_initialized_ >= FLAGS_threads; }

bool AllDone() const {
return num_done_ >= num_threads_;
}
bool AllDone() const { return num_done_ >= FLAGS_threads; }

void SetStart() {
start_ = true;
@@ -107,7 +110,6 @@
port::Mutex mu_;
port::CondVar cv_;

const uint64_t num_threads_;
uint64_t num_initialized_;
bool start_;
uint64_t num_done_;
@@ -118,17 +120,69 @@ class SharedState {
// Per-thread state for concurrent executions of the same benchmark.
struct ThreadState {
uint32_t tid;
Random rnd;
Random64 rnd;
SharedState* shared;

ThreadState(uint32_t index, SharedState* _shared)
: tid(index), rnd(1000 + index), shared(_shared) {}
};

struct KeyGen {
char key_data[27];

Slice GetRand(Random64& rnd, uint64_t max_key) {
uint64_t raw = rnd.Next();
// Skew according to setting
for (uint32_t i = 0; i < FLAGS_skew; ++i) {
raw = std::min(raw, rnd.Next());
}
uint64_t key = fastrange64(raw, max_key);
// Variable size and alignment
size_t off = key % 8;
key_data[0] = char{42};
EncodeFixed64(key_data + 1, key);
key_data[9] = char{11};
EncodeFixed64(key_data + 10, key);
key_data[18] = char{4};
EncodeFixed64(key_data + 19, key);
return Slice(&key_data[off], sizeof(key_data) - off);
}
};

char* createValue(Random64& rnd) {
char* rv = new char[FLAGS_value_bytes];
// Fill with some filler data, and take some CPU time
for (uint32_t i = 0; i < FLAGS_value_bytes; i += 8) {
EncodeFixed64(rv + i, rnd.Next());
}
return rv;
}

void deleter(const Slice& /*key*/, void* value) {
delete[] static_cast<char*>(value);
}
} // namespace

class CacheBench {
static constexpr uint64_t kHundredthUint64 =
std::numeric_limits<uint64_t>::max() / 100U;

public:
CacheBench() : num_threads_(FLAGS_threads) {
CacheBench()
: max_key_(static_cast<uint64_t>(FLAGS_cache_size / FLAGS_resident_ratio /
FLAGS_value_bytes)),
lookup_insert_threshold_(kHundredthUint64 *
FLAGS_lookup_insert_percent),
insert_threshold_(lookup_insert_threshold_ +
kHundredthUint64 * FLAGS_insert_percent),
lookup_threshold_(insert_threshold_ +
kHundredthUint64 * FLAGS_lookup_percent),
erase_threshold_(lookup_threshold_ +
kHundredthUint64 * FLAGS_erase_percent) {
if (erase_threshold_ != 100U * kHundredthUint64) {
fprintf(stderr, "Percentages must add to 100.\n");
exit(1);
}
if (FLAGS_use_clock_cache) {
cache_ = NewClockCache(FLAGS_cache_size, FLAGS_num_shard_bits);
if (!cache_) {
@@ -138,18 +192,19 @@ class CacheBench {
} else {
cache_ = NewLRUCache(FLAGS_cache_size, FLAGS_num_shard_bits);
}
if (FLAGS_ops_per_thread == 0) {
FLAGS_ops_per_thread = 5 * max_key_;
}
}

~CacheBench() {}

void PopulateCache() {
Random rnd(1);
for (int64_t i = 0; i < FLAGS_cache_size; i++) {
uint64_t rand_key = rnd.Next() % FLAGS_max_key;
// Cast uint64* to be char*, data would be copied to cache
Slice key(reinterpret_cast<char*>(&rand_key), 8);
// do insert
cache_->Insert(key, new char[10], 1, &deleter);
Random64 rnd(1);
KeyGen keygen;
for (uint64_t i = 0; i < 2 * FLAGS_cache_size; i += FLAGS_value_bytes) {
cache_->Insert(keygen.GetRand(rnd, max_key_), createValue(rnd),
FLAGS_value_bytes, &deleter);
}
}

@@ -158,10 +213,10 @@

PrintEnv();
SharedState shared(this);
std::vector<ThreadState*> threads(num_threads_);
for (uint32_t i = 0; i < num_threads_; i++) {
threads[i] = new ThreadState(i, &shared);
env->StartThread(ThreadBody, threads[i]);
std::vector<std::unique_ptr<ThreadState> > threads(FLAGS_threads);
for (uint32_t i = 0; i < FLAGS_threads; i++) {
threads[i].reset(new ThreadState(i, &shared));
env->StartThread(ThreadBody, threads[i].get());
}
{
MutexLock l(shared.GetMutex());
@@ -192,10 +247,15 @@

private:
std::shared_ptr<Cache> cache_;
uint32_t num_threads_;
const uint64_t max_key_;
// Cumulative thresholds in the space of a random uint64_t
const uint64_t lookup_insert_threshold_;
const uint64_t insert_threshold_;
const uint64_t lookup_threshold_;
const uint64_t erase_threshold_;

static void ThreadBody(void* v) {
ThreadState* thread = reinterpret_cast<ThreadState*>(v);
ThreadState* thread = static_cast<ThreadState*>(v);
SharedState* shared = thread->shared;

{
Expand All @@ -220,40 +280,78 @@ class CacheBench {
}

void OperateCache(ThreadState* thread) {
// To use looked-up values
uint64_t result = 0;
// To hold handles for a non-trivial amount of time
Cache::Handle* handle = nullptr;
KeyGen gen;
for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) {
uint64_t rand_key = thread->rnd.Next() % FLAGS_max_key;
// Cast uint64* to be char*, data would be copied to cache
Slice key(reinterpret_cast<char*>(&rand_key), 8);
int32_t prob_op = thread->rnd.Uniform(100);
if (prob_op >= 0 && prob_op < FLAGS_insert_percent) {
// do insert
cache_->Insert(key, new char[10], 1, &deleter);
} else if (prob_op -= FLAGS_insert_percent &&
prob_op < FLAGS_lookup_percent) {
Slice key = gen.GetRand(thread->rnd, max_key_);
uint64_t random_op = thread->rnd.Next();
if (random_op < lookup_insert_threshold_) {
if (handle) {
cache_->Release(handle);
handle = nullptr;
}
// do lookup
auto handle = cache_->Lookup(key);
handle = cache_->Lookup(key);
if (handle) {
// do something with the data
result += NPHash64(static_cast<char*>(cache_->Value(handle)),
FLAGS_value_bytes);
} else {
// do insert
cache_->Insert(key, createValue(thread->rnd), FLAGS_value_bytes,
&deleter, &handle);
}
} else if (random_op < insert_threshold_) {
if (handle) {
cache_->Release(handle);
handle = nullptr;
}
} else if (prob_op -= FLAGS_lookup_percent &&
prob_op < FLAGS_erase_percent) {
// do insert
cache_->Insert(key, createValue(thread->rnd), FLAGS_value_bytes,
&deleter, &handle);
} else if (random_op < lookup_threshold_) {
if (handle) {
cache_->Release(handle);
handle = nullptr;
}
// do lookup
handle = cache_->Lookup(key);
if (handle) {
// do something with the data
result += NPHash64(static_cast<char*>(cache_->Value(handle)),
FLAGS_value_bytes);
}
} else if (random_op < erase_threshold_) {
// do erase
cache_->Erase(key);
} else {
// Should be extremely unlikely (noop)
assert(random_op >= kHundredthUint64 * 100U);
}
}
if (handle) {
cache_->Release(handle);
handle = nullptr;
}
}

void PrintEnv() const {
printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion);
printf("Number of threads : %d\n", FLAGS_threads);
printf("Number of threads : %u\n", FLAGS_threads);
printf("Ops per thread : %" PRIu64 "\n", FLAGS_ops_per_thread);
printf("Cache size : %" PRIu64 "\n", FLAGS_cache_size);
printf("Num shard bits : %d\n", FLAGS_num_shard_bits);
printf("Max key : %" PRIu64 "\n", FLAGS_max_key);
printf("Populate cache : %d\n", FLAGS_populate_cache);
printf("Insert percentage : %d%%\n", FLAGS_insert_percent);
printf("Lookup percentage : %d%%\n", FLAGS_lookup_percent);
printf("Erase percentage : %d%%\n", FLAGS_erase_percent);
printf("Num shard bits : %u\n", FLAGS_num_shard_bits);
printf("Max key : %" PRIu64 "\n", max_key_);
printf("Resident ratio : %g\n", FLAGS_resident_ratio);
printf("Skew degree : %u\n", FLAGS_skew);
printf("Populate cache : %d\n", int{FLAGS_populate_cache});
printf("Lookup+Insert pct : %u%%\n", FLAGS_lookup_insert_percent);
printf("Insert percentage : %u%%\n", FLAGS_insert_percent);
printf("Lookup percentage : %u%%\n", FLAGS_lookup_percent);
printf("Erase percentage : %u%%\n", FLAGS_erase_percent);
printf("----------------------------\n");
}
};
Expand All @@ -270,6 +368,8 @@ int main(int argc, char** argv) {
ROCKSDB_NAMESPACE::CacheBench bench;
if (FLAGS_populate_cache) {
bench.PopulateCache();
printf("Population complete\n");
printf("----------------------------\n");
}
if (bench.Run()) {
return 0;
db/error_handler_fs_test.cc (3 additions, 3 deletions)
@@ -498,15 +498,15 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) {
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"VersionSet::LogAndApply:WriteManifest", [&](void*) {
if (fail_manifest.load()) {
fault_fs->SetFilesystemActive(false,error_msg); }
});
fault_fs->SetFilesystemActive(false, error_msg);
}
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

Put(Key(1), "val");
s = Flush();
ASSERT_EQ(s, Status::OK());


TEST_SYNC_POINT("CompactionManifestWriteError:0");
TEST_SYNC_POINT("CompactionManifestWriteError:1");

util/work_queue.h (1 addition, 2 deletions)
@@ -17,7 +17,6 @@
#include <cassert>
#include <condition_variable>
#include <cstddef>
#include <cstddef>
#include <functional>
#include <mutex>
#include <queue>
@@ -146,4 +145,4 @@ class WorkQueue {
}
}
};
}
} // namespace ROCKSDB_NAMESPACE
