Skip to content

Commit

Permalink
MB-34445: RespondAmbiguous task should not own a vBucket.
Browse files Browse the repository at this point in the history
This causes deadlock due to recursive locking of tMutex in ExecutorPool
if the task is the last thing that owns a vBucket and is attempting to
schedule deferred deletion.

Fix this by holding a weak pointer instead. If we promote the pointer
then we are running normally and won't have previously acquired tMutex.
If we are cancelling the task at destruction of the engine, we will not
attempt to delete the vBucket.

15:05:37 #0  __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
15:05:38 #1  0x00007f551159fdbd in __GI___pthread_mutex_lock (mutex=0x7f5510435b08) at ../nptl/pthread_mutex_lock.c:80
15:05:38 #2  0x00007f550af2ef9d in __gthread_mutex_lock (__mutex=0x7f5510435b08) at /usr/local/include/c++/7.3.0/x86_64-pc-linux-gnu/bits/gthr-default.h:748
15:05:38 #3  std::mutex::lock (this=0x7f5510435b08) at /usr/local/include/c++/7.3.0/bits/std_mutex.h:103
15:05:38 #4  std::lock_guard<std::mutex>::lock_guard (__m=..., this=<synthetic pointer>) at /usr/local/include/c++/7.3.0/bits/std_mutex.h:162
15:05:38 #5  ExecutorPool::_schedule (this=this@entry=0x7f5510435a00, task=...) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/engines/ep/src/executorpool.cc:420
15:05:38 #6  0x00007f550af2f13d in ExecutorPool::schedule (this=0x7f5510435a00, task=...) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/engines/ep/src/executorpool.cc:440
15:05:40 #7  0x00007f550af2ad1d in EphemeralVBucket::scheduleDeferredDeletion (this=<optimized out>, engine=...) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/engines/ep/src/ephemeral_vb.cc:841
15:05:40 #8  0x00007f550af64dc1 in std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release (this=0x7f550a3a9060) at /usr/local/include/c++/7.3.0/bits/shared_ptr_base.h:154
15:05:40 #9  std::__shared_count<(__gnu_cxx::_Lock_policy)2>::~__shared_count (this=<optimized out>, __in_chrg=<optimized out>) at /usr/local/include/c++/7.3.0/bits/shared_ptr_base.h:684
15:05:40 #10 std::__shared_ptr<VBucket, (__gnu_cxx::_Lock_policy)2>::~__shared_ptr (this=<optimized out>, __in_chrg=<optimized out>) at /usr/local/include/c++/7.3.0/bits/shared_ptr_base.h:1123
15:05:40 #11 std::shared_ptr<VBucket>::~shared_ptr (this=<optimized out>, __in_chrg=<optimized out>) at /usr/local/include/c++/7.3.0/bits/shared_ptr.h:93
15:05:40 #12 RespondAmbiguousNotification::~RespondAmbiguousNotification (this=<optimized out>, __in_chrg=<optimized out>) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/engines/ep/src/kv_bucket.cc:226
15:05:40 #13 0x00007f550af30849 in std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release (this=0x7f550a41c340) at /usr/local/include/c++/7.3.0/bits/shared_ptr_base.h:154
15:05:40 #14 std::__shared_count<(__gnu_cxx::_Lock_policy)2>::~__shared_count (this=<optimized out>, __in_chrg=<optimized out>) at /usr/local/include/c++/7.3.0/bits/shared_ptr_base.h:684
15:05:40 #15 std::__shared_ptr<GlobalTask, (__gnu_cxx::_Lock_policy)2>::~__shared_ptr (this=0x7ffe6ec46730, __in_chrg=<optimized out>) at /usr/local/include/c++/7.3.0/bits/shared_ptr_base.h:1123
15:05:40 #16 std::shared_ptr<GlobalTask>::~shared_ptr (this=0x7ffe6ec46730, __in_chrg=<optimized out>) at /usr/local/include/c++/7.3.0/bits/shared_ptr.h:93
15:05:40 #17 ExecutorPool::_stopTaskGroup (this=<optimized out>, taskGID=140003322306560, taskType=<optimized out>, force=<optimized out>) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/engines/ep/src/executorpool.cc:612
15:05:40 #18 0x00007f550af56cb8 in KVBucket::deinitialize (this=0x7f5510495000) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/engines/ep/src/kv_bucket.cc:466
15:05:40 #19 0x00007f550af09ee1 in EventuallyPersistentEngine::~EventuallyPersistentEngine (this=0x7f55104b2000, __in_chrg=<optimized out>) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/engines/ep/src/ep_engine.cc:6073
15:05:40 #20 0x00007f550af0a0e7 in EventuallyPersistentEngine::~EventuallyPersistentEngine (this=0x7f55104b2000, __in_chrg=<optimized out>) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/engines/ep/src/ep_engine.cc:6079
15:05:40 #21 EventuallyPersistentEngine::destroy (this=0x7f55104b2000, force=<optimized out>) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/engines/ep/src/ep_engine.cc:155
15:05:40 #22 0x000000000040ff0b in MockTestHarness::destroy_bucket (force=false, handle=0x7f55104f2aa0, this=0x64c740 <harness>) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/programs/engine_testapp/engine_testapp.cc:1178
15:05:40 #23 execute_test (default_cfg=<optimized out>, engine=<optimized out>, test=...) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/programs/engine_testapp/engine_testapp.cc:1333
15:05:40 #24 main (argc=<optimized out>, argv=<optimized out>) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/programs/engine_testapp/engine_testapp.cc:1581

Change-Id: I70298a8337967c648280b0d86a96c08bf3a4008a
Reviewed-on: http://review.couchbase.org/110146
Reviewed-by: Dave Rigby <[email protected]>
Tested-by: Build Bot <[email protected]>
  • Loading branch information
BenHuddleston authored and trondn committed Jun 6, 2019
1 parent 3303390 commit f8d08e0
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 3 deletions.
13 changes: 10 additions & 3 deletions engines/ep/src/kv_bucket.cc
Original file line number Diff line number Diff line change
Expand Up @@ -229,10 +229,10 @@ class RespondAmbiguousNotification : public GlobalTask {
VBucketPtr& vb,
std::vector<const void*> cookies)
: GlobalTask(&e, TaskId::RespondAmbiguousNotification, 0, false),
vbucket(vb),
weakVb(vb),
cookies(cookies),
description("Notify clients of Sync Write Ambiguous " +
vbucket->getId().to_string()) {
vb->getId().to_string()) {
for (const auto* cookie : cookies) {
if (!cookie) {
throw std::invalid_argument(
Expand All @@ -252,19 +252,26 @@ class RespondAmbiguousNotification : public GlobalTask {
}

bool run(void) {
auto vbucket = weakVb.lock();
if (!vbucket) {
return false;
}

TRACE_EVENT1("ep-engine/task",
"RespondAmbiguousNotification",
"vb",
(vbucket->getId()).get());

for (const auto* cookie : cookies) {
vbucket->notifyClientOfSyncWriteComplete(
cookie, ENGINE_SYNC_WRITE_AMBIGUOUS);
}

return false;
}

private:
VBucketPtr vbucket;
std::weak_ptr<VBucket> weakVb;
std::vector<const void*> cookies;
const std::string description;
};
Expand Down
67 changes: 67 additions & 0 deletions engines/ep/tests/module_tests/evp_store_durability_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,17 @@ class DurabilityEphemeralBucketTest : public STParameterizedBucketTest {
void testPurgeCompletedPrepare(F& func);
};

/// Note - not single-threaded
class DurabilityRespondAmbiguousTest : public KVBucketTest {
protected:
void SetUp() override {
// The test should do the SetUp
}
void TearDown() override {
// The test should do the TearDown
};
};

void DurabilityEPBucketTest::testPersistPrepare(DocumentState docState) {
setVBucketStateAndRunPersistTask(
vbid,
Expand Down Expand Up @@ -840,6 +851,62 @@ TEST_P(DurabilityBucketTest, TakeoverSendsDurabilityAmbiguous) {
EXPECT_EQ(ENGINE_SYNC_WRITE_AMBIGUOUS, mockCookie->status);
}

TEST_F(DurabilityRespondAmbiguousTest, RespondAmbiguousNotificationDeadLock) {
// Anecdotally this takes between 0.5 and 1s to run on my dev machine
// (MB Pro 2017 - PCIe SSD). The test typically hits the issue on the 1st
// run but sometimes takes up to 5. I didn't want to increase the number
// of iterations as the test will obviously take far longer to run. If
// this test ever causes a timeout - a deadlock issue (probably in the
// RespondAmbiguousNotification task) is present.
for (int i = 0; i < 100; i++) {
KVBucketTest::SetUp();

EXPECT_EQ(ENGINE_SUCCESS,
store->setVBucketState(
vbid,
vbucket_state_active,
{{"topology",
nlohmann::json::array({{"active", "replica"}})}}));

auto key = makeStoredDocKey("key");
using namespace cb::durability;
auto pending = makePendingItem(key, "value");

// Store it
EXPECT_EQ(ENGINE_EWOULDBLOCK, store->set(*pending, cookie));

// We don't send EWOULDBLOCK to clients
auto mockCookie = cookie_to_mock_object(cookie);
EXPECT_EQ(ENGINE_SUCCESS, mockCookie->status);

// Set state to dead - this will schedule the task
EXPECT_EQ(ENGINE_SUCCESS,
store->setVBucketState(vbid, vbucket_state_dead));

// Deleting the vBucket will set the deferred deletion flag that
// causes deadlock when the RespondAmbiguousNotification task is
// destroyed as part of shutdown but is the last owner of the vBucket
// (attempts to schedule destruction and tries to recursively lock a
// mutex)
{
auto ptr = store->getVBucket(vbid);
store->deleteVBucket(vbid, nullptr);
}

destroy_mock_event_callbacks();
engine->getDcpConnMap().manageConnections();

// Should deadlock here in ~SynchronousEPEngine
engine.reset();

// The RespondAmbiguousNotification task requires our cookie to still be
// valid so delete it only after it has been destroyed
destroy_mock_cookie(cookie);

ExecutorPool::shutdown();
}
}

// Test that if a SyncWrite times out, then a subsequent SyncWrite which
// _should_ fail does indeed fail.
// (Regression test for part of MB-34367 - after using notify_IO_complete
Expand Down

0 comments on commit f8d08e0

Please sign in to comment.