From f8d08e0f112228d4220b7ec5a0dc95fb20c49e18 Mon Sep 17 00:00:00 2001 From: Ben Huddleston Date: Tue, 4 Jun 2019 16:45:55 +0100 Subject: [PATCH] MB-34445: RespondAmbiguous task should not own a vBucket. This causes deadlock due to recursive locking of tMutex in ExecutorPool if the task is the last thing that owns a vBucket and is attempting to schedule deferred deletion. Fix this by holding a weak pointer instead. If we promote the pointer then we are running normally and won't have previously acquired tMutex. If we are cancelling the task at destruction of the engine, we will not attempt to delete the vBucket. 15:05:37 #0 __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135 15:05:38 #1 0x00007f551159fdbd in __GI___pthread_mutex_lock (mutex=0x7f5510435b08) at ../nptl/pthread_mutex_lock.c:80 15:05:38 #2 0x00007f550af2ef9d in __gthread_mutex_lock (__mutex=0x7f5510435b08) at /usr/local/include/c++/7.3.0/x86_64-pc-linux-gnu/bits/gthr-default.h:748 15:05:38 #3 std::mutex::lock (this=0x7f5510435b08) at /usr/local/include/c++/7.3.0/bits/std_mutex.h:103 15:05:38 #4 std::lock_guard::lock_guard (__m=..., this=) at /usr/local/include/c++/7.3.0/bits/std_mutex.h:162 15:05:38 #5 ExecutorPool::_schedule (this=this@entry=0x7f5510435a00, task=...) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/engines/ep/src/executorpool.cc:420 15:05:38 #6 0x00007f550af2f13d in ExecutorPool::schedule (this=0x7f5510435a00, task=...) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/engines/ep/src/executorpool.cc:440 15:05:40 #7 0x00007f550af2ad1d in EphemeralVBucket::scheduleDeferredDeletion (this=, engine=...) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/engines/ep/src/ephemeral_vb.cc:841 15:05:40 #8 0x00007f550af64dc1 in std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release (this=0x7f550a3a9060) at /usr/local/include/c++/7.3.0/bits/shared_ptr_base.h:154 15:05:40 #9 std::__shared_count<(__gnu_cxx::_Lock_policy)2>::~__shared_count (this=, __in_chrg=) at /usr/local/include/c++/7.3.0/bits/shared_ptr_base.h:684 15:05:40 #10 std::__shared_ptr::~__shared_ptr (this=, __in_chrg=) at /usr/local/include/c++/7.3.0/bits/shared_ptr_base.h:1123 15:05:40 #11 std::shared_ptr::~shared_ptr (this=, __in_chrg=) at /usr/local/include/c++/7.3.0/bits/shared_ptr.h:93 15:05:40 #12 RespondAmbiguousNotification::~RespondAmbiguousNotification (this=, __in_chrg=) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/engines/ep/src/kv_bucket.cc:226 15:05:40 #13 0x00007f550af30849 in std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release (this=0x7f550a41c340) at /usr/local/include/c++/7.3.0/bits/shared_ptr_base.h:154 15:05:40 #14 std::__shared_count<(__gnu_cxx::_Lock_policy)2>::~__shared_count (this=, __in_chrg=) at /usr/local/include/c++/7.3.0/bits/shared_ptr_base.h:684 15:05:40 #15 std::__shared_ptr::~__shared_ptr (this=0x7ffe6ec46730, __in_chrg=) at /usr/local/include/c++/7.3.0/bits/shared_ptr_base.h:1123 15:05:40 #16 std::shared_ptr::~shared_ptr (this=0x7ffe6ec46730, __in_chrg=) at /usr/local/include/c++/7.3.0/bits/shared_ptr.h:93 15:05:40 #17 ExecutorPool::_stopTaskGroup (this=, taskGID=140003322306560, taskType=, force=) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/engines/ep/src/executorpool.cc:612 15:05:40 #18 0x00007f550af56cb8 in KVBucket::deinitialize (this=0x7f5510495000) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/engines/ep/src/kv_bucket.cc:466 15:05:40 #19 0x00007f550af09ee1 in EventuallyPersistentEngine::~EventuallyPersistentEngine (this=0x7f55104b2000, __in_chrg=) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/engines/ep/src/ep_engine.cc:6073 15:05:40 #20 0x00007f550af0a0e7 in EventuallyPersistentEngine::~EventuallyPersistentEngine (this=0x7f55104b2000, __in_chrg=) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/engines/ep/src/ep_engine.cc:6079 15:05:40 #21 EventuallyPersistentEngine::destroy (this=0x7f55104b2000, force=) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/engines/ep/src/ep_engine.cc:155 15:05:40 #22 0x000000000040ff0b in MockTestHarness::destroy_bucket (force=false, handle=0x7f55104f2aa0, this=0x64c740 ) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/programs/engine_testapp/engine_testapp.cc:1178 15:05:40 #23 execute_test (default_cfg=, engine=, test=...) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/programs/engine_testapp/engine_testapp.cc:1333 15:05:40 #24 main (argc=, argv=) at /home/couchbase/jenkins/workspace/kv_engine-linux-master-CE/kv_engine/programs/engine_testapp/engine_testapp.cc:1581 Change-Id: I70298a8337967c648280b0d86a96c08bf3a4008a Reviewed-on: http://review.couchbase.org/110146 Reviewed-by: Dave Rigby Tested-by: Build Bot --- engines/ep/src/kv_bucket.cc | 13 +++- .../module_tests/evp_store_durability_test.cc | 67 +++++++++++++++++++ 2 files changed, 77 insertions(+), 3 deletions(-) diff --git a/engines/ep/src/kv_bucket.cc b/engines/ep/src/kv_bucket.cc index 08f2891d25..915e8879c1 100644 --- a/engines/ep/src/kv_bucket.cc +++ b/engines/ep/src/kv_bucket.cc @@ -229,10 +229,10 @@ class RespondAmbiguousNotification : public GlobalTask { VBucketPtr& vb, std::vector cookies) : GlobalTask(&e, TaskId::RespondAmbiguousNotification, 0, false), - vbucket(vb), + weakVb(vb), cookies(cookies), description("Notify clients of Sync Write Ambiguous " + - vbucket->getId().to_string()) { + vb->getId().to_string()) { for (const auto* cookie : cookies) { if (!cookie) { throw std::invalid_argument( @@ -252,19 +252,26 @@ class RespondAmbiguousNotification : public GlobalTask { } bool run(void) { + auto vbucket = weakVb.lock(); + if (!vbucket) { + return false; + } + TRACE_EVENT1("ep-engine/task", "RespondAmbiguousNotification", "vb", (vbucket->getId()).get()); + for (const auto* cookie : cookies) { vbucket->notifyClientOfSyncWriteComplete( cookie, ENGINE_SYNC_WRITE_AMBIGUOUS); } + return false; } private: - VBucketPtr vbucket; + std::weak_ptr weakVb; std::vector cookies; const std::string description; }; diff --git a/engines/ep/tests/module_tests/evp_store_durability_test.cc b/engines/ep/tests/module_tests/evp_store_durability_test.cc index 995f82d5b2..9c135e4afb 100644 --- a/engines/ep/tests/module_tests/evp_store_durability_test.cc +++ b/engines/ep/tests/module_tests/evp_store_durability_test.cc @@ -75,6 +75,17 @@ class DurabilityEphemeralBucketTest : public STParameterizedBucketTest { void testPurgeCompletedPrepare(F& func); }; +/// Note - not single-threaded +class DurabilityRespondAmbiguousTest : public KVBucketTest { +protected: + void SetUp() override { + // The test should do the SetUp + } + void TearDown() override { + // The test should do the TearDown + }; +}; + void DurabilityEPBucketTest::testPersistPrepare(DocumentState docState) { setVBucketStateAndRunPersistTask( vbid, @@ -840,6 +851,62 @@ TEST_P(DurabilityBucketTest, TakeoverSendsDurabilityAmbiguous) { EXPECT_EQ(ENGINE_SYNC_WRITE_AMBIGUOUS, mockCookie->status); } +TEST_F(DurabilityRespondAmbiguousTest, RespondAmbiguousNotificationDeadLock) { + // Anecdotally this takes between 0.5 and 1s to run on my dev machine + // (MB Pro 2017 - PCIe SSD). The test typically hits the issue on the 1st + // run but sometimes takes up to 5. I didn't want to increase the number + // of iterations as the test will obviously take far longer to run. If + // this test ever causes a timeout - a deadlock issue (probably in the + // RespondAmbiguousNotification task) is present. + for (int i = 0; i < 100; i++) { + KVBucketTest::SetUp(); + + EXPECT_EQ(ENGINE_SUCCESS, + store->setVBucketState( + vbid, + vbucket_state_active, + {{"topology", + nlohmann::json::array({{"active", "replica"}})}})); + + auto key = makeStoredDocKey("key"); + using namespace cb::durability; + auto pending = makePendingItem(key, "value"); + + // Store it + EXPECT_EQ(ENGINE_EWOULDBLOCK, store->set(*pending, cookie)); + + // We don't send EWOULDBLOCK to clients + auto mockCookie = cookie_to_mock_object(cookie); + EXPECT_EQ(ENGINE_SUCCESS, mockCookie->status); + + // Set state to dead - this will schedule the task + EXPECT_EQ(ENGINE_SUCCESS, + store->setVBucketState(vbid, vbucket_state_dead)); + + // Deleting the vBucket will set the deferred deletion flag that + // causes deadlock when the RespondAmbiguousNotification task is + // destroyed as part of shutdown but is the last owner of the vBucket + // (attempts to schedule destruction and tries to recursively lock a + // mutex) + { + auto ptr = store->getVBucket(vbid); + store->deleteVBucket(vbid, nullptr); + } + + destroy_mock_event_callbacks(); + engine->getDcpConnMap().manageConnections(); + + // Should deadlock here in ~SynchronousEPEngine + engine.reset(); + + // The RespondAmbiguousNotification task requires our cookie to still be + // valid so delete it only after it has been destroyed + destroy_mock_cookie(cookie); + + ExecutorPool::shutdown(); + } +} + // Test that if a SyncWrite times out, then a subsequent SyncWrite which // _should_ fail does indeed fail. // (Regression test for part of MB-34367 - after using notify_IO_complete