From bfa78a53210c28ed108733bdd8eb508d317d9dd0 Mon Sep 17 00:00:00 2001 From: Igor Gaponenko Date: Fri, 6 Dec 2024 12:12:12 -0800 Subject: [PATCH 1/9] Eliminated the unused Controller's function of tracking duplicate requests This function that was provisioned in the original design of the Replication System has shown no benefits. Besides, the "duplicate" requests can't be reliably detected since the detection algorithm is time dependent. --- src/replica/apps/ControllerApp.cc | 10 +--- src/replica/apps/ControllerApp.h | 5 -- src/replica/jobs/CreateReplicaJob.cc | 3 +- src/replica/jobs/DeleteReplicaJob.cc | 3 +- src/replica/jobs/FixUpJob.cc | 3 +- src/replica/proto/protocol.proto | 7 --- src/replica/requests/DeleteRequest.cc | 23 +++------- src/replica/requests/DeleteRequest.h | 6 +-- src/replica/requests/DirectorIndexRequest.cc | 4 +- src/replica/requests/DisposeRequest.cc | 4 +- src/replica/requests/EchoRequest.cc | 4 +- src/replica/requests/FindAllRequest.cc | 3 +- src/replica/requests/FindRequest.cc | 4 +- src/replica/requests/ReplicationRequest.cc | 23 +++------- src/replica/requests/ReplicationRequest.h | 5 +- src/replica/requests/Request.cc | 7 +-- src/replica/requests/Request.h | 32 +------------ src/replica/requests/RequestMessenger.cc | 4 +- src/replica/requests/RequestMessenger.h | 3 +- .../requests/ServiceManagementRequestBase.cc | 3 +- src/replica/requests/SqlRequest.cc | 4 +- src/replica/requests/StatusRequest.cc | 3 +- src/replica/requests/StopRequest.cc | 3 +- src/replica/worker/WorkerProcessor.cc | 46 ------------------- 24 files changed, 38 insertions(+), 174 deletions(-) diff --git a/src/replica/apps/ControllerApp.cc b/src/replica/apps/ControllerApp.cc index 01c82a53d1..67d3b00178 100644 --- a/src/replica/apps/ControllerApp.cc +++ b/src/replica/apps/ControllerApp.cc @@ -129,11 +129,6 @@ void ControllerApp::_configureParser() { _cancelDelayMilliseconds) .option("priority", "The priority level of a request", _priority) .flag("do-not-track", "Do not 
track requests by waiting before they finish.", _doNotTrackRequest) - .flag("allow-duplicates", - "Allow requests which duplicate the previously made one. This applies" - " to requests which change the replica disposition at a worker, and only" - " for those requests which are still in the worker's queues.", - _allowDuplicates) .flag("do-not-save-replica", "The flag which (if used) prevents the application from saving replica info in a database." " This may significantly speed up the application in setups where the number of chunks is " @@ -415,11 +410,10 @@ int ControllerApp::runImpl() { request = ReplicationRequest::createAndStart( controller, _workerName, _sourceWorkerName, _databaseName, _chunkNumber, [](ReplicationRequest::Ptr const& request_) { request_->print(); }, _priority, - !_doNotTrackRequest, _allowDuplicates); + !_doNotTrackRequest); } else if ("DELETE" == _requestType) { request = DeleteRequest::createAndStart(controller, _workerName, _databaseName, _chunkNumber, - Request::defaultPrinter, _priority, !_doNotTrackRequest, - _allowDuplicates); + Request::defaultPrinter, _priority, !_doNotTrackRequest); } else if ("FIND" == _requestType) { request = FindRequest::createAndStart(controller, _workerName, _databaseName, _chunkNumber, Request::defaultPrinter, _priority, _computeCheckSum, diff --git a/src/replica/apps/ControllerApp.h b/src/replica/apps/ControllerApp.h index 5b6a100a0b..8fc349d4b2 100644 --- a/src/replica/apps/ControllerApp.h +++ b/src/replica/apps/ControllerApp.h @@ -171,11 +171,6 @@ class ControllerApp : public Application { /// Do not track requests waiting before they finish bool _doNotTrackRequest = false; - /// Allow requests which duplicate the previously made one. This applies - /// to requests which change the replica disposition at a worker, and only - /// for those requests which are still in the worker's queues. 
- bool _allowDuplicates = false; - /// Do not save the replica info in the database if set to 'true' bool _doNotSaveReplicaInfo = false; diff --git a/src/replica/jobs/CreateReplicaJob.cc b/src/replica/jobs/CreateReplicaJob.cc index 94a393ca8e..cf6c8e5c9f 100644 --- a/src/replica/jobs/CreateReplicaJob.cc +++ b/src/replica/jobs/CreateReplicaJob.cc @@ -203,14 +203,13 @@ void CreateReplicaJob::startImpl(replica::Lock const& lock) { // VERY IMPORTANT: the requests are sent for participating databases // only because some catalogs may not have a full coverage bool const keepTracking = true; - bool const allowDuplicate = true; for (auto&& replica : sourceReplicas) { _requests.push_back(ReplicationRequest::createAndStart( controller(), destinationWorker(), sourceWorker(), replica.database(), chunk(), [self = shared_from_base()](ReplicationRequest::Ptr ptr) { self->_onRequestFinish(ptr); }, - priority(), keepTracking, allowDuplicate, id())); + priority(), keepTracking, id())); } } diff --git a/src/replica/jobs/DeleteReplicaJob.cc b/src/replica/jobs/DeleteReplicaJob.cc index b9c2887279..b8ef4d7f61 100644 --- a/src/replica/jobs/DeleteReplicaJob.cc +++ b/src/replica/jobs/DeleteReplicaJob.cc @@ -235,14 +235,13 @@ void DeleteReplicaJob::_beginDeleteReplica(replica::Lock const& lock) { // VERY IMPORTANT: the requests are sent for participating databases // only because some catalogs may not have a full coverage bool const keepTracking = true; - bool const allowDuplicate = true; for (auto&& replica : _replicas) { _requests.push_back(DeleteRequest::createAndStart( controller(), workerName(), replica.database(), chunk(), [self = shared_from_base()](DeleteRequest::Ptr ptr) { self->_onRequestFinish(ptr); }, - priority(), keepTracking, allowDuplicate, id())); + priority(), keepTracking, id())); } } diff --git a/src/replica/jobs/FixUpJob.cc b/src/replica/jobs/FixUpJob.cc index 8aa67874e7..5393418c8f 100644 --- a/src/replica/jobs/FixUpJob.cc +++ b/src/replica/jobs/FixUpJob.cc @@ -257,7 
+257,6 @@ size_t FixUpJob::_launchNext(replica::Lock const& lock, string const& destinatio if (maxRequests == 0) return 0; auto&& tasks = _destinationWorker2tasks[destinationWorker]; bool const keepTracking = true; - bool const allowDuplicate = true; size_t numLaunched = 0; for (size_t i = 0; i < maxRequests; ++i) { if (tasks.size() == 0) break; @@ -270,7 +269,7 @@ size_t FixUpJob::_launchNext(replica::Lock const& lock, string const& destinatio [self = shared_from_base()](ReplicationRequest::Ptr ptr) { self->_onRequestFinish(ptr); }, - priority(), keepTracking, allowDuplicate, id())); + priority(), keepTracking, id())); tasks.pop(); numLaunched++; } diff --git a/src/replica/proto/protocol.proto b/src/replica/proto/protocol.proto index b50558de43..5e2fd54dd7 100644 --- a/src/replica/proto/protocol.proto +++ b/src/replica/proto/protocol.proto @@ -399,7 +399,6 @@ enum ProtocolStatusExt { NONE = 0; // unspecified problem INVALID_PARAM = 1; // invalid parameter(s) of a request INVALID_ID = 2; // an invalid request identifier - DUPLICATE = 3; // a duplicate request FOLDER_STAT = 4; // failed to obtain fstat() for a folder FOLDER_CREATE = 5; // failed to create a folder FILE_STAT = 6; // failed to obtain fstat() for a file @@ -487,9 +486,6 @@ message ProtocolResponseReplicate { /// Extended status of this operation optional ProtocolStatusExt status_ext = 2 [default = NONE]; - /// The field is set for duplicate requests only - optional string duplicate_request_id = 3 [default = ""]; - /// The performance of this operation required ProtocolPerformance performance = 4; @@ -519,9 +515,6 @@ message ProtocolResponseDelete { /// Extended status of this operation optional ProtocolStatusExt status_ext = 2 [default = NONE]; - /// The field is set for duplicate requests only - optional string duplicate_request_id = 3 [default = ""]; - /// The performance of this operation required ProtocolPerformance performance = 4; diff --git a/src/replica/requests/DeleteRequest.cc 
b/src/replica/requests/DeleteRequest.cc index d8f6163a65..9b3dfb9fe8 100644 --- a/src/replica/requests/DeleteRequest.cc +++ b/src/replica/requests/DeleteRequest.cc @@ -53,18 +53,18 @@ namespace lsst::qserv::replica { DeleteRequest::Ptr DeleteRequest::createAndStart(shared_ptr const& controller, string const& workerName, string const& database, unsigned int chunk, CallbackType const& onFinish, - int priority, bool keepTracking, bool allowDuplicate, - string const& jobId, unsigned int requestExpirationIvalSec) { - auto ptr = DeleteRequest::Ptr(new DeleteRequest(controller, workerName, database, chunk, onFinish, - priority, keepTracking, allowDuplicate)); + int priority, bool keepTracking, string const& jobId, + unsigned int requestExpirationIvalSec) { + auto ptr = DeleteRequest::Ptr( + new DeleteRequest(controller, workerName, database, chunk, onFinish, priority, keepTracking)); ptr->start(jobId, requestExpirationIvalSec); return ptr; } DeleteRequest::DeleteRequest(shared_ptr const& controller, string const& workerName, string const& database, unsigned int chunk, CallbackType const& onFinish, - int priority, bool keepTracking, bool allowDuplicate) - : RequestMessenger(controller, "REPLICA_DELETE", workerName, priority, keepTracking, allowDuplicate, + int priority, bool keepTracking) + : RequestMessenger(controller, "REPLICA_DELETE", workerName, priority, keepTracking, ::disposeRequired), _database(database), _chunk(chunk), @@ -117,7 +117,7 @@ void DeleteRequest::awaken(boost::system::error_code const& ec) { buffer()->serialize(hdr); ProtocolRequestTrack message; - message.set_id(remoteId()); + message.set_id(id()); message.set_queued_type(ProtocolQueuedRequestType::REPLICA_DELETE); buffer()->serialize(message); @@ -194,15 +194,6 @@ void DeleteRequest::_analyze(bool success, ProtocolResponseDelete const& message break; case ProtocolStatus::BAD: - // Special treatment of the duplicate requests if allowed - if (extendedServerStatus() == ProtocolStatusExt::DUPLICATE) { - 
setDuplicateRequestId(lock, message.duplicate_request_id()); - if (allowDuplicate() && keepTracking()) { - timer().expires_from_now(boost::posix_time::milliseconds(nextTimeIvalMsec())); - timer().async_wait(bind(&DeleteRequest::awaken, shared_from_base(), _1)); - return; - } - } finish(lock, SERVER_BAD); break; diff --git a/src/replica/requests/DeleteRequest.h b/src/replica/requests/DeleteRequest.h index e705c27f06..bab621f174 100644 --- a/src/replica/requests/DeleteRequest.h +++ b/src/replica/requests/DeleteRequest.h @@ -88,8 +88,8 @@ class DeleteRequest : public RequestMessenger { static Ptr createAndStart(std::shared_ptr const& controller, std::string const& workerName, std::string const& database, unsigned int chunk, CallbackType const& onFinish = nullptr, int priority = PRIORITY_NORMAL, - bool keepTracking = true, bool allowDuplicate = true, - std::string const& jobId = "", unsigned int requestExpirationIvalSec = 0); + bool keepTracking = true, std::string const& jobId = "", + unsigned int requestExpirationIvalSec = 0); protected: void startImpl(replica::Lock const& lock) final; @@ -101,7 +101,7 @@ class DeleteRequest : public RequestMessenger { private: DeleteRequest(std::shared_ptr const& controller, std::string const& workerName, std::string const& database, unsigned int chunk, CallbackType const& onFinish, int priority, - bool keepTracking, bool allowDuplicate); + bool keepTracking); /** * Send the serialized content of the buffer to a worker. 
diff --git a/src/replica/requests/DirectorIndexRequest.cc b/src/replica/requests/DirectorIndexRequest.cc index 0b469810fd..bc1b144ed9 100644 --- a/src/replica/requests/DirectorIndexRequest.cc +++ b/src/replica/requests/DirectorIndexRequest.cc @@ -48,7 +48,6 @@ namespace fs = boost::filesystem; namespace { LOG_LOGGER _log = LOG_GET("lsst.qserv.replica.DirectorIndexRequest"); -bool const allowDuplicateNo = false; bool const disposeRequired = true; } // namespace @@ -77,8 +76,7 @@ DirectorIndexRequest::DirectorIndexRequest(std::shared_ptr const& co string const& directorTable, unsigned int chunk, bool hasTransactions, TransactionId transactionId, CallbackType const& onFinish, int priority, bool keepTracking) - : RequestMessenger(controller, "INDEX", workerName, priority, keepTracking, ::allowDuplicateNo, - ::disposeRequired), + : RequestMessenger(controller, "INDEX", workerName, priority, keepTracking, ::disposeRequired), _database(database), _directorTable(directorTable), _chunk(chunk), diff --git a/src/replica/requests/DisposeRequest.cc b/src/replica/requests/DisposeRequest.cc index 699707158d..0f57b28d7e 100644 --- a/src/replica/requests/DisposeRequest.cc +++ b/src/replica/requests/DisposeRequest.cc @@ -41,7 +41,6 @@ using namespace std::placeholders; namespace { LOG_LOGGER _log = LOG_GET("lsst.qserv.replica.DisposeRequest"); -bool const allowDuplicateNo = false; bool const disposeRequiredNo = false; } // namespace @@ -83,8 +82,7 @@ DisposeRequest::Ptr DisposeRequest::createAndStart(shared_ptr const& DisposeRequest::DisposeRequest(shared_ptr const& controller, string const& workerName, std::vector const& targetIds, CallbackType const& onFinish, int priority, bool keepTracking) - : RequestMessenger(controller, "DISPOSE", workerName, priority, keepTracking, ::allowDuplicateNo, - ::disposeRequiredNo), + : RequestMessenger(controller, "DISPOSE", workerName, priority, keepTracking, ::disposeRequiredNo), _targetIds(targetIds), _onFinish(onFinish) {} diff --git 
a/src/replica/requests/EchoRequest.cc b/src/replica/requests/EchoRequest.cc index d713128e6a..a066445434 100644 --- a/src/replica/requests/EchoRequest.cc +++ b/src/replica/requests/EchoRequest.cc @@ -45,7 +45,6 @@ using namespace std::placeholders; namespace { LOG_LOGGER _log = LOG_GET("lsst.qserv.replica.EchoRequest"); -bool const allowDuplicateNo = false; bool const disposeRequired = true; } // namespace @@ -65,8 +64,7 @@ EchoRequest::Ptr EchoRequest::createAndStart(shared_ptr const& contr EchoRequest::EchoRequest(shared_ptr const& controller, string const& workerName, string const& data, uint64_t delay, CallbackType const& onFinish, int priority, bool keepTracking) - : RequestMessenger(controller, "TEST_ECHO", workerName, priority, keepTracking, ::allowDuplicateNo, - ::disposeRequired), + : RequestMessenger(controller, "TEST_ECHO", workerName, priority, keepTracking, ::disposeRequired), _data(data), _delay(delay), _onFinish(onFinish) {} diff --git a/src/replica/requests/FindAllRequest.cc b/src/replica/requests/FindAllRequest.cc index c3f501fdab..b4fa178fe7 100644 --- a/src/replica/requests/FindAllRequest.cc +++ b/src/replica/requests/FindAllRequest.cc @@ -45,7 +45,6 @@ using namespace std::placeholders; namespace { LOG_LOGGER _log = LOG_GET("lsst.qserv.replica.FindAllRequest"); -bool const allowDuplicateNo = false; bool const disposeRequired = true; } // namespace @@ -66,7 +65,7 @@ FindAllRequest::FindAllRequest(shared_ptr const& controller, string string const& database, bool saveReplicaInfo, CallbackType const& onFinish, int priority, bool keepTracking) : RequestMessenger(controller, "REPLICA_FIND_ALL", workerName, priority, keepTracking, - ::allowDuplicateNo, ::disposeRequired), + ::disposeRequired), _database(database), _saveReplicaInfo(saveReplicaInfo), _onFinish(onFinish) { diff --git a/src/replica/requests/FindRequest.cc b/src/replica/requests/FindRequest.cc index fc6d255474..4a23224d8f 100644 --- a/src/replica/requests/FindRequest.cc +++ 
b/src/replica/requests/FindRequest.cc @@ -46,7 +46,6 @@ using namespace std::placeholders; namespace { LOG_LOGGER _log = LOG_GET("lsst.qserv.replica.FindRequest"); -bool const allowDuplicateNo = false; bool const disposeRequired = true; } // namespace @@ -66,8 +65,7 @@ FindRequest::Ptr FindRequest::createAndStart(shared_ptr const& contr FindRequest::FindRequest(shared_ptr const& controller, string const& workerName, string const& database, unsigned int chunk, CallbackType const& onFinish, int priority, bool computeCheckSum, bool keepTracking) - : RequestMessenger(controller, "REPLICA_FIND", workerName, priority, keepTracking, ::allowDuplicateNo, - ::disposeRequired), + : RequestMessenger(controller, "REPLICA_FIND", workerName, priority, keepTracking, ::disposeRequired), _database(database), _chunk(chunk), _computeCheckSum(computeCheckSum), diff --git a/src/replica/requests/ReplicationRequest.cc b/src/replica/requests/ReplicationRequest.cc index f44d8e436a..4e9da5e10d 100644 --- a/src/replica/requests/ReplicationRequest.cc +++ b/src/replica/requests/ReplicationRequest.cc @@ -53,10 +53,9 @@ namespace lsst::qserv::replica { ReplicationRequest::Ptr ReplicationRequest::createAndStart( shared_ptr const& controller, string const& workerName, string const& sourceWorkerName, string const& database, unsigned int chunk, CallbackType const& onFinish, int priority, - bool keepTracking, bool allowDuplicate, string const& jobId, unsigned int requestExpirationIvalSec) { - auto ptr = ReplicationRequest::Ptr(new ReplicationRequest(controller, workerName, sourceWorkerName, - database, chunk, onFinish, priority, - keepTracking, allowDuplicate)); + bool keepTracking, string const& jobId, unsigned int requestExpirationIvalSec) { + auto ptr = ReplicationRequest::Ptr(new ReplicationRequest( + controller, workerName, sourceWorkerName, database, chunk, onFinish, priority, keepTracking)); ptr->start(jobId, requestExpirationIvalSec); return ptr; } @@ -64,8 +63,8 @@ ReplicationRequest::Ptr 
ReplicationRequest::createAndStart( ReplicationRequest::ReplicationRequest(shared_ptr const& controller, string const& workerName, string const& sourceWorkerName, string const& database, unsigned int chunk, CallbackType const& onFinish, int priority, - bool keepTracking, bool allowDuplicate) - : RequestMessenger(controller, "REPLICA_CREATE", workerName, priority, keepTracking, allowDuplicate, + bool keepTracking) + : RequestMessenger(controller, "REPLICA_CREATE", workerName, priority, keepTracking, ::disposeRequired), _database(database), _chunk(chunk), @@ -127,7 +126,7 @@ void ReplicationRequest::awaken(boost::system::error_code const& ec) { buffer()->serialize(hdr); ProtocolRequestTrack message; - message.set_id(remoteId()); + message.set_id(id()); message.set_queued_type(ProtocolQueuedRequestType::REPLICA_CREATE); buffer()->serialize(message); @@ -197,16 +196,6 @@ void ReplicationRequest::_analyze(bool success, ProtocolResponseReplicate const& keepTrackingOrFinish(lock, SERVER_IS_CANCELLING); break; case ProtocolStatus::BAD: - // Special treatment of the duplicate requests if allowed - if (extendedServerStatus() == ProtocolStatusExt::DUPLICATE) { - setDuplicateRequestId(lock, message.duplicate_request_id()); - if (allowDuplicate() && keepTracking()) { - timer().expires_from_now(boost::posix_time::milliseconds(nextTimeIvalMsec())); - timer().async_wait( - bind(&ReplicationRequest::awaken, shared_from_base(), _1)); - return; - } - } finish(lock, SERVER_BAD); break; case ProtocolStatus::FAILED: diff --git a/src/replica/requests/ReplicationRequest.h b/src/replica/requests/ReplicationRequest.h index 33943965f0..74c08c6043 100644 --- a/src/replica/requests/ReplicationRequest.h +++ b/src/replica/requests/ReplicationRequest.h @@ -94,8 +94,7 @@ class ReplicationRequest : public RequestMessenger { std::string const& sourceWorkerName, std::string const& database, unsigned int chunk, CallbackType const& onFinish = nullptr, int priority = PRIORITY_NORMAL, bool keepTracking = 
true, - bool allowDuplicate = true, std::string const& jobId = "", - unsigned int requestExpirationIvalSec = 0); + std::string const& jobId = "", unsigned int requestExpirationIvalSec = 0); /// @see Request::extendedPersistentState() std::list> extendedPersistentState() const override; @@ -109,7 +108,7 @@ class ReplicationRequest : public RequestMessenger { private: ReplicationRequest(std::shared_ptr const& controller, std::string const& workerName, std::string const& sourceWorkerName, std::string const& database, unsigned int chunk, - CallbackType const& onFinish, int priority, bool keepTracking, bool allowDuplicate); + CallbackType const& onFinish, int priority, bool keepTracking); /** * Send the serialized content of the buffer to a worker diff --git a/src/replica/requests/Request.cc b/src/replica/requests/Request.cc index 4f6c2a6173..1e000e3671 100644 --- a/src/replica/requests/Request.cc +++ b/src/replica/requests/Request.cc @@ -105,14 +105,13 @@ string Request::state2string(State state, ExtendedState extendedState, } Request::Request(shared_ptr const& controller, string const& type, string const& workerName, - int priority, bool keepTracking, bool allowDuplicate, bool disposeRequired) + int priority, bool keepTracking, bool disposeRequired) : _controller(controller), _type(type), _id(Generators::uniqueId()), _workerName(workerName), _priority(priority), _keepTracking(keepTracking), - _allowDuplicate(allowDuplicate), _disposeRequired(disposeRequired), _state(CREATED), _extendedState(NONE), @@ -149,8 +148,6 @@ string Request::context() const { "::" + replica::status2string(extendedServerStatus()) + " "; } -string const& Request::remoteId() const { return _duplicateRequestId.empty() ? 
_id : _duplicateRequestId; } - unsigned int Request::nextTimeIvalMsec() { auto result = _currentTimeIvalMsec; _currentTimeIvalMsec = min(2 * _currentTimeIvalMsec, 1000 * timerIvalSec()); @@ -170,9 +167,7 @@ string Request::toString(bool extended) const { << " worker: " << workerName() << "\n" << " priority: " << priority() << "\n" << " keepTracking: " << bool2str(keepTracking()) << "\n" - << " allowDuplicate: " << bool2str(allowDuplicate()) << "\n" << " disposeRequired: " << bool2str(disposeRequired()) << "\n" - << " remoteId: " << remoteId() << "\n" << " performance: " << performance() << "\n"; if (extended) { for (auto&& kv : extendedPersistentState()) { diff --git a/src/replica/requests/Request.h b/src/replica/requests/Request.h index 3f66b4ece6..82f47d65e5 100644 --- a/src/replica/requests/Request.h +++ b/src/replica/requests/Request.h @@ -62,7 +62,6 @@ namespace lsst::qserv::replica { * the request. The functin type is specific for each subclass. * @param priority The (optional) priority level of the request. * @param keepTracking The (optional) flagg to keep tracking the request before it finishes or fails. - * @param allowDuplicate (optional) Follow a previously made request if the current one duplicates it. * @param jobId The (optional) unique identifier of a job to which the request belongs. * @param requestExpirationIvalSec The (optional) time in seconds after which the request * will expire. The default value of '0' means an effective expiration time will be pull @@ -162,14 +161,6 @@ class Request : public std::enable_shared_from_this { /// @return a unique identifier of the request std::string const& id() const { return _id; } - /** - * Normally this is the same request as the one a request object is created with - * unless allowing to track duplicate requests (see constructor's options: 'keepTracking' - * and 'allowDuplicate') and after the one is found. - * @return an effective identifier of a remote (worker-side) request. 
- */ - std::string const& remoteId() const; - /// @return the priority level of the request int priority() const { return _priority; } @@ -264,7 +255,7 @@ class Request : public std::enable_shared_from_this { /** * Construct the request with the pointer to the services provider. * - * @note options 'keepTracking', 'allowDuplicate' and 'disposeRequired' + * @note options 'keepTracking' and 'disposeRequired' * have effect for specific request only. * * @param controller The Controller associated with the request. @@ -275,15 +266,12 @@ class Request : public std::enable_shared_from_this { * the request by the worker service. It may also affect an order requests * are processed locally. Higher number means higher priority. * @param keepTracking Keep tracking the request before it finishes or fails - * @param allowDuplicate Follow a previously made request if the current one - * duplicates it. * @param disposeRequired The flag indicating of the worker-side request * disposal is needed for a particular request. Normally, it's required for * requests which are queued by workers in its processing queues. */ Request(std::shared_ptr const& controller, std::string const& type, - std::string const& workerName, int priority, bool keepTracking, bool allowDuplicate, - bool disposeRequired); + std::string const& workerName, int priority, bool keepTracking, bool disposeRequired); /// @return A shared pointer of the desired subclass (no dynamic type checking) template @@ -311,9 +299,6 @@ class Request : public std::enable_shared_from_this { /// @return If 'true' then track request completion (queued requests only) bool keepTracking() const { return _keepTracking; } - /// @return If 'true' then follow a previously made request if the current one duplicates it. - bool allowDuplicate() const { return _allowDuplicate; } - /// @return If 'true' the request needs to be disposed at the worker's side upon /// a completion of an operation. 
bool disposeRequired() const { return _disposeRequired; } @@ -370,13 +355,6 @@ class Request : public std::enable_shared_from_this { _extendedServerStatus = status; } - /** - * Set an effective identifier of a remote (worker-side) request - * @param lock A lock on Request::_mtx must be acquired before calling this method. - * @param id An identifier to be set. - */ - void setDuplicateRequestId(replica::Lock const& lock, std::string const& id) { _duplicateRequestId = id; } - /** * This method is supposed to be provided by subclasses for additional * subclass-specific actions to begin processing the request. @@ -535,14 +513,8 @@ class Request : public std::enable_shared_from_this { int const _priority; bool const _keepTracking; - bool const _allowDuplicate; bool const _disposeRequired; - /// An effective identifier of a remote (worker-side) request where - /// this applies. Note that the duplicate requests are discovered - /// in a course of communication with worker services. - std::string _duplicateRequestId; - // 2-level state of a request std::atomic _state; diff --git a/src/replica/requests/RequestMessenger.cc b/src/replica/requests/RequestMessenger.cc index 032f68b364..cfb4e5e8a3 100644 --- a/src/replica/requests/RequestMessenger.cc +++ b/src/replica/requests/RequestMessenger.cc @@ -44,8 +44,8 @@ namespace lsst::qserv::replica { RequestMessenger::RequestMessenger(shared_ptr const& controller, string const& type, string const& workerName, int priority, bool keepTracking, - bool allowDuplicate, bool disposeRequired) - : Request(controller, type, workerName, priority, keepTracking, allowDuplicate, disposeRequired) {} + bool disposeRequired) + : Request(controller, type, workerName, priority, keepTracking, disposeRequired) {} void RequestMessenger::finishImpl(replica::Lock const& lock) { LOGS(_log, LOG_LVL_DEBUG, context() << __func__); diff --git a/src/replica/requests/RequestMessenger.h b/src/replica/requests/RequestMessenger.h index 13802907db..e91996c1d8 100644 
--- a/src/replica/requests/RequestMessenger.h +++ b/src/replica/requests/RequestMessenger.h @@ -70,8 +70,7 @@ class RequestMessenger : public Request { * @return A pointer to the created object. */ RequestMessenger(std::shared_ptr const& controller, std::string const& type, - std::string const& workerName, int priority, bool keepTracking, bool allowDuplicate, - bool disposeRequired); + std::string const& workerName, int priority, bool keepTracking, bool disposeRequired); /// @see Request::finishImpl() void finishImpl(replica::Lock const& lock) override; diff --git a/src/replica/requests/ServiceManagementRequestBase.cc b/src/replica/requests/ServiceManagementRequestBase.cc index 2919899bfb..f56a757b3b 100644 --- a/src/replica/requests/ServiceManagementRequestBase.cc +++ b/src/replica/requests/ServiceManagementRequestBase.cc @@ -57,7 +57,6 @@ void dumpRequestInfo(ostream& os, vector const& req } bool const keepTrackingNo = false; -bool const allowDuplicateNo = false; bool const disposeRequiredNo = false; } // namespace @@ -170,7 +169,7 @@ ServiceManagementRequestBase::ServiceManagementRequestBase(shared_ptr const& controller, std::string const& requestName, string const& workerName, uint64_t maxRows, int priority, bool keepTracking) - : RequestMessenger(controller, requestName, workerName, priority, keepTracking, ::allowDuplicateNo, - ::disposeRequired) { + : RequestMessenger(controller, requestName, workerName, priority, keepTracking, ::disposeRequired) { // Partial initialization of the request body's content. Other members // will be set in the request type-specific subclasses. 
requestBody.set_max_rows(maxRows); diff --git a/src/replica/requests/StatusRequest.cc b/src/replica/requests/StatusRequest.cc index 237ef3716c..1826e8c6a0 100644 --- a/src/replica/requests/StatusRequest.cc +++ b/src/replica/requests/StatusRequest.cc @@ -39,7 +39,6 @@ using namespace std; namespace { LOG_LOGGER _log = LOG_GET("lsst.qserv.replica.StatusRequest"); -bool const allowDuplicateNo = false; bool const disposeRequiredNo = false; } // namespace @@ -60,7 +59,7 @@ StatusRequest::StatusRequest(shared_ptr const& controller, string co string const& targetRequestId, CallbackType const& onFinish, int priority, bool keepTracking) : RequestMessenger(controller, "REQUEST_STATUS", workerName, priority, keepTracking, - ::allowDuplicateNo, ::disposeRequiredNo), + ::disposeRequiredNo), _targetRequestId(targetRequestId), _onFinish(onFinish) {} diff --git a/src/replica/requests/StopRequest.cc b/src/replica/requests/StopRequest.cc index 21dadc1e25..4f1bfd3a3d 100644 --- a/src/replica/requests/StopRequest.cc +++ b/src/replica/requests/StopRequest.cc @@ -36,7 +36,6 @@ using namespace std; namespace { LOG_LOGGER _log = LOG_GET("lsst.qserv.replica.StopRequest"); -bool const allowDuplicateNo = false; bool const disposeRequiredNo = false; } // namespace @@ -55,7 +54,7 @@ StopRequest::Ptr StopRequest::createAndStart(shared_ptr const& contr StopRequest::StopRequest(shared_ptr const& controller, string const& workerName, string const& targetRequestId, CallbackType const& onFinish, int priority, bool keepTracking) - : RequestMessenger(controller, "REQUEST_STOP", workerName, priority, keepTracking, ::allowDuplicateNo, + : RequestMessenger(controller, "REQUEST_STOP", workerName, priority, keepTracking, ::disposeRequiredNo), _targetRequestId(targetRequestId), _onFinish(onFinish) {} diff --git a/src/replica/worker/WorkerProcessor.cc b/src/replica/worker/WorkerProcessor.cc index d1ddca60fc..bbae7a6270 100644 --- a/src/replica/worker/WorkerProcessor.cc +++ 
b/src/replica/worker/WorkerProcessor.cc @@ -49,31 +49,7 @@ using namespace std::placeholders; using namespace lsst::qserv::replica; namespace { - LOG_LOGGER _log = LOG_GET("lsst.qserv.replica.WorkerProcessor"); - -template -bool ifDuplicateRequest(PROTOCOL_RESPONSE_TYPE& response, WorkerRequest::Ptr const& p, - PROTOCOL_REQUEST_TYPE const& request) { - bool isDuplicate = false; - - auto const ptr = dynamic_pointer_cast(p); - if (nullptr != ptr) { - isDuplicate = (ptr->database() == request.database()) and (ptr->chunk() == request.chunk()); - - } else { - auto const ptr = dynamic_pointer_cast(p); - if (nullptr != ptr) { - isDuplicate = (ptr->database() == request.database()) and (ptr->chunk() == request.chunk()); - } - } - if (isDuplicate) { - WorkerProcessor::setDefaultResponse(response, ProtocolStatus::BAD, ProtocolStatusExt::DUPLICATE); - response.set_duplicate_request_id(p->id()); - } - return isDuplicate; -} - } // namespace namespace lsst::qserv::replica { @@ -179,17 +155,6 @@ void WorkerProcessor::enqueueForReplication(string const& id, int32_t priority, replica::Lock lock(_mtx, _context(__func__)); - // Verify a scope of the request to ensure it won't duplicate or interfere (with) - // existing requests in the active (non-completed) queues. A reason why we're ignoring - // the completed is that this replica may have already been deleted from this worker. - - for (auto&& ptr : _newRequests) { - if (::ifDuplicateRequest(response, ptr, request)) return; - } - for (auto&& entry : _inProgressRequests) { - if (::ifDuplicateRequest(response, entry.second, request)) return; - } - // The code below may catch exceptions if other parameters of the request // won't pass further validation against the present configuration of the request // processing service. 
@@ -219,17 +184,6 @@ void WorkerProcessor::enqueueForDeletion(string const& id, int32_t priority, replica::Lock lock(_mtx, _context(__func__)); - // Verify a scope of the request to ensure it won't duplicate or interfere (with) - // existing requests in the active (non-completed) queues. A reason why we're ignoring - // the completed is that this replica may have already been deleted from this worker. - - for (auto&& ptr : _newRequests) { - if (::ifDuplicateRequest(response, ptr, request)) return; - } - for (auto&& entry : _inProgressRequests) { - if (::ifDuplicateRequest(response, entry.second, request)) return; - } - // The code below may catch exceptions if other parameters of the request // won't pass further validation against the present configuration of the request // processing service. From 24e9a34656f37320180df02eeaa0a44681f13af6 Mon Sep 17 00:00:00 2001 From: Igor Gaponenko Date: Mon, 9 Dec 2024 23:28:46 -0800 Subject: [PATCH 2/9] Eliminated an intermediate base class from the hierarchy of the Replication classes The class is now obsolete.
--- src/replica/requests/CMakeLists.txt | 1 - src/replica/requests/DeleteRequest.cc | 3 +- src/replica/requests/DeleteRequest.h | 4 +- src/replica/requests/DirectorIndexRequest.cc | 2 +- src/replica/requests/DirectorIndexRequest.h | 4 +- src/replica/requests/DisposeRequest.cc | 2 +- src/replica/requests/DisposeRequest.h | 4 +- src/replica/requests/EchoRequest.cc | 2 +- src/replica/requests/EchoRequest.h | 4 +- src/replica/requests/FindAllRequest.cc | 3 +- src/replica/requests/FindAllRequest.h | 4 +- src/replica/requests/FindRequest.cc | 2 +- src/replica/requests/FindRequest.h | 4 +- src/replica/requests/ReplicationRequest.cc | 3 +- src/replica/requests/ReplicationRequest.h | 4 +- src/replica/requests/Request.cc | 51 ++++++++- src/replica/requests/Request.h | 41 +++++-- src/replica/requests/RequestMessenger.cc | 102 ------------------ src/replica/requests/RequestMessenger.h | 97 ----------------- .../requests/ServiceManagementRequestBase.cc | 3 +- .../requests/ServiceManagementRequestBase.h | 4 +- src/replica/requests/SqlRequest.cc | 2 +- src/replica/requests/SqlRequest.h | 4 +- src/replica/requests/StatusRequest.cc | 3 +- src/replica/requests/StatusRequest.h | 4 +- src/replica/requests/StopRequest.cc | 3 +- src/replica/requests/StopRequest.h | 4 +- 27 files changed, 116 insertions(+), 248 deletions(-) delete mode 100644 src/replica/requests/RequestMessenger.cc delete mode 100644 src/replica/requests/RequestMessenger.h diff --git a/src/replica/requests/CMakeLists.txt b/src/replica/requests/CMakeLists.txt index 58ec36d006..2355791a97 100644 --- a/src/replica/requests/CMakeLists.txt +++ b/src/replica/requests/CMakeLists.txt @@ -11,7 +11,6 @@ target_sources(replica_requests PRIVATE MessengerConnector.cc ReplicationRequest.cc Request.cc - RequestMessenger.cc RequestTracker.cc ServiceManagementRequest.cc ServiceManagementRequestBase.cc diff --git a/src/replica/requests/DeleteRequest.cc b/src/replica/requests/DeleteRequest.cc index 9b3dfb9fe8..6465963e75 100644 --- 
a/src/replica/requests/DeleteRequest.cc +++ b/src/replica/requests/DeleteRequest.cc @@ -64,8 +64,7 @@ DeleteRequest::Ptr DeleteRequest::createAndStart(shared_ptr const& c DeleteRequest::DeleteRequest(shared_ptr const& controller, string const& workerName, string const& database, unsigned int chunk, CallbackType const& onFinish, int priority, bool keepTracking) - : RequestMessenger(controller, "REPLICA_DELETE", workerName, priority, keepTracking, - ::disposeRequired), + : Request(controller, "REPLICA_DELETE", workerName, priority, keepTracking, ::disposeRequired), _database(database), _chunk(chunk), _onFinish(onFinish) { diff --git a/src/replica/requests/DeleteRequest.h b/src/replica/requests/DeleteRequest.h index bab621f174..b83566ee8d 100644 --- a/src/replica/requests/DeleteRequest.h +++ b/src/replica/requests/DeleteRequest.h @@ -28,7 +28,7 @@ // Qserv headers #include "replica/proto/protocol.pb.h" -#include "replica/requests/RequestMessenger.h" +#include "replica/requests/Request.h" #include "replica/util/Common.h" #include "replica/util/ReplicaInfo.h" @@ -44,7 +44,7 @@ namespace lsst::qserv::replica { * Class DeleteRequest represents a transient state of the replica deletion * requests within the master controller for deleting replicas. 
*/ -class DeleteRequest : public RequestMessenger { +class DeleteRequest : public Request { public: typedef std::shared_ptr Ptr; diff --git a/src/replica/requests/DirectorIndexRequest.cc b/src/replica/requests/DirectorIndexRequest.cc index bc1b144ed9..fb7e99e169 100644 --- a/src/replica/requests/DirectorIndexRequest.cc +++ b/src/replica/requests/DirectorIndexRequest.cc @@ -76,7 +76,7 @@ DirectorIndexRequest::DirectorIndexRequest(std::shared_ptr const& co string const& directorTable, unsigned int chunk, bool hasTransactions, TransactionId transactionId, CallbackType const& onFinish, int priority, bool keepTracking) - : RequestMessenger(controller, "INDEX", workerName, priority, keepTracking, ::disposeRequired), + : Request(controller, "INDEX", workerName, priority, keepTracking, ::disposeRequired), _database(database), _directorTable(directorTable), _chunk(chunk), diff --git a/src/replica/requests/DirectorIndexRequest.h b/src/replica/requests/DirectorIndexRequest.h index 17d24e0141..71996cb476 100644 --- a/src/replica/requests/DirectorIndexRequest.h +++ b/src/replica/requests/DirectorIndexRequest.h @@ -30,7 +30,7 @@ // Qserv headers #include "replica/proto/protocol.pb.h" -#include "replica/requests/RequestMessenger.h" +#include "replica/requests/Request.h" #include "replica/util/Common.h" // Forward declarations @@ -56,7 +56,7 @@ std::ostream& operator<<(std::ostream& os, DirectorIndexRequestInfo const& info) * Class DirectorIndexRequest extracts and returns data to be loaded into * the "director" index. 
*/ -class DirectorIndexRequest : public RequestMessenger { +class DirectorIndexRequest : public Request { public: typedef std::shared_ptr Ptr; diff --git a/src/replica/requests/DisposeRequest.cc b/src/replica/requests/DisposeRequest.cc index 0f57b28d7e..ec37516bba 100644 --- a/src/replica/requests/DisposeRequest.cc +++ b/src/replica/requests/DisposeRequest.cc @@ -82,7 +82,7 @@ DisposeRequest::Ptr DisposeRequest::createAndStart(shared_ptr const& DisposeRequest::DisposeRequest(shared_ptr const& controller, string const& workerName, std::vector const& targetIds, CallbackType const& onFinish, int priority, bool keepTracking) - : RequestMessenger(controller, "DISPOSE", workerName, priority, keepTracking, ::disposeRequiredNo), + : Request(controller, "DISPOSE", workerName, priority, keepTracking, ::disposeRequiredNo), _targetIds(targetIds), _onFinish(onFinish) {} diff --git a/src/replica/requests/DisposeRequest.h b/src/replica/requests/DisposeRequest.h index cd1bbf13c9..f55b136a13 100644 --- a/src/replica/requests/DisposeRequest.h +++ b/src/replica/requests/DisposeRequest.h @@ -30,7 +30,7 @@ // Qserv headers #include "replica/proto/protocol.pb.h" -#include "replica/requests/RequestMessenger.h" +#include "replica/requests/Request.h" #include "replica/util/Common.h" // Forward declarations @@ -70,7 +70,7 @@ class DisposeRequestResult { * * @note Requests of this type don't have any persistent states. 
*/ -class DisposeRequest : public RequestMessenger { +class DisposeRequest : public Request { public: typedef std::shared_ptr Ptr; diff --git a/src/replica/requests/EchoRequest.cc b/src/replica/requests/EchoRequest.cc index a066445434..73e6900740 100644 --- a/src/replica/requests/EchoRequest.cc +++ b/src/replica/requests/EchoRequest.cc @@ -64,7 +64,7 @@ EchoRequest::Ptr EchoRequest::createAndStart(shared_ptr const& contr EchoRequest::EchoRequest(shared_ptr const& controller, string const& workerName, string const& data, uint64_t delay, CallbackType const& onFinish, int priority, bool keepTracking) - : RequestMessenger(controller, "TEST_ECHO", workerName, priority, keepTracking, ::disposeRequired), + : Request(controller, "TEST_ECHO", workerName, priority, keepTracking, ::disposeRequired), _data(data), _delay(delay), _onFinish(onFinish) {} diff --git a/src/replica/requests/EchoRequest.h b/src/replica/requests/EchoRequest.h index 870d353b49..beb373a2c0 100644 --- a/src/replica/requests/EchoRequest.h +++ b/src/replica/requests/EchoRequest.h @@ -29,7 +29,7 @@ // Qserv headers #include "replica/proto/protocol.pb.h" -#include "replica/requests/RequestMessenger.h" +#include "replica/requests/Request.h" #include "replica/util/Common.h" // Forward declarations @@ -45,7 +45,7 @@ namespace lsst::qserv::replica { * the controller-worker protocol and the worker-side framework. * These requests have no side effects. 
*/ -class EchoRequest : public RequestMessenger { +class EchoRequest : public Request { public: typedef std::shared_ptr Ptr; diff --git a/src/replica/requests/FindAllRequest.cc b/src/replica/requests/FindAllRequest.cc index b4fa178fe7..a03b9d928f 100644 --- a/src/replica/requests/FindAllRequest.cc +++ b/src/replica/requests/FindAllRequest.cc @@ -64,8 +64,7 @@ FindAllRequest::Ptr FindAllRequest::createAndStart(shared_ptr const& FindAllRequest::FindAllRequest(shared_ptr const& controller, string const& workerName, string const& database, bool saveReplicaInfo, CallbackType const& onFinish, int priority, bool keepTracking) - : RequestMessenger(controller, "REPLICA_FIND_ALL", workerName, priority, keepTracking, - ::disposeRequired), + : Request(controller, "REPLICA_FIND_ALL", workerName, priority, keepTracking, ::disposeRequired), _database(database), _saveReplicaInfo(saveReplicaInfo), _onFinish(onFinish) { diff --git a/src/replica/requests/FindAllRequest.h b/src/replica/requests/FindAllRequest.h index 41a75582c2..0e20575892 100644 --- a/src/replica/requests/FindAllRequest.h +++ b/src/replica/requests/FindAllRequest.h @@ -28,7 +28,7 @@ // Qserv headers #include "replica/proto/protocol.pb.h" -#include "replica/requests/RequestMessenger.h" +#include "replica/requests/Request.h" #include "replica/util/Common.h" #include "replica/util/ReplicaInfo.h" @@ -44,7 +44,7 @@ namespace lsst::qserv::replica { * Class FindAllRequest represents known replicas lookup requests within * the master controller. 
*/ -class FindAllRequest : public RequestMessenger { +class FindAllRequest : public Request { public: typedef std::shared_ptr Ptr; diff --git a/src/replica/requests/FindRequest.cc b/src/replica/requests/FindRequest.cc index 4a23224d8f..29b3dfd1e2 100644 --- a/src/replica/requests/FindRequest.cc +++ b/src/replica/requests/FindRequest.cc @@ -65,7 +65,7 @@ FindRequest::Ptr FindRequest::createAndStart(shared_ptr const& contr FindRequest::FindRequest(shared_ptr const& controller, string const& workerName, string const& database, unsigned int chunk, CallbackType const& onFinish, int priority, bool computeCheckSum, bool keepTracking) - : RequestMessenger(controller, "REPLICA_FIND", workerName, priority, keepTracking, ::disposeRequired), + : Request(controller, "REPLICA_FIND", workerName, priority, keepTracking, ::disposeRequired), _database(database), _chunk(chunk), _computeCheckSum(computeCheckSum), diff --git a/src/replica/requests/FindRequest.h b/src/replica/requests/FindRequest.h index 686d1b5e86..d2f55b506c 100644 --- a/src/replica/requests/FindRequest.h +++ b/src/replica/requests/FindRequest.h @@ -28,7 +28,7 @@ // Qserv headers #include "replica/proto/protocol.pb.h" -#include "replica/requests/RequestMessenger.h" +#include "replica/requests/Request.h" #include "replica/util/Common.h" #include "replica/util/ReplicaInfo.h" @@ -44,7 +44,7 @@ namespace lsst::qserv::replica { * Class FindRequest represents a transient state of the replica lookup * requests within the master controller for deleting replicas. 
*/ -class FindRequest : public RequestMessenger { +class FindRequest : public Request { public: typedef std::shared_ptr Ptr; diff --git a/src/replica/requests/ReplicationRequest.cc b/src/replica/requests/ReplicationRequest.cc index 4e9da5e10d..0660d042ca 100644 --- a/src/replica/requests/ReplicationRequest.cc +++ b/src/replica/requests/ReplicationRequest.cc @@ -64,8 +64,7 @@ ReplicationRequest::ReplicationRequest(shared_ptr const& controller, string const& sourceWorkerName, string const& database, unsigned int chunk, CallbackType const& onFinish, int priority, bool keepTracking) - : RequestMessenger(controller, "REPLICA_CREATE", workerName, priority, keepTracking, - ::disposeRequired), + : Request(controller, "REPLICA_CREATE", workerName, priority, keepTracking, ::disposeRequired), _database(database), _chunk(chunk), _sourceWorkerName(sourceWorkerName), diff --git a/src/replica/requests/ReplicationRequest.h b/src/replica/requests/ReplicationRequest.h index 74c08c6043..1cdbdaf379 100644 --- a/src/replica/requests/ReplicationRequest.h +++ b/src/replica/requests/ReplicationRequest.h @@ -31,7 +31,7 @@ // Qserv headers #include "replica/proto/protocol.pb.h" -#include "replica/requests/RequestMessenger.h" +#include "replica/requests/Request.h" #include "replica/util/Common.h" #include "replica/util/ReplicaInfo.h" @@ -47,7 +47,7 @@ namespace lsst::qserv::replica { * Class ReplicationRequest represents a transient state of requests * within the master controller for creating replicas. 
*/ -class ReplicationRequest : public RequestMessenger { +class ReplicationRequest : public Request { public: typedef std::shared_ptr Ptr; diff --git a/src/replica/requests/Request.cc b/src/replica/requests/Request.cc index 1e000e3671..b7d7fb417a 100644 --- a/src/replica/requests/Request.cc +++ b/src/replica/requests/Request.cc @@ -34,6 +34,7 @@ // Qserv headers #include "replica/config/Configuration.h" #include "replica/contr/Controller.h" +#include "replica/requests/Messenger.h" #include "replica/services/ServiceProvider.h" #include "replica/util/ProtocolBuffer.h" @@ -283,8 +284,34 @@ void Request::finish(replica::Lock const& lock, ExtendedState extendedState) { // Stop the timer if the one is still running _requestExpirationTimer.cancel(); - // Let a subclass to run its own finalization if needed - finishImpl(lock); + // Make sure the request (if any) has been eliminated from the messenger. + // This operation is unnecessary if the request has successfully finished, + // in which case it's guaranteed that no outstanding message for the request + // will be at the messenger's queue. This optimization also reduces extra + // locking (and delays) in the messenger because the operation is synchronized. + if (extendedState != Request::ExtendedState::SUCCESS) { + controller()->serviceProvider()->messenger()->cancel(workerName(), id()); + } + + // Tell the worker to dispose the request if a subclass made such requirement, + // and only if the request has successfully finished. This will remove the request + // from the worker's "finished" queue and release memory taken by the request + // much earlier than after request expiration deadline. 
+ // Don't dispose requests in other states since any such actions may result in + // unnecessary increase of the traffic on a communication channel with the worker + // and increase processing latency (and increasing a probability of running into + // the Controller side timeouts while waiting for the completion of the requests) + // of the on-going or queued requests. + // Requests in other states ended up at workers would be automatically disposed + // by workers after requests's expiration deadlines. + if (disposeRequired() && (extendedState == Request::ExtendedState::SUCCESS)) { + // Don't require any callback notification for the completion of + // the operation. This will also prevent incrementing a shared pointer + // counter for the current object. + dispose(lock, priority(), nullptr); + } + + // Notify a subscriber (if any) about the completion of the request. notify(lock); // Unblock threads (if any) waiting on the synchronization call @@ -322,6 +349,26 @@ void Request::setState(replica::Lock const& lock, State newState, ExtendedState savePersistentState(lock); } +void Request::dispose(replica::Lock const& lock, int priority, OnDisposeCallbackType const& onFinish) { + LOGS(_log, LOG_LVL_DEBUG, context() << __func__); + + buffer()->resize(); + + ProtocolRequestHeader hdr; + hdr.set_id(id()); + hdr.set_type(ProtocolRequestHeader::REQUEST); + hdr.set_management_type(ProtocolManagementRequestType::REQUEST_DISPOSE); + hdr.set_instance_id(controller()->serviceProvider()->instanceId()); + + buffer()->serialize(hdr); + ProtocolRequestDispose message; + message.add_ids(id()); + buffer()->serialize(message); + + controller()->serviceProvider()->messenger()->send(workerName(), id(), priority, + buffer(), onFinish); +} + boost::asio::io_service& Request::_ioService() { return controller()->serviceProvider()->io_service(); } } // namespace lsst::qserv::replica diff --git a/src/replica/requests/Request.h b/src/replica/requests/Request.h index 82f47d65e5..a14d72f122 
100644 --- a/src/replica/requests/Request.h +++ b/src/replica/requests/Request.h @@ -24,6 +24,7 @@ // System headers #include #include +#include #include #include #include @@ -252,6 +253,17 @@ class Request : public std::enable_shared_from_this { static void defaultPrinter(Ptr const& ptr) { ptr->print(std::cout, true); } protected: + /** + * The callback type for notifications on completion of the request + * disposal operation. The first parameter (std::string const&) of the callback + * is the unique identifier of a request, the second parameter (bool) is a flag + * indicating a success or a failure of the operation, and the last parameter + * (ProtocolResponseDispose const&) represents a result of the operation reported + * by the worker service. + */ + typedef std::function + OnDisposeCallbackType; + /** * Construct the request with the pointer to the services provider. * @@ -379,13 +391,6 @@ class Request : public std::enable_shared_from_this { */ void finish(replica::Lock const& lock, ExtendedState extendedState); - /** - * This method is supposed to be provided by subclasses - * to finalize request processing as required by the subclass. - * @param lock A lock on Request::_mtx must be acquired before calling this method. - */ - virtual void finishImpl(replica::Lock const& lock) = 0; - /** * This method is supposed to be provided by subclasses to save the request's * state into a database. @@ -438,6 +443,22 @@ class Request : public std::enable_shared_from_this { */ void setState(replica::Lock const& lock, State state, ExtendedState extendedStat = ExtendedState::NONE); + /** + * Initiate the request disposal at the worker server. This method is automatically + * called upon successful completion of requests for which the flag 'disposeRequired' + * was set during request object construction.
However, the streaming requests + * that are designed to make more than one trip to the worker under the same request + * identifier may also explicitly call this method upon completing intermediate + * requests. That is normally done to expedite the garbage collection of the worker + * requests and prevent excessive memory build up (or keeping other resources) + * at the worker. + * @param lock The lock on Request::_mtx must be acquired before calling this method. + * @param priority The desired priority level of the operation. + * @param onFinish The optional callback to be called upon the completion of + * the request disposal operation. + */ + void dispose(replica::Lock const& lock, int priority, OnDisposeCallbackType const& onFinish = nullptr); + /** * This method will begin an optional user protocol upon a completion * of a job (if any user-supplied callback function was provided). @@ -501,6 +522,12 @@ class Request : public std::enable_shared_from_this { /// @return The global IO service object retreived from the service provider boost::asio::io_service& _ioService(); + /** + * This method finalizes request processing. + * @param lock A lock on Request::_mtx must be acquired before calling this method. + */ + void finishImpl(replica::Lock const& lock); + /// The global counter for the number of instances of any subclasses static std::atomic _numClassInstances; diff --git a/src/replica/requests/RequestMessenger.cc b/src/replica/requests/RequestMessenger.cc deleted file mode 100644 index cfb4e5e8a3..0000000000 --- a/src/replica/requests/RequestMessenger.cc +++ /dev/null @@ -1,102 +0,0 @@ -/* - * LSST Data Management System - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). 
- * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . - */ - -// Class header -#include "replica/requests/RequestMessenger.h" - -// Qserv headers -#include "replica/contr/Controller.h" -#include "replica/proto/protocol.pb.h" -#include "replica/requests/Messenger.h" -#include "replica/services/ServiceProvider.h" -#include "replica/util/ProtocolBuffer.h" - -// LSST headers -#include "lsst/log/Log.h" - -using namespace std; - -namespace { - -LOG_LOGGER _log = LOG_GET("lsst.qserv.replica.RequestMessenger"); - -} // namespace - -namespace lsst::qserv::replica { - -RequestMessenger::RequestMessenger(shared_ptr const& controller, string const& type, - string const& workerName, int priority, bool keepTracking, - bool disposeRequired) - : Request(controller, type, workerName, priority, keepTracking, disposeRequired) {} - -void RequestMessenger::finishImpl(replica::Lock const& lock) { - LOGS(_log, LOG_LVL_DEBUG, context() << __func__); - - // Make sure the request (if any) has been eliminated from the messenger. - // This operation is unnecessary if the request has successfully finished, - // in which case it's guaranteed that no outstanding message for the request - // will be at the messenger's queue. This optimization also reduces extra - // locking (and delays) in the messenger because the operation is synchronized. 
- if (extendedState() != Request::ExtendedState::SUCCESS) { - controller()->serviceProvider()->messenger()->cancel(workerName(), id()); - } - - // Tell the worker to dispose the request if a subclass made such requirement, - // and only if the request has successfully finished. This will remove the request - // from the worker's "finished" queue and release memory taken by the request - // much earlier than after request expiration deadline. - // Don't dispose requests in other states since any such actions may result in - // unnecessary increase of the traffic on a communication channel with the worker - // and increase processing latency (and increasing a probability of running into - // the Controller side timeouts while waiting for the completion of the requests) - // of the on-going or queued requests. - // Requests in other states ended up at workers would be automatically disposed - // by workers after requests's expiration deadlines. - if (disposeRequired() && (extendedState() == Request::ExtendedState::SUCCESS)) { - // Don't require any callback notification for the completion of - // the operation. This will also prevent incrementing a shared pointer - // counter for the current object. 
- dispose(lock, priority(), nullptr); - } -} - -void RequestMessenger::dispose(replica::Lock const& lock, int priority, - OnDisposeCallbackType const& onFinish) { - LOGS(_log, LOG_LVL_DEBUG, context() << __func__); - - buffer()->resize(); - - ProtocolRequestHeader hdr; - hdr.set_id(id()); - hdr.set_type(ProtocolRequestHeader::REQUEST); - hdr.set_management_type(ProtocolManagementRequestType::REQUEST_DISPOSE); - hdr.set_instance_id(controller()->serviceProvider()->instanceId()); - - buffer()->serialize(hdr); - ProtocolRequestDispose message; - message.add_ids(id()); - buffer()->serialize(message); - - controller()->serviceProvider()->messenger()->send(workerName(), id(), priority, - buffer(), onFinish); -} - -} // namespace lsst::qserv::replica diff --git a/src/replica/requests/RequestMessenger.h b/src/replica/requests/RequestMessenger.h deleted file mode 100644 index e91996c1d8..0000000000 --- a/src/replica/requests/RequestMessenger.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * LSST Data Management System - * - * This product includes software developed by the - * LSST Project (http://www.lsst.org/). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the LSST License Statement and - * the GNU General Public License along with this program. If not, - * see . 
- */ -#ifndef LSST_QSERV_REPLICA_REQUESTMESSENGER_H -#define LSST_QSERV_REPLICA_REQUESTMESSENGER_H - -// System headers -#include -#include -#include - -// Qserv headers -#include "replica/proto/protocol.pb.h" -#include "replica/requests/Request.h" -#include "replica/util/Common.h" -#include "replica/util/Mutex.h" - -// Forward declarations -namespace lsst::qserv::replica { -class Controller; -} // namespace lsst::qserv::replica - -// This header declarations -namespace lsst::qserv::replica { - -/** - * Class RequestMessenger is a base class for a family of requests within - * the replication Controller server. - */ -class RequestMessenger : public Request { -public: - /// The pointer type for instances of the class - typedef std::shared_ptr Ptr; - - /// The callaback type for notifications on completion of the request - /// disposal operation. The first parameter (std::string const&) of the callback - /// is the unique identifier of a request, the second parameter (bool) is a flag - /// indicating a success or a failure of the operation, and the last parameter - /// (ProtocolResponseDispose const&) represents a result of the operation reported - /// by the worker service. - typedef std::function - OnDisposeCallbackType; - - RequestMessenger() = delete; - RequestMessenger(RequestMessenger const&) = delete; - RequestMessenger& operator=(RequestMessenger const&) = delete; - - ~RequestMessenger() override = default; - -protected: - /** - * Construct the request with the pointer to the services provider. - * @return A pointer to the created object. - */ - RequestMessenger(std::shared_ptr const& controller, std::string const& type, - std::string const& workerName, int priority, bool keepTracking, bool disposeRequired); - - /// @see Request::finishImpl() - void finishImpl(replica::Lock const& lock) override; - - /** - * Initiate the request disposal at the worker server. 
This method is automatically - * called upon succesfull completion of requests for which the flag 'disposeRequired' - * was set during request object construction. However, the streaming requests - * that are designed to make more than one trip to the worker under the same request - * identifier may also explicitly call this method upon completing intermediate - * requests. That is normally done to expedite the garbage collection of the worker - * requests and prevent excessive memory build up (or keeping other resources) - * at the worker. - * @param lock The lock on Request::_mtx must be acquired before calling this method. - * @param priority The desired priority level of the operation. - * @param onFinish The optional callback to be called upon the completion of - * the request disposal operation. - */ - void dispose(replica::Lock const& lock, int priority, OnDisposeCallbackType const& onFinish = nullptr); -}; - -} // namespace lsst::qserv::replica - -#endif // LSST_QSERV_REPLICA_REQUESTMESSENGER_H diff --git a/src/replica/requests/ServiceManagementRequestBase.cc b/src/replica/requests/ServiceManagementRequestBase.cc index f56a757b3b..f5d541d315 100644 --- a/src/replica/requests/ServiceManagementRequestBase.cc +++ b/src/replica/requests/ServiceManagementRequestBase.cc @@ -168,8 +168,7 @@ ServiceManagementRequestBase::ServiceManagementRequestBase(shared_ptr Ptr; diff --git a/src/replica/requests/SqlRequest.cc b/src/replica/requests/SqlRequest.cc index 04c951226d..7a24d9edc4 100644 --- a/src/replica/requests/SqlRequest.cc +++ b/src/replica/requests/SqlRequest.cc @@ -70,7 +70,7 @@ void SqlRequest::extendedPrinter(Ptr const& ptr) { SqlRequest::SqlRequest(shared_ptr const& controller, std::string const& requestName, string const& workerName, uint64_t maxRows, int priority, bool keepTracking) - : RequestMessenger(controller, requestName, workerName, priority, keepTracking, ::disposeRequired) { + : Request(controller, requestName, workerName, priority, keepTracking, 
::disposeRequired) { // Partial initialization of the request body's content. Other members // will be set in the request type-specific subclasses. requestBody.set_max_rows(maxRows); diff --git a/src/replica/requests/SqlRequest.h b/src/replica/requests/SqlRequest.h index a7b4dd3439..e73e1836de 100644 --- a/src/replica/requests/SqlRequest.h +++ b/src/replica/requests/SqlRequest.h @@ -31,7 +31,7 @@ // Qserv headers #include "replica/proto/protocol.pb.h" -#include "replica/requests/RequestMessenger.h" +#include "replica/requests/Request.h" #include "replica/requests/SqlResultSet.h" #include "replica/util/Common.h" @@ -53,7 +53,7 @@ namespace lsst::qserv::replica { * In case of a successful completion of a request an object of this request class * will receive a result set (if any) of the query. */ -class SqlRequest : public RequestMessenger { +class SqlRequest : public Request { public: typedef std::shared_ptr Ptr; diff --git a/src/replica/requests/StatusRequest.cc b/src/replica/requests/StatusRequest.cc index 1826e8c6a0..daa21ff25e 100644 --- a/src/replica/requests/StatusRequest.cc +++ b/src/replica/requests/StatusRequest.cc @@ -58,8 +58,7 @@ StatusRequest::Ptr StatusRequest::createAndStart(shared_ptr const& c StatusRequest::StatusRequest(shared_ptr const& controller, string const& workerName, string const& targetRequestId, CallbackType const& onFinish, int priority, bool keepTracking) - : RequestMessenger(controller, "REQUEST_STATUS", workerName, priority, keepTracking, - ::disposeRequiredNo), + : Request(controller, "REQUEST_STATUS", workerName, priority, keepTracking, ::disposeRequiredNo), _targetRequestId(targetRequestId), _onFinish(onFinish) {} diff --git a/src/replica/requests/StatusRequest.h b/src/replica/requests/StatusRequest.h index 150f52cb81..4c3c3ea6bb 100644 --- a/src/replica/requests/StatusRequest.h +++ b/src/replica/requests/StatusRequest.h @@ -29,7 +29,7 @@ // Qserv headers #include "replica/proto/protocol.pb.h" -#include 
"replica/requests/RequestMessenger.h" +#include "replica/requests/Request.h" #include "replica/util/Common.h" #include "replica/util/Performance.h" @@ -44,7 +44,7 @@ namespace lsst::qserv::replica { /** * Class StatusRequest is used for checking a status of the previously submitted requests. */ -class StatusRequest : public RequestMessenger { +class StatusRequest : public Request { public: typedef std::shared_ptr Ptr; diff --git a/src/replica/requests/StopRequest.cc b/src/replica/requests/StopRequest.cc index 4f1bfd3a3d..9837abc6ad 100644 --- a/src/replica/requests/StopRequest.cc +++ b/src/replica/requests/StopRequest.cc @@ -54,8 +54,7 @@ StopRequest::Ptr StopRequest::createAndStart(shared_ptr const& contr StopRequest::StopRequest(shared_ptr const& controller, string const& workerName, string const& targetRequestId, CallbackType const& onFinish, int priority, bool keepTracking) - : RequestMessenger(controller, "REQUEST_STOP", workerName, priority, keepTracking, - ::disposeRequiredNo), + : Request(controller, "REQUEST_STOP", workerName, priority, keepTracking, ::disposeRequiredNo), _targetRequestId(targetRequestId), _onFinish(onFinish) {} diff --git a/src/replica/requests/StopRequest.h b/src/replica/requests/StopRequest.h index 72ddbd6f14..46d63c4bba 100644 --- a/src/replica/requests/StopRequest.h +++ b/src/replica/requests/StopRequest.h @@ -29,7 +29,7 @@ // Qserv headers #include "replica/proto/protocol.pb.h" -#include "replica/requests/RequestMessenger.h" +#include "replica/requests/Request.h" #include "replica/util/Common.h" // Forward declarations @@ -43,7 +43,7 @@ namespace lsst::qserv::replica { /** * Class StopRequest is used for canceling the previously submitted requests. 
*/ -class StopRequest : public RequestMessenger { +class StopRequest : public Request { public: typedef std::shared_ptr Ptr; From 2e066ef7d9c34a42cbaee9ed5789c3c4f77fede4 Mon Sep 17 00:00:00 2001 From: Igor Gaponenko Date: Tue, 10 Dec 2024 20:04:37 -0800 Subject: [PATCH 3/9] Incremented the version number of the REST API to 40 --- src/admin/python/lsst/qserv/admin/replicationInterface.py | 2 +- src/http/ChttpMetaModule.cc | 2 +- src/http/MetaModule.cc | 2 +- src/www/qserv/js/Common.js | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/admin/python/lsst/qserv/admin/replicationInterface.py b/src/admin/python/lsst/qserv/admin/replicationInterface.py index ad33a3e233..d36d4edf52 100644 --- a/src/admin/python/lsst/qserv/admin/replicationInterface.py +++ b/src/admin/python/lsst/qserv/admin/replicationInterface.py @@ -41,7 +41,7 @@ chunk_info_file = "chunk_info.json" -repl_api_version = 39 +repl_api_version = 40 _log = logging.getLogger(__name__) diff --git a/src/http/ChttpMetaModule.cc b/src/http/ChttpMetaModule.cc index 7495fe206a..7b1ef608c0 100644 --- a/src/http/ChttpMetaModule.cc +++ b/src/http/ChttpMetaModule.cc @@ -37,7 +37,7 @@ string const adminAuthKey; namespace lsst::qserv::http { -unsigned int const ChttpMetaModule::version = 39; +unsigned int const ChttpMetaModule::version = 40; void ChttpMetaModule::process(string const& context, nlohmann::json const& info, httplib::Request const& req, httplib::Response& resp, string const& subModuleName) { diff --git a/src/http/MetaModule.cc b/src/http/MetaModule.cc index 2c02d2a230..a230b94a72 100644 --- a/src/http/MetaModule.cc +++ b/src/http/MetaModule.cc @@ -37,7 +37,7 @@ string const adminAuthKey; namespace lsst::qserv::http { -unsigned int const MetaModule::version = 39; +unsigned int const MetaModule::version = 40; void MetaModule::process(string const& context, nlohmann::json const& info, shared_ptr const& req, shared_ptr const& resp, diff --git a/src/www/qserv/js/Common.js 
b/src/www/qserv/js/Common.js index 0241534258..e5fe56b737 100644 --- a/src/www/qserv/js/Common.js +++ b/src/www/qserv/js/Common.js @@ -6,7 +6,7 @@ function(sqlFormatter, _) { class Common { - static RestAPIVersion = 39; + static RestAPIVersion = 40; static query2text(query, expanded) { if (expanded) { if (query.length > Common._max_expanded_length) { From 6b35e6c08a147973324661f3f45886552c2a0742 Mon Sep 17 00:00:00 2001 From: Igor Gaponenko Date: Thu, 12 Dec 2024 11:01:49 -0800 Subject: [PATCH 4/9] Added configuration parameters for the HTTP-based worker replication service Extended Replication Worker's config page on the Web Dashboard to display connection parameters of the HTTP-based worker management services. --- src/replica/apps/ConfigAppBase.cc | 3 ++ src/replica/config/ConfigTestData.cc | 10 +++++ src/replica/config/ConfigWorker.cc | 10 ++++- src/replica/config/ConfigWorker.h | 3 ++ src/replica/config/ConfigurationSchema.cc | 45 ++++++++++---------- src/replica/registry/Registry.cc | 5 +++ src/replica/tests/testConfiguration.cc | 15 +++++++ src/www/qserv/js/ReplicationConfigWorkers.js | 7 +++ 8 files changed, 73 insertions(+), 25 deletions(-) diff --git a/src/replica/apps/ConfigAppBase.cc b/src/replica/apps/ConfigAppBase.cc index 24de288ae4..46de0f4d2d 100644 --- a/src/replica/apps/ConfigAppBase.cc +++ b/src/replica/apps/ConfigAppBase.cc @@ -67,6 +67,7 @@ void ConfigAppBase::dumpWorkersAsTable(string const& indent, string const& capti vector isReadOnly; vector dataDir; vector svcHostPort; + vector httpSvcHostPort; vector fsHostPort; vector loaderHostPort; vector loaderTmpDir; @@ -82,6 +83,7 @@ void ConfigAppBase::dumpWorkersAsTable(string const& indent, string const& capti isReadOnly.push_back(worker.isReadOnly ?
"yes" : "no"); dataDir.push_back(worker.dataDir); svcHostPort.push_back(worker.svcHost.addr + ":" + to_string(worker.svcPort)); + httpSvcHostPort.push_back(worker.httpSvcHost.addr + ":" + to_string(worker.httpSvcPort)); fsHostPort.push_back(worker.fsHost.addr + ":" + to_string(worker.fsPort)); loaderHostPort.push_back(worker.loaderHost.addr + ":" + to_string(worker.loaderPort)); loaderTmpDir.push_back(worker.loaderTmpDir); @@ -98,6 +100,7 @@ void ConfigAppBase::dumpWorkersAsTable(string const& indent, string const& capti table.addColumn("read-only", isReadOnly); table.addColumn("Qserv data directory", dataDir, util::ColumnTablePrinter::LEFT); table.addColumn("Repl. svc", svcHostPort, util::ColumnTablePrinter::LEFT); + table.addColumn("Repl. http-svc", httpSvcHostPort, util::ColumnTablePrinter::LEFT); table.addColumn("File svc", fsHostPort, util::ColumnTablePrinter::LEFT); table.addColumn("Binary ingest", loaderHostPort, util::ColumnTablePrinter::LEFT); table.addColumn(":tmp", loaderTmpDir, util::ColumnTablePrinter::LEFT); diff --git a/src/replica/config/ConfigTestData.cc b/src/replica/config/ConfigTestData.cc index 1881ef41bc..0f2cd77157 100644 --- a/src/replica/config/ConfigTestData.cc +++ b/src/replica/config/ConfigTestData.cc @@ -45,6 +45,7 @@ map> ConfigTestData::parameters() { {"worker", {"num-threads", "num-svc-processing-threads", + "num-http-svc-threads", "num-fs-processing-threads", "fs-buf-size-bytes", "num-loader-processing-threads", @@ -55,7 +56,9 @@ map> ConfigTestData::parameters() { "async-loader-cleanup-on-resume", "http-max-listen-conn", "http-max-queued-requests", + "http-svc-max-queued-requests", "svc-port", + "http-svc-port", "fs-port", "data-dir", "loader-max-warnings", @@ -114,6 +117,7 @@ json ConfigTestData::data() { {"reconnect-timeout", 500}}); generalObj["worker"] = json::object({{"num-threads", 3}, {"num-svc-processing-threads", 4}, + {"num-http-svc-threads", 10}, {"num-fs-processing-threads", 5}, {"fs-buf-size-bytes", 1024}, 
{"num-loader-processing-threads", 6}, @@ -124,7 +128,9 @@ json ConfigTestData::data() { {"async-loader-cleanup-on-resume", 0}, {"http-max-listen-conn", 512}, {"http-max-queued-requests", 1024}, + {"http-svc-max-queued-requests", 2048}, {"svc-port", 51000}, + {"http-svc-port", 56000}, {"fs-port", 52000}, {"data-dir", "/data"}, {"loader-max-warnings", 2}, @@ -144,6 +150,8 @@ json ConfigTestData::data() { {"is-read-only", 0}, {"svc-host", {{"addr", "127.0.0.1"}, {"name", "host-A"}}}, {"svc-port", 51001}, + {"http-svc-host", {{"addr", "127.0.0.1"}, {"name", "host-A"}}}, + {"http-svc-port", 56001}, {"fs-host", {{"addr", "127.0.0.1"}, {"name", "host-A"}}}, {"fs-port", 52001}, {"data-dir", "/data/A"}, @@ -168,6 +176,7 @@ json ConfigTestData::data() { {"is-enabled", 1}, {"is-read-only", 1}, {"svc-host", {{"addr", "168.1.1.1"}, {"name", "host-B"}}}, + {"http-svc-host", {{"addr", "168.1.1.1"}, {"name", "host-B"}}}, {"fs-host", {{"addr", "168.1.1.1"}, {"name", "host-B"}}}, {"data-dir", "/data/B"}, {"loader-host", {{"addr", "168.1.1.1"}, {"name", "host-B"}}}, @@ -185,6 +194,7 @@ json ConfigTestData::data() { {"is-enabled", 0}, {"is-read-only", 0}, {"svc-host", {{"addr", "168.1.1.1"}, {"name", "host-C1"}}}, + {"http-svc-host", {{"addr", "168.1.1.7"}, {"name", "host-C7"}}}, {"fs-host", {{"addr", "168.1.1.2"}, {"name", "host-C2"}}}, {"loader-host", {{"addr", "168.1.1.3"}, {"name", "host-C3"}}}, {"exporter-host", {{"addr", "168.1.1.4"}, {"name", "host-C4"}}}, diff --git a/src/replica/config/ConfigWorker.cc b/src/replica/config/ConfigWorker.cc index 10ab411f98..e4f64d193a 100644 --- a/src/replica/config/ConfigWorker.cc +++ b/src/replica/config/ConfigWorker.cc @@ -58,6 +58,9 @@ ConfigWorker::ConfigWorker(json const& obj) { parseRequired(svcHost.addr, obj.at("svc-host"), "addr"); parseRequired(svcHost.name, obj.at("svc-host"), "name"); parseOptional(svcPort, obj, "svc-port"); + parseRequired(httpSvcHost.addr, obj.at("http-svc-host"), "addr"); + parseRequired(httpSvcHost.name, 
obj.at("http-svc-host"), "name"); + parseOptional(httpSvcPort, obj, "http-svc-port"); parseRequired(fsHost.addr, obj.at("fs-host"), "addr"); parseRequired(fsHost.name, obj.at("fs-host"), "name"); parseOptional(fsPort, obj, "fs-port"); @@ -89,6 +92,8 @@ json ConfigWorker::toJson() const { infoJson["is-read-only"] = isReadOnly ? 1 : 0; infoJson["svc-host"] = svcHost.toJson(); infoJson["svc-port"] = svcPort; + infoJson["http-svc-host"] = httpSvcHost.toJson(); + infoJson["http-svc-port"] = httpSvcPort; infoJson["fs-host"] = fsHost.toJson(); infoJson["fs-port"] = fsPort; infoJson["data-dir"] = dataDir; @@ -107,8 +112,9 @@ json ConfigWorker::toJson() const { bool ConfigWorker::operator==(ConfigWorker const& other) const { return (name == other.name) && (isEnabled == other.isEnabled) && (isReadOnly == other.isReadOnly) && - (svcHost == other.svcHost) && (svcPort == other.svcPort) && (fsHost == other.fsHost) && - (fsPort == other.fsPort) && (dataDir == other.dataDir) && (loaderHost == other.loaderHost) && + (svcHost == other.svcHost) && (svcPort == other.svcPort) && (httpSvcHost == other.httpSvcHost) && + (httpSvcPort == other.httpSvcPort) && (fsHost == other.fsHost) && (fsPort == other.fsPort) && + (dataDir == other.dataDir) && (loaderHost == other.loaderHost) && (loaderPort == other.loaderPort) && (loaderTmpDir == other.loaderTmpDir) && (exporterHost == other.exporterHost) && (exporterPort == other.exporterPort) && (exporterTmpDir == other.exporterTmpDir) && (httpLoaderHost == other.httpLoaderHost) && diff --git a/src/replica/config/ConfigWorker.h b/src/replica/config/ConfigWorker.h index f2005776a5..31a99921cf 100644 --- a/src/replica/config/ConfigWorker.h +++ b/src/replica/config/ConfigWorker.h @@ -68,6 +68,9 @@ class ConfigWorker { ConfigHost svcHost; // The host name (and IP address) of the worker service uint16_t svcPort = 0; // The port number of the worker service + ConfigHost httpSvcHost; // The host name (and IP address) of the HTTP-based worker service + 
uint16_t httpSvcPort = 0; // The port number of the HTTP-based worker service + ConfigHost fsHost; // The host name (and IP address) of the file service for the worker uint16_t fsPort = 0; // The port number for the file service for the worker diff --git a/src/replica/config/ConfigurationSchema.cc b/src/replica/config/ConfigurationSchema.cc index 7dd639b704..a608358d7d 100644 --- a/src/replica/config/ConfigurationSchema.cc +++ b/src/replica/config/ConfigurationSchema.cc @@ -99,10 +99,7 @@ json const ConfigurationSchema::_schemaJson = json::object( {"default", 5}}}}}, {"controller", {{"num-threads", - {{"description", - "The number of threads managed by BOOST ASIO. Must be greater than 0." - " Note that setting too many threads may result in a significant memory footprint" - " of the application due to specifics of the Linux memory allocation library."}, + {{"description", "The number of threads managed by BOOST ASIO. Must be greater than 0."}, {"default", min(8, num_threads)}}}, {"request-timeout-sec", {{"description", @@ -125,9 +122,7 @@ json const ConfigurationSchema::_schemaJson = json::object( {"default", 0}}}, {"http-server-threads", {{"description", - "The number of threads managed by BOOST ASIO for the HTTP server. Must be greater than 0." - " Note that setting too many threads may result in a significant memory footprint" - " of the application due to specifics of the Linux memory allocation library."}, + "The number of threads managed by BOOST ASIO for the HTTP server. Must be greater than 0."}, {"default", min(8, num_threads)}}}, {"http-server-port", {{"description", "The port number for the controller's HTTP server. Must be greater than 0."}, @@ -279,22 +274,18 @@ json const ConfigurationSchema::_schemaJson = json::object( {"default", 3600}}}}}, {"worker", {{"num-threads", - {{"description", - "The number of threads managed by BOOST ASIO. Must be greater than 0." 
- " Note that setting too many threads may result in a significant memory footprint" - " of the application due to specifics of the Linux memory allocation library."}, + {{"description", "The number of threads managed by BOOST ASIO. Must be greater than 0."}, {"default", min(8, num_threads)}}}, {"num-svc-processing-threads", + {{"description", "The number of request processing threads in each Replication worker service."}, + {"default", min(8, num_threads)}}}, + {"num-http-svc-threads", {{"description", - "The number of request processing threads in each Replication worker service." - " Note that setting too many threads may result in a significant memory footprint" - " of the application due to specifics of the Linux memory allocation library."}, + "The number of threads in each HTTP server frontend of Replication worker service."}, {"default", min(8, num_threads)}}}, {"num-fs-processing-threads", {{"description", - "The number of request processing threads in each Replication worker's file service." - " Note that setting too many threads may result in a significant memory footprint" - " of the application due to specifics of the Linux memory allocation library."}, + "The number of request processing threads in each Replication worker's file service."}, {"default", min(8, num_threads)}}}, {"fs-buf-size-bytes", {{"description", @@ -313,14 +304,11 @@ json const ConfigurationSchema::_schemaJson = json::object( {"num-http-loader-processing-threads", {{"description", "The number of request processing threads in each Replication worker's HTTP-based ingest " - "service. Note that setting too many threads may result in a significant memory footprint" - " of the application due to specifics of the Linux memory allocation library."}, + "service."}, {"default", min(8, num_threads)}}}, {"num-async-loader-processing-threads", {{"description", - "The number of request processing threads in each Replication worker's ASYNC ingest service." 
- " Note that setting too many threads may result in a significant memory footprint" - " of the application due to specifics of the Linux memory allocation library."}, + "The number of request processing threads in each Replication worker's ASYNC ingest service."}, {"default", min(8, num_threads)}}}, {"async-loader-auto-resume", {{"description", @@ -355,8 +343,19 @@ json const ConfigurationSchema::_schemaJson = json::object( " the default value unless there are specific reasons to change it."}, {"empty-allowed", 1}, {"default", 0}}}, + {"http-svc-max-queued-requests", + {{"description", + "The maximum number of pending requests, i.e. requests accept()ed by" + " the listener but still waiting to be routed by the HTTP-based Worker Replication server." + " If set to 0 then no specific limit will be enforced. It's recommented to keep" + " the default value unless there are specific reasons to change it."}, + {"empty-allowed", 1}, + {"default", 0}}}, {"svc-port", - {{"description", "The port number for the worker's replication service."}, {"default", 25000}}}, + {{"description", "The port number for the worker replication service."}, {"default", 25000}}}, + {"http-svc-port", + {{"description", "The port number for the HTTP-based worker replication service."}, + {"default", 25005}}}, {"fs-port", {{"description", "The port number for the worker's file service."}, {"default", 25001}}}, {"data-dir", diff --git a/src/replica/registry/Registry.cc b/src/replica/registry/Registry.cc index 0dc832cacb..5cf45a3bf6 100644 --- a/src/replica/registry/Registry.cc +++ b/src/replica/registry/Registry.cc @@ -78,6 +78,9 @@ vector Registry::workers() const { worker.svcHost.addr = hostAddr; worker.svcHost.name = replicationWorker.at("svc-host-name").get(); worker.svcPort = replicationWorker.at("svc-port").get(); + worker.httpSvcHost.addr = hostAddr; + worker.httpSvcHost.name = replicationWorker.at("http-svc-host-name").get(); + worker.httpSvcPort = 
replicationWorker.at("http-svc-port").get(); worker.fsHost.addr = hostAddr; worker.fsHost.name = replicationWorker.at("fs-host-name").get(); worker.fsPort = replicationWorker.at("fs-port").get(); @@ -118,6 +121,8 @@ void Registry::addWorker(string const& name) const { {{"name", name}, {"svc-host-name", hostName}, {"svc-port", config->get("worker", "svc-port")}, + {"http-svc-host-name", hostName}, + {"http-svc-port", config->get("worker", "http-svc-port")}, {"fs-host-name", hostName}, {"fs-port", config->get("worker", "fs-port")}, {"data-dir", config->get("worker", "data-dir")}, diff --git a/src/replica/tests/testConfiguration.cc b/src/replica/tests/testConfiguration.cc index 990e7f1dd2..198303203e 100644 --- a/src/replica/tests/testConfiguration.cc +++ b/src/replica/tests/testConfiguration.cc @@ -160,6 +160,7 @@ BOOST_AUTO_TEST_CASE(ConfigurationTestReadingGeneralParameters) { BOOST_CHECK(config->get("worker", "num-threads") == 3); BOOST_CHECK(config->get("worker", "num-svc-processing-threads") == 4); + BOOST_CHECK(config->get("worker", "num-http-svc-threads") == 10); BOOST_CHECK(config->get("worker", "num-fs-processing-threads") == 5); BOOST_CHECK(config->get("worker", "fs-buf-size-bytes") == 1024); BOOST_CHECK(config->get("worker", "num-loader-processing-threads") == 6); @@ -170,6 +171,7 @@ BOOST_AUTO_TEST_CASE(ConfigurationTestReadingGeneralParameters) { BOOST_CHECK(config->get("worker", "async-loader-cleanup-on-resume") == 0); BOOST_CHECK(config->get("worker", "http-max-listen-conn") == 512); BOOST_CHECK(config->get("worker", "http-max-queued-requests") == 1024); + BOOST_CHECK(config->get("worker", "http-svc-max-queued-requests") == 2048); BOOST_CHECK(config->get("worker", "loader-max-warnings") == 2); BOOST_CHECK(config->get("worker", "ingest-charset-name") == "latin1"); BOOST_CHECK(config->get("worker", "ingest-num-retries") == 1); @@ -329,6 +331,10 @@ BOOST_AUTO_TEST_CASE(ConfigurationTestModifyingGeneralParameters) { 
BOOST_REQUIRE_NO_THROW(config->set("worker", "num-svc-processing-threads", 5)); BOOST_CHECK(config->get("worker", "num-svc-processing-threads") == 5); + BOOST_CHECK_THROW(config->set("worker", "num-http-svc-threads", 0), std::invalid_argument); + BOOST_REQUIRE_NO_THROW(config->set("worker", "num-http-svc-threads", 11)); + BOOST_CHECK(config->get("worker", "num-http-svc-threads") == 11); + BOOST_CHECK_THROW(config->set("worker", "num-fs-processing-threads", 0), std::invalid_argument); BOOST_REQUIRE_NO_THROW(config->set("worker", "num-fs-processing-threads", 6)); BOOST_CHECK(config->get("worker", "num-fs-processing-threads") == 6); @@ -377,6 +383,12 @@ BOOST_AUTO_TEST_CASE(ConfigurationTestModifyingGeneralParameters) { BOOST_REQUIRE_NO_THROW(config->set("worker", "http-max-queued-requests", 0)); BOOST_CHECK(config->get("worker", "http-max-queued-requests") == 0); + BOOST_CHECK(config->get("worker", "http-svc-max-queued-requests") == 2048); + BOOST_REQUIRE_NO_THROW(config->set("worker", "http-svc-max-queued-requests", 4096)); + BOOST_CHECK(config->get("worker", "http-svc-max-queued-requests") == 4096); + BOOST_REQUIRE_NO_THROW(config->set("worker", "http-svc-max-queued-requests", 0)); + BOOST_CHECK(config->get("worker", "http-svc-max-queued-requests") == 0); + BOOST_CHECK_THROW(config->set("worker", "loader-max-warnings", 0), std::invalid_argument); BOOST_REQUIRE_NO_THROW(config->set("worker", "loader-max-warnings", 100)); BOOST_CHECK(config->get("worker", "loader-max-warnings") == 100); @@ -469,6 +481,7 @@ BOOST_AUTO_TEST_CASE(ConfigurationTestWorkerParameters) { BOOST_CHECK(workerA.isEnabled); BOOST_CHECK(!workerA.isReadOnly); BOOST_CHECK_EQUAL(workerA.svcHost, hostA); + BOOST_CHECK_EQUAL(workerA.httpSvcHost, hostA); BOOST_CHECK_EQUAL(workerA.fsHost, hostA); BOOST_CHECK_EQUAL(workerA.loaderHost, hostA); BOOST_CHECK_EQUAL(workerA.exporterHost, hostA); @@ -481,6 +494,7 @@ BOOST_AUTO_TEST_CASE(ConfigurationTestWorkerParameters) { BOOST_CHECK(workerB.isEnabled); 
BOOST_CHECK(workerB.isReadOnly); BOOST_CHECK_EQUAL(workerB.svcHost, hostB); + BOOST_CHECK_EQUAL(workerB.httpSvcHost, hostB); BOOST_CHECK_EQUAL(workerB.fsHost, hostB); BOOST_CHECK_EQUAL(workerB.loaderHost, hostB); BOOST_CHECK_EQUAL(workerB.exporterHost, hostB); @@ -493,6 +507,7 @@ BOOST_AUTO_TEST_CASE(ConfigurationTestWorkerParameters) { BOOST_CHECK(workerC.name == "worker-C"); BOOST_CHECK(!workerC.isEnabled); BOOST_CHECK_EQUAL(workerC.svcHost, ConfigHost({"168.1.1.1", "host-C1"})); + BOOST_CHECK_EQUAL(workerC.httpSvcHost, ConfigHost({"168.1.1.7", "host-C7"})); BOOST_CHECK_EQUAL(workerC.fsHost, ConfigHost({"168.1.1.2", "host-C2"})); BOOST_CHECK_EQUAL(workerC.loaderHost, ConfigHost({"168.1.1.3", "host-C3"})); BOOST_CHECK_EQUAL(workerC.exporterHost, ConfigHost({"168.1.1.4", "host-C4"})); diff --git a/src/www/qserv/js/ReplicationConfigWorkers.js b/src/www/qserv/js/ReplicationConfigWorkers.js index 7670fdd252..6c0ceff5cd 100644 --- a/src/www/qserv/js/ReplicationConfigWorkers.js +++ b/src/www/qserv/js/ReplicationConfigWorkers.js @@ -148,6 +148,13 @@ function(CSSLoader, "dns": worker['svc-host']['name'], "cssClass": "bg-white" }, + { "name": "Replica Management (HTTP)", + "protocol": "http", + "port": worker['http-svc-port'], + "ip": worker['http-svc-host']['addr'], + "dns": worker['http-svc-host']['name'], + "cssClass": "bg-white" + }, { "name": "File Server", "protocol": "binary", "port": worker['fs-port'], From 79ff885ec15e106ae656d558dc5c4daf7a724597 Mon Sep 17 00:00:00 2001 From: Igor Gaponenko Date: Wed, 11 Dec 2024 23:49:59 -0800 Subject: [PATCH 5/9] Extended HTTP request body parser (JSON) --- src/http/RequestBodyJSON.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/http/RequestBodyJSON.h b/src/http/RequestBodyJSON.h index 6bf6a8068f..4db07fb523 100644 --- a/src/http/RequestBodyJSON.h +++ b/src/http/RequestBodyJSON.h @@ -79,6 +79,24 @@ class RequestBodyJSON { " is missing in the request body"); } + /** + * The helper method for finding 
and returning a value of an optional parameter. + * @param obj JSON object to be inspected. + * @param name The name of a parameter. + * @param defaultValue A value to be returned if the parameter wasn't found. + * @return A value of the parameter. + * @throw invalid_argument If the input structure is not the valid JSON object. + */ + template + static T optional(nlohmann::json const& obj, std::string const& name, T const& defaultValue) { + if (not obj.is_object()) { + throw std::invalid_argument("RequestBodyJSON::" + std::string(__func__) + + "[static] parameter 'obj' is not a valid JSON object"); + } + if (obj.find(name) != obj.end()) return obj[name]; + return defaultValue; + } + /** * Find and return a value of a required parameter. * @param name The name of a parameter. From 8afb7a42b3bfe79d7771d1888210e5ac31973946 Mon Sep 17 00:00:00 2001 From: Igor Gaponenko Date: Thu, 12 Dec 2024 13:31:17 -0800 Subject: [PATCH 6/9] Thread-safe implementation of the worker performance counter class --- src/replica/util/Performance.cc | 18 ++++++++---------- src/replica/util/Performance.h | 8 +++++--- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/replica/util/Performance.cc b/src/replica/util/Performance.cc index 3cf1018b49..8e3292d687 100644 --- a/src/replica/util/Performance.cc +++ b/src/replica/util/Performance.cc @@ -77,17 +77,9 @@ ostream& operator<<(ostream& os, Performance const& p) { WorkerPerformance::WorkerPerformance() : receive_time(util::TimeUtils::now()), start_time(0), finish_time(0) {} -uint64_t WorkerPerformance::setUpdateStart() { - uint64_t const t = start_time; - start_time = util::TimeUtils::now(); - return t; -} +uint64_t WorkerPerformance::setUpdateStart() { return start_time.exchange(util::TimeUtils::now()); } -uint64_t WorkerPerformance::setUpdateFinish() { - uint64_t const t = finish_time; - finish_time = util::TimeUtils::now(); - return t; -} +uint64_t WorkerPerformance::setUpdateFinish() { return 
finish_time.exchange(util::TimeUtils::now()); } unique_ptr WorkerPerformance::info() const { auto ptr = make_unique(); @@ -97,6 +89,12 @@ unique_ptr WorkerPerformance::info() const { return ptr; } +json WorkerPerformance::toJson() const { + return json::object({{"receive_time", receive_time.load()}, + {"start_time", start_time.load()}, + {"finish_time", finish_time.load()}}); +} + ostream& operator<<(ostream& os, WorkerPerformance const& p) { os << "WorkerPerformance " << " receive:" << p.receive_time << " start:" << p.start_time << " finish:" << p.finish_time diff --git a/src/replica/util/Performance.h b/src/replica/util/Performance.h index 79304c5bd0..fcbfd394a4 100644 --- a/src/replica/util/Performance.h +++ b/src/replica/util/Performance.h @@ -28,6 +28,7 @@ */ // System headers +#include #include #include #include @@ -116,6 +117,7 @@ std::ostream& operator<<(std::ostream& os, Performance const& p); */ class WorkerPerformance { public: + /// All (but the request receive time) timestamps will be initialized with 0. 
WorkerPerformance(); WorkerPerformance(WorkerPerformance const&) = default; WorkerPerformance& operator=(WorkerPerformance const&) = default; @@ -126,9 +128,9 @@ class WorkerPerformance { std::unique_ptr info() const; - uint64_t receive_time = 0; /// Received by a worker service - uint64_t start_time = 0; /// Execution started by a worker service - uint64_t finish_time = 0; /// Execution finished by a worker service + std::atomic receive_time; ///< Received by a worker service + std::atomic start_time; ///< Execution started by a worker service + std::atomic finish_time; ///< Execution finished by a worker service }; std::ostream& operator<<(std::ostream& os, WorkerPerformance const& p); From ebfa447dc7b334b1af14a8051ad8604865a831a0 Mon Sep 17 00:00:00 2001 From: Igor Gaponenko Date: Fri, 13 Dec 2024 22:48:03 -0800 Subject: [PATCH 7/9] Serializers into JSON for MySQL API objects --- src/replica/mysql/DatabaseMySQL.cc | 25 +++++++++++++++++ src/replica/mysql/DatabaseMySQL.h | 15 +++++++++++ src/replica/mysql/DatabaseMySQLRow.cc | 26 ++++++++++++++++++ src/replica/mysql/DatabaseMySQLRow.h | 39 +++++++++++---------------- 4 files changed, 82 insertions(+), 23 deletions(-) diff --git a/src/replica/mysql/DatabaseMySQL.cc b/src/replica/mysql/DatabaseMySQL.cc index 365b676d17..a8ae49e566 100644 --- a/src/replica/mysql/DatabaseMySQL.cc +++ b/src/replica/mysql/DatabaseMySQL.cc @@ -43,6 +43,7 @@ #include "lsst/log/Log.h" using namespace std; +using json = nlohmann::json; namespace { @@ -495,6 +496,30 @@ void Connection::exportField(ProtocolResponseSqlField* ptr, size_t idx) const { ptr->set_type(field.type); } +json Connection::fieldsToJson() const { + _assertQueryContext(); + + json result; + for (size_t i = 0; i < _numFields; ++i) { + auto&& field = _fields[i]; + json f; + f["name"] = field.name; + f["org_name"] = field.org_name; + f["table"] = field.table; + f["org_table"] = field.org_table; + f["db"] = field.db; + f["catalog"] = field.catalog; + f["def"] = field.def; + 
f["length"] = field.length; + f["max_length"] = field.max_length; + f["flags"] = field.flags; + f["decimals"] = field.decimals; + f["type"] = field.type; + result.push_back(move(f)); + } + return result; +} + bool Connection::next(Row& row) { string const context = "Connection[" + to_string(_id) + "]::" + string(__func__) + "(_inTransaction=" + to_string(_inTransaction ? 1 : 0) + ") "; diff --git a/src/replica/mysql/DatabaseMySQL.h b/src/replica/mysql/DatabaseMySQL.h index 3ea0776f00..cbfb4faf53 100644 --- a/src/replica/mysql/DatabaseMySQL.h +++ b/src/replica/mysql/DatabaseMySQL.h @@ -52,6 +52,7 @@ // Third party headers #include +#include "nlohmann/json.hpp" // Qserv headers #include "replica/mysql/DatabaseMySQLExceptions.h" @@ -422,6 +423,20 @@ class Connection : public std::enable_shared_from_this { */ void exportField(ProtocolResponseSqlField* ptr, size_t idx) const; + /** + * Convert the current result set into a JSON object. + * + * @note The method can be called only upon a successful completion of a query + * which has a result set. Otherwise it will throw an exception. + * + * @see Connection::hasResult + * + * @return a JSON object representing the current result set + * @throw std::logic_error if no SQL statement has ever been executed, or + * if the last query failed. 
+ */ + nlohmann::json fieldsToJson() const; + /** * Move the iterator to the next (first) row of the current result set * and if the iterator is not beyond the last row then initialize an object diff --git a/src/replica/mysql/DatabaseMySQLRow.cc b/src/replica/mysql/DatabaseMySQLRow.cc index 697cf43316..7cf4c00eb9 100644 --- a/src/replica/mysql/DatabaseMySQLRow.cc +++ b/src/replica/mysql/DatabaseMySQLRow.cc @@ -30,11 +30,13 @@ // Qserv headers #include "replica/proto/protocol.pb.h" +#include "util/String.h" // LSST headers #include "lsst/log/Log.h" using namespace std; +using json = nlohmann::json; namespace { @@ -204,4 +206,28 @@ void Row::exportRow(ProtocolResponseSqlRow* ptr) const { } } +json Row::toJson() const { + string const context = "Row::" + string(__func__) + " "; + if (not isValid()) { + throw logic_error(context + "the object is not valid"); + } + json result = json::object(); + result["cells"] = json::array(); + result["nulls"] = json::array(); + json& cellsJson = result["cells"]; + json& nullsJson = result["nulls"]; + for (Cell const& cell : _index2cell) { + char const* ptr = cell.first; + size_t const length = cell.second; + if (nullptr == ptr) { + cellsJson.push_back(string()); + nullsJson.push_back(1); + } else { + cellsJson.push_back(util::String::toHex(ptr, length)); + nullsJson.push_back(0); + } + } + return result; +} + } // namespace lsst::qserv::replica::database::mysql diff --git a/src/replica/mysql/DatabaseMySQLRow.h b/src/replica/mysql/DatabaseMySQLRow.h index 1e303498d5..b5d4a17112 100644 --- a/src/replica/mysql/DatabaseMySQLRow.h +++ b/src/replica/mysql/DatabaseMySQLRow.h @@ -37,6 +37,9 @@ #include #include +// Third party headers +#include "nlohmann/json.hpp" + // Qserv headers #include "replica/mysql/DatabaseMySQLExceptions.h" @@ -110,10 +113,7 @@ class Row { */ Row(); - /// Copy constructor Row(Row const& rhs) = default; - - /// The Assignment operator Row& operator=(Row const& rhs) = default; ~Row() = default; @@ -133,8 +133,6 @@ 
class Row { // There are two ways to access the values: either by a relative // index of a column in a result set, or by the name of the column. // The second method has some extra (though, minor) overhead. - // - // @see class Row template T getAs(size_t columnIdx) const { @@ -204,41 +202,36 @@ class Row { // Other types bool get(size_t columnIdx, bool& value) const; - bool get(std::string const& columnName, bool& value) const; /** - * @return - * reference to the data cell for the column - * - * @param columnIdx - * the index of a column + * @param columnIdx the index of a column + * @return reference to the data cell for the column */ Cell const& getDataCell(size_t columnIdx) const; /** - * @return - * reference to the data cell for the column - * - * @param columnName - * the name of a column + * @param columnName the name of a column + * @return reference to the data cell for the column */ Cell const& getDataCell(std::string const& columnName) const; /** * Fill a Protobuf object representing a row. - * - * @param ptr - * a valid pointer to the Protobuf object to be populated. - * - * @param std::invalid_argument - * if the input pointer is 0 + * @param ptr a valid pointer to the Protobuf object to be populated. + * @throw std::invalid_argument if the input pointer is 0 */ void exportRow(ProtocolResponseSqlRow* ptr) const; + /** + * Convert the current row into a JSON object.
+ * @return a JSON object representing the current row + */ + nlohmann::json toJson() const; + private: /** - * Mapping column names to the indexes + * Mapping column names to the indexes * * @note * If the pointer is set to 'nullptr' then the object is not From 302c1f9764d875c3862580a257aa6f894caabcaa Mon Sep 17 00:00:00 2001 From: Igor Gaponenko Date: Tue, 10 Dec 2024 20:05:06 -0800 Subject: [PATCH 8/9] The HTTP-based backend of the Replication worker services --- src/replica/apps/WorkerApp.cc | 5 +- src/replica/proto/CMakeLists.txt | 1 + src/replica/proto/Protocol.cc | 184 ++++++ src/replica/proto/Protocol.h | 139 +++++ src/replica/util/Common.cc | 40 +- src/replica/util/Common.h | 34 ++ src/replica/util/Performance.cc | 1 + src/replica/util/Performance.h | 40 +- src/replica/worker/CMakeLists.txt | 12 + .../worker/WorkerCreateReplicaHttpRequest.cc | 467 ++++++++++++++ .../worker/WorkerCreateReplicaHttpRequest.h | 186 ++++++ .../worker/WorkerDeleteReplicaHttpRequest.cc | 116 ++++ .../worker/WorkerDeleteReplicaHttpRequest.h | 99 +++ .../worker/WorkerDirectorIndexHttpRequest.cc | 292 +++++++++ .../worker/WorkerDirectorIndexHttpRequest.h | 149 +++++ src/replica/worker/WorkerEchoHttpRequest.cc | 97 +++ src/replica/worker/WorkerEchoHttpRequest.h | 99 +++ .../WorkerFindAllReplicasHttpRequest.cc | 157 +++++ .../worker/WorkerFindAllReplicasHttpRequest.h | 101 +++ .../worker/WorkerFindReplicaHttpRequest.cc | 233 +++++++ .../worker/WorkerFindReplicaHttpRequest.h | 104 ++++ src/replica/worker/WorkerHttpProcessor.cc | 576 ++++++++++++++++++ src/replica/worker/WorkerHttpProcessor.h | 367 +++++++++++ .../worker/WorkerHttpProcessorThread.cc | 121 ++++ .../worker/WorkerHttpProcessorThread.h | 113 ++++ src/replica/worker/WorkerHttpRequest.cc | 277 +++++++++ src/replica/worker/WorkerHttpRequest.h | 352 +++++++++++ src/replica/worker/WorkerHttpSvc.cc | 149 +++++ src/replica/worker/WorkerHttpSvc.h | 84 +++ src/replica/worker/WorkerHttpSvcMod.cc | 244 ++++++++ 
src/replica/worker/WorkerHttpSvcMod.h | 172 ++++++ src/replica/worker/WorkerSqlHttpRequest.cc | 425 +++++++++++++ src/replica/worker/WorkerSqlHttpRequest.h | 183 ++++++ 33 files changed, 5589 insertions(+), 30 deletions(-) create mode 100644 src/replica/proto/Protocol.cc create mode 100644 src/replica/proto/Protocol.h create mode 100644 src/replica/worker/WorkerCreateReplicaHttpRequest.cc create mode 100644 src/replica/worker/WorkerCreateReplicaHttpRequest.h create mode 100644 src/replica/worker/WorkerDeleteReplicaHttpRequest.cc create mode 100644 src/replica/worker/WorkerDeleteReplicaHttpRequest.h create mode 100644 src/replica/worker/WorkerDirectorIndexHttpRequest.cc create mode 100644 src/replica/worker/WorkerDirectorIndexHttpRequest.h create mode 100644 src/replica/worker/WorkerEchoHttpRequest.cc create mode 100644 src/replica/worker/WorkerEchoHttpRequest.h create mode 100644 src/replica/worker/WorkerFindAllReplicasHttpRequest.cc create mode 100644 src/replica/worker/WorkerFindAllReplicasHttpRequest.h create mode 100644 src/replica/worker/WorkerFindReplicaHttpRequest.cc create mode 100644 src/replica/worker/WorkerFindReplicaHttpRequest.h create mode 100644 src/replica/worker/WorkerHttpProcessor.cc create mode 100644 src/replica/worker/WorkerHttpProcessor.h create mode 100644 src/replica/worker/WorkerHttpProcessorThread.cc create mode 100644 src/replica/worker/WorkerHttpProcessorThread.h create mode 100644 src/replica/worker/WorkerHttpRequest.cc create mode 100644 src/replica/worker/WorkerHttpRequest.h create mode 100644 src/replica/worker/WorkerHttpSvc.cc create mode 100644 src/replica/worker/WorkerHttpSvc.h create mode 100644 src/replica/worker/WorkerHttpSvcMod.cc create mode 100644 src/replica/worker/WorkerHttpSvcMod.h create mode 100644 src/replica/worker/WorkerSqlHttpRequest.cc create mode 100644 src/replica/worker/WorkerSqlHttpRequest.h diff --git a/src/replica/apps/WorkerApp.cc b/src/replica/apps/WorkerApp.cc index 31c023640a..f1362b91a5 100644 --- 
a/src/replica/apps/WorkerApp.cc +++ b/src/replica/apps/WorkerApp.cc @@ -39,7 +39,7 @@ #include "replica/services/ServiceProvider.h" #include "replica/util/FileUtils.h" #include "replica/worker/FileServer.h" -#include "replica/worker/WorkerProcessor.h" +#include "replica/worker/WorkerHttpSvc.h" #include "replica/worker/WorkerServer.h" // LSST headers @@ -113,6 +113,9 @@ int WorkerApp::runImpl() { auto const reqProcSvr = WorkerServer::create(serviceProvider(), worker); thread reqProcSvrThread([reqProcSvr]() { reqProcSvr->run(); }); + auto const reqProcHttpSvr = WorkerHttpSvc::create(serviceProvider(), worker); + thread reqProcHttpSvrThread([reqProcHttpSvr]() { reqProcHttpSvr->run(); }); + auto const fileSvr = FileServer::create(serviceProvider(), worker); thread fileSvrThread([fileSvr]() { fileSvr->run(); }); diff --git a/src/replica/proto/CMakeLists.txt b/src/replica/proto/CMakeLists.txt index b61599d8c5..7eb8d830d6 100644 --- a/src/replica/proto/CMakeLists.txt +++ b/src/replica/proto/CMakeLists.txt @@ -4,4 +4,5 @@ add_library(replica_proto OBJECT) target_sources(replica_proto PRIVATE ${REPLICA_PB_SRCS} ${REPLICA_PB_HDRS} + Protocol.cc ) diff --git a/src/replica/proto/Protocol.cc b/src/replica/proto/Protocol.cc new file mode 100644 index 0000000000..eca9499f08 --- /dev/null +++ b/src/replica/proto/Protocol.cc @@ -0,0 +1,184 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . + */ + +// Class header +#include "replica/proto/Protocol.h" + +// System headers +#include + +using namespace std; + +namespace lsst::qserv::replica::protocol { + +string toString(SqlRequestType status) { + switch (status) { + case SqlRequestType::QUERY: + return "QUERY"; + case SqlRequestType::CREATE_DATABASE: + return "CREATE_DATABASE"; + case SqlRequestType::DROP_DATABASE: + return "DROP_DATABASE"; + case SqlRequestType::ENABLE_DATABASE: + return "ENABLE_DATABASE"; + case SqlRequestType::DISABLE_DATABASE: + return "DISABLE_DATABASE"; + case SqlRequestType::GRANT_ACCESS: + return "GRANT_ACCESS"; + case SqlRequestType::CREATE_TABLE: + return "CREATE_TABLE"; + case SqlRequestType::DROP_TABLE: + return "DROP_TABLE"; + case SqlRequestType::REMOVE_TABLE_PARTITIONING: + return "REMOVE_TABLE_PARTITIONING"; + case SqlRequestType::DROP_TABLE_PARTITION: + return "DROP_TABLE_PARTITION"; + case SqlRequestType::GET_TABLE_INDEX: + return "GET_TABLE_INDEX"; + case SqlRequestType::CREATE_TABLE_INDEX: + return "CREATE_TABLE_INDEX"; + case SqlRequestType::DROP_TABLE_INDEX: + return "DROP_TABLE_INDEX"; + case SqlRequestType::ALTER_TABLE: + return "ALTER_TABLE"; + case SqlRequestType::TABLE_ROW_STATS: + return "TABLE_ROW_STATS"; + default: + throw logic_error("Unhandled SQL request type: " + to_string(static_cast(status))); + } +} + +string toString(Status status) { + switch (status) { + case Status::CREATED: + return "CREATED"; + case Status::SUCCESS: + return "SUCCESS"; + case Status::QUEUED: + return "QUEUED"; + case Status::IN_PROGRESS: + return "IN_PROGRESS"; + case Status::IS_CANCELLING: + return "IS_CANCELLING"; + case Status::BAD: + return "BAD"; + case Status::FAILED: + return "FAILED"; + case Status::CANCELLED: + return "CANCELLED"; + default: + throw logic_error("Unhandled status: " + to_string(static_cast(status))); + } 
+} + +string toString(StatusExt extendedStatus) { + switch (extendedStatus) { + case StatusExt::NONE: + return "NONE"; + case StatusExt::INVALID_PARAM: + return "INVALID_PARAM"; + case StatusExt::INVALID_ID: + return "INVALID_ID"; + case StatusExt::FOLDER_STAT: + return "FOLDER_STAT"; + case StatusExt::FOLDER_CREATE: + return "FOLDER_CREATE"; + case StatusExt::FILE_STAT: + return "FILE_STAT"; + case StatusExt::FILE_SIZE: + return "FILE_SIZE"; + case StatusExt::FOLDER_READ: + return "FOLDER_READ"; + case StatusExt::FILE_READ: + return "FILE_READ"; + case StatusExt::FILE_ROPEN: + return "FILE_ROPEN"; + case StatusExt::FILE_CREATE: + return "FILE_CREATE"; + case StatusExt::FILE_OPEN: + return "FILE_OPEN"; + case StatusExt::FILE_RESIZE: + return "FILE_RESIZE"; + case StatusExt::FILE_WRITE: + return "FILE_WRITE"; + case StatusExt::FILE_COPY: + return "FILE_COPY"; + case StatusExt::FILE_DELETE: + return "FILE_DELETE"; + case StatusExt::FILE_RENAME: + return "FILE_RENAME"; + case StatusExt::FILE_EXISTS: + return "FILE_EXISTS"; + case StatusExt::SPACE_REQ: + return "SPACE_REQ"; + case StatusExt::NO_FOLDER: + return "NO_FOLDER"; + case StatusExt::NO_FILE: + return "NO_FILE"; + case StatusExt::NO_ACCESS: + return "NO_ACCESS"; + case StatusExt::NO_SPACE: + return "NO_SPACE"; + case StatusExt::FILE_MTIME: + return "FILE_MTIME"; + case StatusExt::MYSQL_ERROR: + return "MYSQL_ERROR"; + case StatusExt::LARGE_RESULT: + return "LARGE_RESULT"; + case StatusExt::NO_SUCH_TABLE: + return "NO_SUCH_TABLE"; + case StatusExt::NOT_PARTITIONED_TABLE: + return "NOT_PARTITIONED_TABLE"; + case StatusExt::NO_SUCH_PARTITION: + return "NO_SUCH_PARTITION"; + case StatusExt::MULTIPLE: + return "MULTIPLE"; + case StatusExt::OTHER_EXCEPTION: + return "OTHER_EXCEPTION"; + case StatusExt::FOREIGN_INSTANCE: + return "FOREIGN_INSTANCE"; + case StatusExt::DUPLICATE_KEY: + return "DUPLICATE_KEY"; + case StatusExt::CANT_DROP_KEY: + return "CANT_DROP_KEY"; + default: + throw logic_error("Unhandled extended 
status: " + to_string(static_cast(extendedStatus))); + } +} + +string toString(Status status, StatusExt extendedStatus) { + return toString(status) + "::" + toString(extendedStatus); +} + +string toString(ServiceState state) { + switch (state) { + case ServiceState::SUSPEND_IN_PROGRESS: + return "SUSPEND_IN_PROGRESS"; + case ServiceState::SUSPENDED: + return "SUSPENDED"; + case ServiceState::RUNNING: + return "RUNNING"; + default: + throw logic_error("Unhandled service state: " + to_string(static_cast(state))); + } +} + +} // namespace lsst::qserv::replica::protocol diff --git a/src/replica/proto/Protocol.h b/src/replica/proto/Protocol.h new file mode 100644 index 0000000000..b1fd021cee --- /dev/null +++ b/src/replica/proto/Protocol.h @@ -0,0 +1,139 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . + */ +#ifndef LSST_QSERV_REPLICA_PROTOCOL_H +#define LSST_QSERV_REPLICA_PROTOCOL_H + +// System headers +#include + +// Third party headers +#include "nlohmann/json.hpp" + +// This header declarations +namespace lsst::qserv::replica::protocol { + +/// Subtypes of the SQL requests. 
+enum class SqlRequestType : int { + + QUERY = 0, + CREATE_DATABASE = 1, + DROP_DATABASE = 2, + ENABLE_DATABASE = 3, ///< in Qserv + DISABLE_DATABASE = 4, ///< in Qserv + GRANT_ACCESS = 5, + CREATE_TABLE = 6, + DROP_TABLE = 7, + REMOVE_TABLE_PARTITIONING = 8, + DROP_TABLE_PARTITION = 9, + GET_TABLE_INDEX = 10, + CREATE_TABLE_INDEX = 11, + DROP_TABLE_INDEX = 12, + ALTER_TABLE = 13, + TABLE_ROW_STATS = 14 +}; + +/// @return the string representation of the SQL request type +std::string toString(SqlRequestType status); + +/// Types of the table indexes specified in the index management requests. +enum class SqlIndexSpec : int { DEFAULT = 1, UNIQUE = 2, FULLTEXT = 3, SPATIAL = 4 }; + +/// Status values returned by all requests related to operations with +/// replicas. Request management operations always return messages whose types +/// match the return types of the corresponding (original) replica-related requests. +/// Service management requests have their own set of status values. +/// +enum class Status : int { + CREATED = 0, + SUCCESS = 1, + QUEUED = 2, + IN_PROGRESS = 3, + IS_CANCELLING = 4, + BAD = 5, + FAILED = 6, + CANCELLED = 7 +}; + +enum class StatusExt : int { + NONE = 0, ///< Unspecified problem. + INVALID_PARAM = 1, ///< Invalid parameter(s) of a request. + INVALID_ID = 2, ///< An invalid request identifier. + FOLDER_STAT = 4, ///< Failed to obtain fstat() for a folder. + FOLDER_CREATE = 5, ///< Failed to create a folder. + FILE_STAT = 6, ///< Failed to obtain fstat() for a file. + FILE_SIZE = 7, ///< Failed to obtain a size of a file. + FOLDER_READ = 8, ///< Failed to read the contents of a folder. + FILE_READ = 9, ///< Failed to read the contents of a file. + FILE_ROPEN = 10, ///< Failed to open a remote file. + FILE_CREATE = 11, ///< Failed to create a file. + FILE_OPEN = 12, ///< Failed to open a file. + FILE_RESIZE = 13, ///< Failed to resize a file. + FILE_WRITE = 14, ///< Failed to write into a file.
+ FILE_COPY = 15, ///< Failed to copy a file. + FILE_DELETE = 16, ///< Failed to delete a file. + FILE_RENAME = 17, ///< Failed to rename a file. + FILE_EXISTS = 18, ///< File already exists. + SPACE_REQ = 19, ///< Space availability check failed. + NO_FOLDER = 20, ///< Folder doesn't exist. + NO_FILE = 21, ///< File doesn't exist. + NO_ACCESS = 22, ///< No access to a file or a folder. + NO_SPACE = 23, ///< No space left on a device as required by an operation. + FILE_MTIME = 24, ///< Get/set 'mtime' operation failed. + MYSQL_ERROR = 25, ///< General MySQL error (other than any specific ones listed here). + LARGE_RESULT = 26, ///< Result exceeds a limit set in a request. + NO_SUCH_TABLE = 27, ///< No table found while performing a MySQL operation. + NOT_PARTITIONED_TABLE = 28, ///< The table is not MySQL partitioned as it was expected. + NO_SUCH_PARTITION = 29, ///< No MySQL partition found in a table as it was expected. + MULTIPLE = 30, ///< Multiple unspecified errors encountered when processing a request. + OTHER_EXCEPTION = 31, ///< Other exception not listed here. + FOREIGN_INSTANCE = 32, ///< Detected a request from a Controller serving an unrelated Qserv. + DUPLICATE_KEY = 33, ///< Duplicate key found when creating an index or altering a table schema. + CANT_DROP_KEY = 34 ///< Can't drop a field or a key which doesn't exist. +}; + +/// @return the string representation of the status +std::string toString(Status status); + +/// @return the string representation of the extended status +std::string toString(StatusExt extendedStatus); + +/// @return the string representation of the full status +std::string toString(Status status, StatusExt extendedStatus); + +/// Status of a service. 
+enum class ServiceState : int { SUSPEND_IN_PROGRESS = 0, SUSPENDED = 1, RUNNING = 2 }; + +/// @return the string representation of the service state +std::string toString(ServiceState state); + +/// The header to be sent with the requests processed through the worker's queueing system. +struct QueuedRequestHdr { + std::string id; + int priority; + unsigned int timeout; + QueuedRequestHdr(std::string const& id_, int priority_, unsigned int timeout_) + : id(id_), priority(priority_), timeout(timeout_) {} + nlohmann::json toJson() const { return {{"id", id}, {"priority", priority}, {"timeout", timeout}}; }; +}; + +} // namespace lsst::qserv::replica::protocol + +#endif // LSST_QSERV_REPLICA_PROTOCOL_H diff --git a/src/replica/util/Common.cc b/src/replica/util/Common.cc index 11c08df7cb..0c98309449 100644 --- a/src/replica/util/Common.cc +++ b/src/replica/util/Common.cc @@ -29,10 +29,9 @@ #include "boost/uuid/uuid.hpp" #include "boost/uuid/uuid_generators.hpp" #include "boost/uuid/uuid_io.hpp" -#include "nlohmann/json.hpp" using namespace std; -using namespace nlohmann; +using json = nlohmann::json; namespace lsst::qserv::replica { @@ -80,6 +79,43 @@ string Generators::uniqueId() { return boost::uuids::to_string(id); } +/////////////////////////////////////////// +// SqlColDef // +/////////////////////////////////////////// + +list parseSqlColumns(json const& columnsJsonArray) { + if (!columnsJsonArray.is_array()) { + throw invalid_argument("lsst::qserv::replica::" + string(__func__) + + " columnsJsonArray is not an array"); + } + list columns; + for (auto const& column : columnsJsonArray) { + columns.emplace_back(column.at("name"), column.at("type")); + } + return columns; +} + +/////////////////////////////////////////// +// SqlIndexDef // +/////////////////////////////////////////// + +SqlIndexDef::SqlIndexDef(json const& indexSpecJson) { + if (!indexSpecJson.is_object()) { + throw invalid_argument("lsst::qserv::replica::" + string(__func__) + + " indexSpecJson is 
not an object"); + } + spec = indexSpecJson.value("spec", "DEFAULT"); + name = indexSpecJson.at("name"); + comment = indexSpecJson.value("comment", ""); + auto const keysJsonArray = indexSpecJson.at("keys"); + if (!keysJsonArray.is_array()) { + throw invalid_argument("lsst::qserv::replica::" + string(__func__) + " keys is not an array"); + } + for (auto const& key : keysJsonArray) { + keys.emplace_back(key.at("name"), key.at("length"), key.at("ascending")); + } +} + //////////////////////////////////////////// // Parameters of requests // //////////////////////////////////////////// diff --git a/src/replica/util/Common.h b/src/replica/util/Common.h index 3970f771a7..ee1e1fc2d1 100644 --- a/src/replica/util/Common.h +++ b/src/replica/util/Common.h @@ -36,6 +36,9 @@ #include #include +// Third party headers +#include "nlohmann/json.hpp" + // Qserv headers #include "replica/proto/protocol.pb.h" #include "replica/util/Mutex.h" @@ -112,6 +115,13 @@ inline bool operator==(SqlColDef const& lhs, SqlColDef const& rhs) { inline bool operator!=(SqlColDef const& lhs, SqlColDef const& rhs) { return !operator==(lhs, rhs); } +/** + * @param columnsJsonArray The JSON array containing the column definitions. + * @return The list of column definitions. + * @throw std::invalid_argument If the input JSON array is not valid. + */ +std::list parseSqlColumns(nlohmann::json const& columnsJsonArray); + /** * This class is an abstraction for columns within table index * specifications. @@ -130,6 +140,30 @@ class SqlIndexColumn { bool ascending = true; }; +/** + * This class is an abstraction for the index definitions. + */ +class SqlIndexDef { +public: + SqlIndexDef() = default; + + /** + * Parse the definition from the input JSON object. + * @param indexSpecJson The JSON object containing the index definitions. + * @throw std::invalid_argument If the input JSON object is not valid.
+ */ + SqlIndexDef(nlohmann::json const& indexSpecJson); + + SqlIndexDef(SqlIndexDef const&) = default; + SqlIndexDef& operator=(SqlIndexDef const&) = default; + ~SqlIndexDef() = default; + + std::string spec; + std::string name; + std::string comment; + std::list> keys; +}; + /** * Class ReplicationRequestParams encapsulates parameters of the replica * creation requests. diff --git a/src/replica/util/Performance.cc b/src/replica/util/Performance.cc index 8e3292d687..ae30b0ac3a 100644 --- a/src/replica/util/Performance.cc +++ b/src/replica/util/Performance.cc @@ -30,6 +30,7 @@ #include "lsst/log/Log.h" using namespace std; +using json = nlohmann::json; namespace { diff --git a/src/replica/util/Performance.h b/src/replica/util/Performance.h index fcbfd394a4..15320d08be 100644 --- a/src/replica/util/Performance.h +++ b/src/replica/util/Performance.h @@ -33,6 +33,9 @@ #include #include +// Third party headers +#include "nlohmann/json.hpp" + // Forward declarations namespace lsst::qserv::replica { class ProtocolPerformance; @@ -56,7 +59,6 @@ class Performance { * All (but the request creation one) timestamps will be initialized with 0. 
*/ Performance(); - Performance(Performance const&) = default; Performance& operator=(Performance const&) = default; @@ -64,45 +66,28 @@ class Performance { /** * Update object state with counters from the protocol buffer object - * - * @param workerPerformanceInfo - * counters to be carried over into an internal state + * @param workerPerformanceInfo counters to be carried over into an internal state */ void update(ProtocolPerformance const& workerPerformanceInfo); /** * Update the Controller's 'start' time - * - * @return - * the previous state of the counter + * @return the previous state of the counter */ uint64_t setUpdateStart(); /** * Update the Controller's 'finish' time - * - * @return - * the previous state of the counter + * @return the previous state of the counter */ uint64_t setUpdateFinish(); - /// Created by the Controller - uint64_t c_create_time; - - /// Started by the Controller - uint64_t c_start_time; - - /// Received by a worker service - uint64_t w_receive_time; - - /// Execution started by a worker service - uint64_t w_start_time; - - /// Execution finished by a worker service - uint64_t w_finish_time; - - /// A subscriber notified by the Controller - uint64_t c_finish_time; + uint64_t c_create_time; ///< Created by the Controller + uint64_t c_start_time; ///< Started by the Controller + uint64_t w_receive_time; ///< Received by a worker service + uint64_t w_start_time; ///< Execution started by a worker service + uint64_t w_finish_time; ///< Execution finished by a worker service + uint64_t c_finish_time; ///< A subscriber notified by the Controller }; /// Overloaded streaming operator for class Performance @@ -127,6 +112,7 @@ class WorkerPerformance { uint64_t setUpdateFinish(); std::unique_ptr info() const; + nlohmann::json toJson() const; std::atomic receive_time; ///< Received by a worker service std::atomic start_time; ///< Execution started by a worker service diff --git a/src/replica/worker/CMakeLists.txt 
b/src/replica/worker/CMakeLists.txt index a37868d82b..7b4d8ff462 100644 --- a/src/replica/worker/CMakeLists.txt +++ b/src/replica/worker/CMakeLists.txt @@ -4,17 +4,29 @@ target_sources(replica_worker PRIVATE FileClient.cc FileServer.cc FileServerConnection.cc + WorkerCreateReplicaHttpRequest.cc + WorkerDeleteReplicaHttpRequest.cc WorkerDeleteRequest.cc + WorkerDirectorIndexHttpRequest.cc WorkerDirectorIndexRequest.cc + WorkerEchoHttpRequest.cc WorkerEchoRequest.cc WorkerFindAllRequest.cc + WorkerFindAllReplicasHttpRequest.cc + WorkerFindReplicaHttpRequest.cc WorkerFindRequest.cc + WorkerHttpProcessor.cc + WorkerHttpProcessorThread.cc + WorkerHttpRequest.cc + WorkerHttpSvc.cc + WorkerHttpSvcMod.cc WorkerProcessor.cc WorkerProcessorThread.cc WorkerReplicationRequest.cc WorkerRequest.cc WorkerServer.cc WorkerServerConnection.cc + WorkerSqlHttpRequest.cc WorkerSqlRequest.cc ) target_link_libraries(replica_worker PUBLIC diff --git a/src/replica/worker/WorkerCreateReplicaHttpRequest.cc b/src/replica/worker/WorkerCreateReplicaHttpRequest.cc new file mode 100644 index 0000000000..582b7c9594 --- /dev/null +++ b/src/replica/worker/WorkerCreateReplicaHttpRequest.cc @@ -0,0 +1,467 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . 
+ */ + +// Class header +#include "replica/worker/WorkerCreateReplicaHttpRequest.h" + +// System headers +#include +#include +#include +#include + +// Qserv headers +#include "replica/config/Configuration.h" +#include "replica/proto/Protocol.h" +#include "replica/services/ServiceProvider.h" +#include "replica/util/FileUtils.h" +#include "replica/worker/FileClient.h" +#include "util/TimeUtils.h" + +// LSST headers +#include "lsst/log/Log.h" + +#define CONTEXT context("WorkerCreateReplicaHttpRequest", __func__) + +using namespace std; +namespace fs = boost::filesystem; +using json = nlohmann::json; + +namespace { + +LOG_LOGGER _log = LOG_GET("lsst.qserv.replica.WorkerCreateReplicaHttpRequest"); + +} // namespace + +namespace lsst::qserv::replica { + +shared_ptr WorkerCreateReplicaHttpRequest::create( + shared_ptr const& serviceProvider, string const& worker, + protocol::QueuedRequestHdr const& hdr, json const& req, ExpirationCallbackType const& onExpired) { + auto ptr = shared_ptr( + new WorkerCreateReplicaHttpRequest(serviceProvider, worker, hdr, req, onExpired)); + ptr->init(); + return ptr; +} + +WorkerCreateReplicaHttpRequest::WorkerCreateReplicaHttpRequest( + shared_ptr const& serviceProvider, string const& worker, + protocol::QueuedRequestHdr const& hdr, json const& req, ExpirationCallbackType const& onExpired) + : WorkerHttpRequest(serviceProvider, worker, "REPLICATE", hdr, req, onExpired), + _databaseInfo(serviceProvider->config()->databaseInfo(req.at("database"))), + _chunk(req.at("chunk")), + _sourceWorker(req.at("worker")), + _sourceWorkerHost(req.at("worker_host")), + _sourceWorkerPort(req.at("worker_port")), + _sourceWorkerHostPort(_sourceWorkerHost + ":" + to_string(_sourceWorkerPort)), + _initialized(false), + _files(FileUtils::partitionedFiles(_databaseInfo, _chunk)), + _tmpFilePtr(nullptr), + _buf(0), + _bufSize(serviceProvider->config()->get("worker", "fs-buf-size-bytes")) { + if (worker == _sourceWorker) { + throw invalid_argument(CONTEXT + " 
workers are the same in the request."); + } + if (_sourceWorkerHost.empty()) { + throw invalid_argument(CONTEXT + " the DNS name or an IP address of the worker not provided."); + } +} + +WorkerCreateReplicaHttpRequest::~WorkerCreateReplicaHttpRequest() { + replica::Lock lock(_mtx, CONTEXT); + _releaseResources(lock); +} + +void WorkerCreateReplicaHttpRequest::getResult(json& result) const { + // No locking is needed here since the method is called only after + // the request is completed. + result["replica_info"] = _replicaInfo.toJson(); +} + +bool WorkerCreateReplicaHttpRequest::execute() { + LOGS(_log, LOG_LVL_DEBUG, + CONTEXT << " sourceWorkerHostPort: " << _sourceWorkerHostPort << " database: " << _databaseInfo.name + << " chunk: " << _chunk); + + replica::Lock lock(_mtx, CONTEXT); + checkIfCancelling(lock, CONTEXT); + + // Obtain the list of files to be migrated + // + // IMPLEMENTATION NOTES: + // + // - Note using the overloaded operator '/' which is used to form + // folders and files path names below. The operator will concatenate + // names and also insert a file separator for an operating system + // on which this code will get compiled. + // + // - Temporary file names at a destination folders are prepended with + // prefix '_' to prevent colliding with the canonical names. They will + // be renamed in the last step. + // + // - All operations with the file system namespace (creating new non-temporary + // files, checking for folders and files, renaming files, creating folders, etc.) + // are guarded by acquiring replica::Lock lock(_mtxDataFolderOperations) where it's needed. 
+ + WorkerHttpRequest::ErrorContext errorContext; + + /////////////////////////////////////////////////////// + // Initialization phase (runs only once) // + /////////////////////////////////////////////////////// + + if (!_initialized) { + _initialized = true; + + fs::path const outDir = + fs::path(serviceProvider()->config()->get("worker", "data-dir")) / _databaseInfo.name; + + vector tmpFiles; + vector outFiles; + for (auto&& file : _files) { + fs::path const tmpFile = outDir / ("_" + file); + tmpFiles.push_back(tmpFile); + + fs::path const outFile = outDir / file; + outFiles.push_back(outFile); + + _file2descr[file].inSizeBytes = 0; + _file2descr[file].outSizeBytes = 0; + _file2descr[file].mtime = 0; + _file2descr[file].cs = 0; + _file2descr[file].tmpFile = tmpFile; + _file2descr[file].outFile = outFile; + _file2descr[file].beginTransferTime = 0; + _file2descr[file].endTransferTime = 0; + } + + // Check input files, check and sanitize the destination folder + + boost::system::error_code ec; + { + replica::Lock dataFolderLock(_mtxDataFolderOperations, CONTEXT); + + // Check for a presence of input files and calculate space requirement + + uintmax_t totalBytes = 0; // the total number of bytes in all input files to be moved + map file2size; // the number of bytes in each file + + for (auto&& file : _files) { + // Open the file on the remote server in the no-content-read mode + auto const inFilePtr = FileClient::stat(_serviceProvider, _sourceWorkerHost, + _sourceWorkerPort, _databaseInfo.name, file); + errorContext = + errorContext or + reportErrorIf(inFilePtr == nullptr, protocol::StatusExt::FILE_ROPEN, + "failed to open input file on remote worker: " + _sourceWorker + " (" + + _sourceWorkerHostPort + "), database: " + _databaseInfo.name + + ", file: " + file); + if (errorContext.failed) { + setStatus(lock, protocol::Status::FAILED, errorContext.extendedStatus); + return true; + } + file2size[file] = inFilePtr->size(); + totalBytes += inFilePtr->size(); + 
_file2descr[file].inSizeBytes = inFilePtr->size(); + _file2descr[file].mtime = inFilePtr->mtime(); + } + + // Check and sanitize the output directory + + bool const outDirExists = fs::exists(outDir, ec); + errorContext = + errorContext or + reportErrorIf(ec.value() != 0, protocol::StatusExt::FOLDER_STAT, + "failed to check the status of output directory: " + outDir.string()) or + reportErrorIf(!outDirExists, protocol::StatusExt::NO_FOLDER, + "the output directory doesn't exist: " + outDir.string()); + + // The files with canonical(!) names should NOT exist at the destination + // folder. + for (auto&& file : outFiles) { + fs::file_status const stat = fs::status(file, ec); + errorContext = errorContext or + reportErrorIf(stat.type() == fs::status_error, protocol::StatusExt::FILE_STAT, + "failed to check the status of output file: " + file.string()) or + reportErrorIf(fs::exists(stat), protocol::StatusExt::FILE_EXISTS, + "the output file already exists: " + file.string()); + } + + // Check if there are any files with the temporary names at the destination + // folder and if so then get rid of them. 
+ for (auto&& file : tmpFiles) { + fs::file_status const stat = fs::status(file, ec); + errorContext = + errorContext or + reportErrorIf(stat.type() == fs::status_error, protocol::StatusExt::FILE_STAT, + "failed to check the status of temporary file: " + file.string()); + if (fs::exists(stat)) { + fs::remove(file, ec); + errorContext = errorContext or + reportErrorIf(ec.value() != 0, protocol::StatusExt::FILE_DELETE, + "failed to remove temporary file: " + file.string()); + } + } + + // Make sure a file system at the destination has enough space + // to accommodate new files + // + // NOTE: this operation runs after cleaning up temporary files + fs::space_info const space = fs::space(outDir, ec); + errorContext = + errorContext or + reportErrorIf( + ec.value() != 0, protocol::StatusExt::SPACE_REQ, + "failed to obtaine space information at output folder: " + outDir.string()) or + reportErrorIf(space.available < totalBytes, protocol::StatusExt::NO_SPACE, + "not enough free space availble at output folder: " + outDir.string()); + + // Pre-create temporary files with the final size to assert disk space + // availability before filling these files with the actual payload. 
+ for (auto&& file : _files) { + fs::path const tmpFile = _file2descr[file].tmpFile; + + // Create a file of size 0 + FILE* tmpFilePtr = fopen(tmpFile.string().c_str(), "wb"); + errorContext = errorContext or + reportErrorIf(tmpFilePtr == nullptr, protocol::StatusExt::FILE_CREATE, + "failed to open/create temporary file: " + tmpFile.string() + + ", error: " + strerror(errno)); + if (tmpFilePtr) { + fflush(tmpFilePtr); + fclose(tmpFilePtr); + } + + // Resize the file (will be filled with \0) + fs::resize_file(tmpFile, file2size[file], ec); + errorContext = errorContext or + reportErrorIf(ec.value() != 0, protocol::StatusExt::FILE_RESIZE, + "failed to resize the temporary file: " + tmpFile.string()); + } + } + if (errorContext.failed) { + setStatus(lock, protocol::Status::FAILED, errorContext.extendedStatus); + return true; + } + + // Allocate the record buffer + _buf = new uint8_t[_bufSize]; + if (_buf == nullptr) throw runtime_error(CONTEXT + " buffer allocation failed"); + + // Setup the iterator for the name of the very first file to be copied + _fileItr = _files.begin(); + if (!_openFiles(lock)) return true; + } + + // Copy the next record from the currently open remote file + // into the corresponding temporary files at the destination folder + // w/o acquiring the directory lock. 
+ // + // NOTE: the while loop below is meant to skip files which are empty + while (_files.end() != _fileItr) { + // Copy the next record if any is available + size_t num = 0; + try { + num = _inFilePtr->read(_buf, _bufSize); + if (num) { + if (num == fwrite(_buf, sizeof(uint8_t), num, _tmpFilePtr)) { + // Update the descriptor (the number of bytes copied so far + // and the control sum) + _file2descr[*_fileItr].outSizeBytes += num; + uint64_t& cs = _file2descr[*_fileItr].cs; + for (uint8_t *ptr = _buf, *end = _buf + num; ptr != end; ++ptr) { + cs += *ptr; + } + + // Keep updating this stats while copying the files + _file2descr[*_fileItr].endTransferTime = util::TimeUtils::now(); + _updateInfo(lock); + + // Keep copying the same file + return false; + } + errorContext = errorContext or reportErrorIf(true, protocol::StatusExt::FILE_WRITE, + "failed to write into temporary file: " + + _file2descr[*_fileItr].tmpFile.string() + + ", error: " + strerror(errno)); + } + } catch (FileClientError const& ex) { + errorContext = + errorContext or + reportErrorIf(true, protocol::StatusExt::FILE_READ, + "failed to read input file from remote worker: " + _sourceWorker + " (" + + _sourceWorkerHostPort + "), database: " + _databaseInfo.name + + ", file: " + *_fileItr); + } + + // Make sure the number of bytes copied from the remote server + // matches expectations. 
+ errorContext = + errorContext or + reportErrorIf(_file2descr[*_fileItr].inSizeBytes != _file2descr[*_fileItr].outSizeBytes, + protocol::StatusExt::FILE_READ, + "short read of the input file from remote worker: " + _sourceWorker + " (" + + _sourceWorkerHostPort + "), database: " + _databaseInfo.name + + ", file: " + *_fileItr); + if (errorContext.failed) { + setStatus(lock, protocol::Status::FAILED, errorContext.extendedStatus); + _releaseResources(lock); + return true; + } + + // Flush and close the current file + fflush(_tmpFilePtr); + fclose(_tmpFilePtr); + _tmpFilePtr = 0; + + // Keep updating this stats after finishing to copy each file + _file2descr[*_fileItr].endTransferTime = util::TimeUtils::now(); + _updateInfo(lock); + + // Move the iterator to the name of the next file to be copied + ++_fileItr; + if (_files.end() != _fileItr) { + if (!_openFiles(lock)) { + _releaseResources(lock); + return true; + } + } + } + + // Finalize the operation, de-allocate resources, etc. + return _finalize(lock); +} + +bool WorkerCreateReplicaHttpRequest::_openFiles(replica::Lock const& lock) { + LOGS(_log, LOG_LVL_DEBUG, + CONTEXT << " sourceWorkerHostPort: " << _sourceWorkerHostPort << " database: " << _databaseInfo.name + << " chunk: " << _chunk << " file: " << *_fileItr); + + WorkerHttpRequest::ErrorContext errorContext; + + // Open the input file on the remote server + _inFilePtr = FileClient::open(_serviceProvider, _sourceWorkerHost, _sourceWorkerPort, _databaseInfo.name, + *_fileItr); + errorContext = errorContext or + reportErrorIf(_inFilePtr == nullptr, protocol::StatusExt::FILE_ROPEN, + "failed to open input file on remote worker: " + _sourceWorker + " (" + + _sourceWorkerHostPort + "), database: " + _databaseInfo.name + + ", file: " + *_fileItr); + if (errorContext.failed) { + setStatus(lock, protocol::Status::FAILED, errorContext.extendedStatus); + return false; + } + + // Reopen a temporary output file locally in the 'append binary mode' + // then 'rewind' to 
the beginning of the file before writing into it. + fs::path const tmpFile = _file2descr[*_fileItr].tmpFile; + + _tmpFilePtr = fopen(tmpFile.string().c_str(), "wb"); + errorContext = errorContext or reportErrorIf(_tmpFilePtr == nullptr, protocol::StatusExt::FILE_OPEN, + "failed to open temporary file: " + tmpFile.string() + + ", error: " + strerror(errno)); + if (errorContext.failed) { + setStatus(lock, protocol::Status::FAILED, errorContext.extendedStatus); + return false; + } + rewind(_tmpFilePtr); + _file2descr[*_fileItr].beginTransferTime = util::TimeUtils::now(); + return true; +} + +bool WorkerCreateReplicaHttpRequest::_finalize(replica::Lock const& lock) { + LOGS(_log, LOG_LVL_DEBUG, + CONTEXT << " sourceWorkerHostPort: " << _sourceWorkerHostPort << " database: " << _databaseInfo.name + << " chunk: " << _chunk); + + // Unconditionally regardless of the completion of the file renaming attempt + _releaseResources(lock); + + // Rename temporary files into the canonical ones + // Note that this operation changes the directory namespace in a way + // which may affect other users (like replica lookup operations, etc.). Hence we're + // acquiring the directory lock to guarantee a consistent view onto the folder. + replica::Lock dataFolderLock(_mtxDataFolderOperations, CONTEXT); + + // ATTENTION: as per ISO/IEC 9945 the file rename operation will + // remove empty files. Not sure if this should be treated + // in a special way? 
+ WorkerHttpRequest::ErrorContext errorContext; + boost::system::error_code ec; + for (auto&& file : _files) { + fs::path const tmpFile = _file2descr[file].tmpFile; + fs::path const outFile = _file2descr[file].outFile; + + fs::rename(tmpFile, outFile, ec); + errorContext = errorContext or reportErrorIf(ec.value() != 0, protocol::StatusExt::FILE_RENAME, + "failed to rename file: " + tmpFile.string()); + fs::last_write_time(outFile, _file2descr[file].mtime, ec); + errorContext = errorContext or reportErrorIf(ec.value() != 0, protocol::StatusExt::FILE_MTIME, + "failed to change 'mtime' of file: " + tmpFile.string()); + } + if (errorContext.failed) { + setStatus(lock, protocol::Status::FAILED, errorContext.extendedStatus); + return true; + } + setStatus(lock, protocol::Status::SUCCESS); + return true; +} + +void WorkerCreateReplicaHttpRequest::_updateInfo(replica::Lock const& lock) { + size_t totalInSizeBytes = 0; + size_t totalOutSizeBytes = 0; + ReplicaInfo::FileInfoCollection fileInfoCollection; + for (auto&& file : _files) { + fileInfoCollection.emplace_back( + ReplicaInfo::FileInfo({file, _file2descr[file].outSizeBytes, _file2descr[file].mtime, + to_string(_file2descr[file].cs), _file2descr[file].beginTransferTime, + _file2descr[file].endTransferTime, _file2descr[file].inSizeBytes})); + totalInSizeBytes += _file2descr[file].inSizeBytes; + totalOutSizeBytes += _file2descr[file].outSizeBytes; + } + ReplicaInfo::Status const status = + (_files.size() == fileInfoCollection.size()) and (totalInSizeBytes == totalOutSizeBytes) + ? 
ReplicaInfo::Status::COMPLETE + : ReplicaInfo::Status::INCOMPLETE; + + // Fill in the info on the chunk before finishing the operation + WorkerCreateReplicaHttpRequest::_replicaInfo = ReplicaInfo(status, worker(), _databaseInfo.name, _chunk, + util::TimeUtils::now(), fileInfoCollection); +} + +void WorkerCreateReplicaHttpRequest::_releaseResources(replica::Lock const& lock) { + // Drop a connection to the remote server + _inFilePtr.reset(); + + // Close the output file + if (_tmpFilePtr) { + fflush(_tmpFilePtr); + fclose(_tmpFilePtr); + _tmpFilePtr = nullptr; + } + + // Release the record buffer + if (_buf) { + delete[] _buf; + _buf = nullptr; + } +} + +} // namespace lsst::qserv::replica diff --git a/src/replica/worker/WorkerCreateReplicaHttpRequest.h b/src/replica/worker/WorkerCreateReplicaHttpRequest.h new file mode 100644 index 0000000000..364a92934c --- /dev/null +++ b/src/replica/worker/WorkerCreateReplicaHttpRequest.h @@ -0,0 +1,186 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . 
+ */ +#ifndef LSST_QSERV_REPLICA_WORKERCREATEREPLICAHTTPREQUEST_H +#define LSST_QSERV_REPLICA_WORKERCREATEREPLICAHTTPREQUEST_H + +// System headers +#include +#include +#include +#include +#include +#include + +// Third party headers +#include "boost/filesystem.hpp" +#include "nlohmann/json.hpp" + +// Qserv headers +#include "replica/config/ConfigDatabase.h" +#include "replica/util/ReplicaInfo.h" +#include "replica/worker/WorkerHttpRequest.h" + +// Forward declarations +namespace lsst::qserv::replica { +class FileClient; +class ServiceProvider; +} // namespace lsst::qserv::replica + +namespace lsst::qserv::replica::protocol { +struct QueuedRequestHdr; +} // namespace lsst::qserv::replica::protocol + +// This header declarations +namespace lsst::qserv::replica { + +/** + * Class WorkerCreateReplicaHttpRequest represents a context and a state of replication + * requests within the worker servers. + */ +class WorkerCreateReplicaHttpRequest : public WorkerHttpRequest { +public: + /** + * Static factory method is needed to prevent issue with the lifespan + * and memory management of instances created otherwise (as values or via + * low-level pointers). + * + * @param serviceProvider provider is needed to access the Configuration + * of a setup and for validating the input parameters + * @param worker the name of a worker. The name must match the worker which + * is going to execute the request. 
+ * @param hdr request header (common parameters of the queued request) + * @param req the request object received from a client (request-specific parameters) + * @param onExpired request expiration callback function + * @return pointer to the created object + */ + static std::shared_ptr create( + std::shared_ptr const& serviceProvider, std::string const& worker, + protocol::QueuedRequestHdr const& hdr, nlohmann::json const& req, + ExpirationCallbackType const& onExpired); + + WorkerCreateReplicaHttpRequest() = delete; + WorkerCreateReplicaHttpRequest(WorkerCreateReplicaHttpRequest const&) = delete; + WorkerCreateReplicaHttpRequest& operator=(WorkerCreateReplicaHttpRequest const&) = delete; + + /// Non-trivial destructor is needed to relese resources + ~WorkerCreateReplicaHttpRequest() override; + + bool execute() override; + +protected: + void getResult(nlohmann::json& result) const override; + +private: + WorkerCreateReplicaHttpRequest(std::shared_ptr const& serviceProvider, + std::string const& worker, protocol::QueuedRequestHdr const& hdr, + nlohmann::json const& req, ExpirationCallbackType const& onExpired); + + /** + * Open files associated with the current state of iterator _fileItr. + * @param lock lock which must be acquired before calling this method + * @return 'false' in case of any error + */ + bool _openFiles(replica::Lock const& lock); + + /** + * The final stage to be executed just once after copying the content + * of the remote files into the local temporary ones. It will rename + * the temporary files into the standard ones. Resources will also be released. + * @param lock A lock to be acquired before calling this method + * @return always 'true' + */ + bool _finalize(replica::Lock const& lock); + + /** + * Close connections, de-allocate resources, etc. + * + * Any connections and open files will be closed, the buffers will be + * released to prevent unnecessary resource utilization. 
Note that + * request objects can stay in the server's memory for an extended + * period of time. + * @param lock A lock to be acquired before calling this method + */ + void _releaseResources(replica::Lock const& lock); + + /** + * Update file migration statistics + * @param lock A lock to be acquired before calling this method + */ + void _updateInfo(replica::Lock const& lock); + + // Input parameters (extracted from the request object) + + DatabaseInfo const _databaseInfo; ///< Database descriptor obtained from the Configuration + unsigned int const _chunk; + std::string const _sourceWorker; + std::string const _sourceWorkerHost; + uint16_t const _sourceWorkerPort; + std::string const _sourceWorkerHostPort; + std::string const _sourceWorkerDataDir; + + /// Result of the operation + ReplicaInfo _replicaInfo; + + /// The flag indicating if the initialization phase of the operation + /// has already completed + bool _initialized; + + std::vector const _files; ///< Short names of files to be copied + + /// The iterator pointing to the currently processed file. + /// If it's set to _files.end() then it means the operation + /// has finished. + std::vector::const_iterator _fileItr; + + /// This object represents the currently open (if any) input file + /// on the source worker node + std::shared_ptr _inFilePtr; + + std::FILE* _tmpFilePtr; ///< The file pointer for the temporary output file + + /// The FileDescr structure encapsulates various parameters of a file + struct FileDescr { + size_t inSizeBytes = 0; ///< The input file size as reported by a remote server + size_t outSizeBytes = 0; ///< Num. 
bytes read so far (changes during processing) + std::time_t mtime = 0; ///< The last modification time of the file (sec, UNIX Epoch) + uint64_t cs = 0; ///< Control sum computed locally while copying the file + + boost::filesystem::path tmpFile; /// The absolute path to the temporary file + + /// The final (canonic) file name the temporary file will be renamed as + /// upon a successful completion of the operation. + boost::filesystem::path outFile; + + uint64_t beginTransferTime = 0; ///< When the file transfer started + uint64_t endTransferTime = 0; ///< When the file transfer ended + }; + + /// Cached file descriptions mapping from short file names into + /// the corresponding parameters. + std::map _file2descr; + + uint8_t* _buf; ///< The buffer for storing file payload read from the remote service + size_t _bufSize; ///< The size of the buffer +}; + +} // namespace lsst::qserv::replica + +#endif // LSST_QSERV_REPLICA_WORKERCREATEREPLICAHTTPREQUEST_H diff --git a/src/replica/worker/WorkerDeleteReplicaHttpRequest.cc b/src/replica/worker/WorkerDeleteReplicaHttpRequest.cc new file mode 100644 index 0000000000..065671a3f7 --- /dev/null +++ b/src/replica/worker/WorkerDeleteReplicaHttpRequest.cc @@ -0,0 +1,116 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . + */ + +// Class header +#include "replica/worker/WorkerDeleteReplicaHttpRequest.h" + +// System headers +#include + +// Third party headers +#include "boost/filesystem.hpp" + +// Qserv headers +#include "replica/config/Configuration.h" +#include "replica/proto/Protocol.h" +#include "replica/services/ServiceProvider.h" +#include "replica/util/FileUtils.h" +#include "util/TimeUtils.h" + +// LSST headers +#include "lsst/log/Log.h" + +#define CONTEXT context("WorkerDeleteReplicaHttpRequest", __func__) + +using namespace std; +namespace fs = boost::filesystem; +using json = nlohmann::json; + +namespace { + +LOG_LOGGER _log = LOG_GET("lsst.qserv.replica.WorkerDeleteReplicaHttpRequest"); + +} // namespace + +namespace lsst::qserv::replica { + +shared_ptr WorkerDeleteReplicaHttpRequest::create( + shared_ptr const& serviceProvider, string const& worker, + protocol::QueuedRequestHdr const& hdr, json const& req, ExpirationCallbackType const& onExpired) { + auto ptr = shared_ptr( + new WorkerDeleteReplicaHttpRequest(serviceProvider, worker, hdr, req, onExpired)); + ptr->init(); + return ptr; +} + +WorkerDeleteReplicaHttpRequest::WorkerDeleteReplicaHttpRequest( + shared_ptr const& serviceProvider, string const& worker, + protocol::QueuedRequestHdr const& hdr, json const& req, ExpirationCallbackType const& onExpired) + : WorkerHttpRequest(serviceProvider, worker, "DELETE", hdr, req, onExpired), + _databaseInfo(serviceProvider->config()->databaseInfo(req.at("database"))), + _chunk(req.at("chunk")), + // This status will be returned in all contexts + _replicaInfo(ReplicaInfo::Status::NOT_FOUND, worker, _databaseInfo.name, _chunk, + util::TimeUtils::now(), ReplicaInfo::FileInfoCollection{}) {} + +void WorkerDeleteReplicaHttpRequest::getResult(json& result) const { + // No locking is needed here since the method is called only after 
+ // the request is completed. + result["replica_info"] = _replicaInfo.toJson(); +} +bool WorkerDeleteReplicaHttpRequest::execute() { + LOGS(_log, LOG_LVL_DEBUG, CONTEXT << " db: " << _databaseInfo.name << " chunk: " << _chunk); + + replica::Lock lock(_mtx, CONTEXT); + checkIfCancelling(lock, CONTEXT); + + vector const files = FileUtils::partitionedFiles(_databaseInfo, _chunk); + + // The data folder will be locked while performing the operation + int numFilesDeleted = 0; + WorkerHttpRequest::ErrorContext errorContext; + boost::system::error_code ec; + { + replica::Lock dataFolderLock(_mtxDataFolderOperations, CONTEXT); + fs::path const dataDir = + fs::path(_serviceProvider->config()->get("worker", "data-dir")) / _databaseInfo.name; + fs::file_status const stat = fs::status(dataDir, ec); + errorContext = errorContext or + reportErrorIf(stat.type() == fs::status_error, protocol::StatusExt::FOLDER_STAT, + "failed to check the status of directory: " + dataDir.string()) or + reportErrorIf(!fs::exists(stat), protocol::StatusExt::NO_FOLDER, + "the directory does not exists: " + dataDir.string()); + for (const auto& name : files) { + const fs::path file = dataDir / fs::path(name); + if (fs::remove(file, ec)) ++numFilesDeleted; + errorContext = errorContext or reportErrorIf(ec.value() != 0, protocol::StatusExt::FILE_DELETE, + "failed to delete file: " + file.string()); + } + } + if (errorContext.failed) { + setStatus(lock, protocol::Status::FAILED, errorContext.extendedStatus); + return true; + } + setStatus(lock, protocol::Status::SUCCESS); + return true; +} + +} // namespace lsst::qserv::replica diff --git a/src/replica/worker/WorkerDeleteReplicaHttpRequest.h b/src/replica/worker/WorkerDeleteReplicaHttpRequest.h new file mode 100644 index 0000000000..a862f082c5 --- /dev/null +++ b/src/replica/worker/WorkerDeleteReplicaHttpRequest.h @@ -0,0 +1,99 @@ +// -*- LSST-C++ -*- +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST 
Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . + */ +#ifndef LSST_QSERV_REPLICA_WORKERDELETEREPLICAHTTPREQUEST_H +#define LSST_QSERV_REPLICA_WORKERDELETEREPLICAHTTPREQUEST_H + +// System headers +#include + +// Third party headers +#include "nlohmann/json.hpp" + +// Qserv headers +#include "replica/config/ConfigDatabase.h" +#include "replica/util/ReplicaInfo.h" +#include "replica/worker/WorkerHttpRequest.h" + +// Forward declarations +namespace lsst::qserv::replica { +class ServiceProvider; +} // namespace lsst::qserv::replica + +namespace lsst::qserv::replica::protocol { +struct QueuedRequestHdr; +} // namespace lsst::qserv::replica::protocol + +// This header declarations +namespace lsst::qserv::replica { + +/** + * Class WorkerDeleteReplicaHttpRequest represents a context and a state of replica deletion + * requests within the worker servers. + */ +class WorkerDeleteReplicaHttpRequest : public WorkerHttpRequest { +public: + /** + * Static factory method is needed to prevent issue with the lifespan + * and memory management of instances created otherwise (as values or via + * low-level pointers). + * + * @param serviceProvider provider is needed to access the Configuration + * of a setup and for validating the input parameters + * @param worker the name of a worker. 
The name must match the worker which + * is going to execute the request. + * @param hdr request header (common parameters of the queued request) + * @param req the request object received from a client (request-specific parameters) + * @param onExpired request expiration callback function + * @return pointer to the created object + */ + static std::shared_ptr create( + std::shared_ptr const& serviceProvider, std::string const& worker, + protocol::QueuedRequestHdr const& hdr, nlohmann::json const& req, + ExpirationCallbackType const& onExpired); + + WorkerDeleteReplicaHttpRequest() = delete; + WorkerDeleteReplicaHttpRequest(WorkerDeleteReplicaHttpRequest const&) = delete; + WorkerDeleteReplicaHttpRequest& operator=(WorkerDeleteReplicaHttpRequest const&) = delete; + + ~WorkerDeleteReplicaHttpRequest() override = default; + + bool execute() override; + +protected: + void getResult(nlohmann::json& result) const override; + +private: + WorkerDeleteReplicaHttpRequest(std::shared_ptr const& serviceProvider, + std::string const& worker, protocol::QueuedRequestHdr const& hdr, + nlohmann::json const& req, ExpirationCallbackType const& onExpired); + + // Input parameters + DatabaseInfo const _databaseInfo; ///< Database descriptor obtained from the Configuration + unsigned int _chunk; + + /// Extended status of the replica deletion request + ReplicaInfo _replicaInfo; +}; + +} // namespace lsst::qserv::replica + +#endif // LSST_QSERV_REPLICA_WORKERDELETEREPLICAHTTPREQUEST_H diff --git a/src/replica/worker/WorkerDirectorIndexHttpRequest.cc b/src/replica/worker/WorkerDirectorIndexHttpRequest.cc new file mode 100644 index 0000000000..fec8eeec24 --- /dev/null +++ b/src/replica/worker/WorkerDirectorIndexHttpRequest.cc @@ -0,0 +1,292 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . + */ + +// Class header +#include "replica/worker/WorkerDirectorIndexHttpRequest.h" + +// System headers +#include +#include +#include +#include + +// Third party headers +#include "boost/filesystem.hpp" + +// Qserv headers +#include "global/constants.h" +#include "replica/config/Configuration.h" +#include "replica/mysql/DatabaseMySQL.h" +#include "replica/proto/Protocol.h" +#include "replica/services/ServiceProvider.h" +#include "replica/util/Performance.h" +#include "util/String.h" + +// LSST headers +#include "lsst/log/Log.h" + +#define CONTEXT context("WorkerDirectorIndexHttpRequest", __func__) + +using namespace std; +namespace fs = boost::filesystem; +using json = nlohmann::json; + +namespace { + +LOG_LOGGER _log = LOG_GET("lsst.qserv.replica.WorkerDirectorIndexHttpRequest"); + +} // namespace + +namespace lsst::qserv::replica { + +using namespace database::mysql; + +shared_ptr WorkerDirectorIndexHttpRequest::create( + shared_ptr const& serviceProvider, string const& worker, + protocol::QueuedRequestHdr const& hdr, json const& req, ExpirationCallbackType const& onExpired, + shared_ptr const& connectionPool) { + auto ptr = shared_ptr( + new WorkerDirectorIndexHttpRequest(serviceProvider, worker, hdr, req, onExpired, connectionPool)); + ptr->init(); + return ptr; +} + 
+WorkerDirectorIndexHttpRequest::WorkerDirectorIndexHttpRequest( + shared_ptr const& serviceProvider, string const& worker, + protocol::QueuedRequestHdr const& hdr, json const& req, ExpirationCallbackType const& onExpired, + shared_ptr const& connectionPool) + : WorkerHttpRequest(serviceProvider, worker, "INDEX", hdr, req, onExpired), + _databaseInfo(serviceProvider->config()->databaseInfo(req.at("database"))), + _tableInfo(_databaseInfo.findTable(req.at("director_table"))), + _hasTransactions(req.at("has_transaction")), + _transactionId(req.at("transaction_id")), + _chunk(req.at("chunk")), + _offset(req.at("offset")), + _connectionPool(connectionPool), + _tmpDirName(serviceProvider->config()->get("worker", "loader-tmp-dir") + "/" + + _databaseInfo.name), + _fileName(_tmpDirName + "/" + _tableInfo.name + "-" + to_string(_chunk) + + (_hasTransactions ? "-p" + to_string(_transactionId) : "") + "-" + hdr.id) {} + +void WorkerDirectorIndexHttpRequest::getResult(json& result) const { + // No locking is needed here since the method is called only after + // the request is completed. + result["error"] = _error; + result["data"] = util::String::toHex(_data.data(), _data.size()); + result["total_bytes"] = _fileSizeBytes; +} + +bool WorkerDirectorIndexHttpRequest::execute() { + LOGS(_log, LOG_LVL_DEBUG, CONTEXT); + + replica::Lock lock(_mtx, CONTEXT); + checkIfCancelling(lock, CONTEXT); + + try { + // The table will be scanned only when the offset is set to 0. + if (_offset == 0) { + // Create a folder (if it still doesn't exist) where the temporary files will be placed + // NOTE: this folder is supposed to be seen by the worker's MySQL/MariaDB server, and it + // must be write-enabled for an account under which the service is run. 
+ boost::system::error_code ec; + fs::create_directory(fs::path(_tmpDirName), ec); + if (ec.value() != 0) { + _error = "failed to create folder '" + _tmpDirName; + LOGS(_log, LOG_LVL_ERROR, CONTEXT << " " << _error); + setStatus(lock, protocol::Status::FAILED, protocol::StatusExt::FOLDER_CREATE); + } + + // Make sure no file exists from any previous attempt to harvest the index data + // in a scope of the request. Otherwise MySQL query will fail. + _removeFile(); + + // Connect to the worker database + // Manage the new connection via the RAII-style handler to ensure the transaction + // is automatically rolled-back in case of exceptions. + ConnectionHandler const h(_connectionPool); + + // A scope of the query depends on parameters of the request + h.conn->executeInOwnTransaction([self = shared_from_base()]( + auto conn) { conn->execute(self->_query(conn)); }); + } + if (auto const status = _readFile(_offset); status != protocol::StatusExt::NONE) { + setStatus(lock, protocol::Status::FAILED, status); + } else { + setStatus(lock, protocol::Status::SUCCESS); + } + } catch (ER_NO_SUCH_TABLE_ const& ex) { + LOGS(_log, LOG_LVL_ERROR, CONTEXT << " MySQL error: " << ex.what()); + _error = ex.what(); + setStatus(lock, protocol::Status::FAILED, protocol::StatusExt::NO_SUCH_TABLE); + } catch (database::mysql::ER_PARTITION_MGMT_ON_NONPARTITIONED_ const& ex) { + LOGS(_log, LOG_LVL_ERROR, CONTEXT << " MySQL error: " << ex.what()); + _error = ex.what(); + setStatus(lock, protocol::Status::FAILED, protocol::StatusExt::NOT_PARTITIONED_TABLE); + } catch (database::mysql::ER_UNKNOWN_PARTITION_ const& ex) { + LOGS(_log, LOG_LVL_ERROR, CONTEXT << " MySQL error: " << ex.what()); + _error = ex.what(); + setStatus(lock, protocol::Status::FAILED, protocol::StatusExt::NO_SUCH_PARTITION); + } catch (database::mysql::Error const& ex) { + LOGS(_log, LOG_LVL_ERROR, CONTEXT << " MySQL error: " << ex.what()); + _error = ex.what(); + setStatus(lock, protocol::Status::FAILED, 
protocol::StatusExt::MYSQL_ERROR); + } catch (invalid_argument const& ex) { + LOGS(_log, LOG_LVL_ERROR, CONTEXT << " exception: " << ex.what()); + _error = ex.what(); + setStatus(lock, protocol::Status::FAILED, protocol::StatusExt::INVALID_PARAM); + } catch (out_of_range const& ex) { + LOGS(_log, LOG_LVL_ERROR, CONTEXT << " exception: " << ex.what()); + _error = ex.what(); + setStatus(lock, protocol::Status::FAILED, protocol::StatusExt::LARGE_RESULT); + } catch (exception const& ex) { + LOGS(_log, LOG_LVL_ERROR, CONTEXT << " exception: " << ex.what()); + _error = "Exception: " + string(ex.what()); + setStatus(lock, protocol::Status::FAILED); + } + return true; +} + +string WorkerDirectorIndexHttpRequest::_query(shared_ptr const& conn) const { + if (!_tableInfo.isDirector()) { + throw invalid_argument("table '" + _tableInfo.name + + "' has not been configured as director in database '" + _databaseInfo.name + + "'"); + } + if (_tableInfo.directorTable.primaryKeyColumn().empty()) { + throw invalid_argument("director table '" + _tableInfo.name + + "' has not been properly configured in database '" + _databaseInfo.name + "'"); + } + if (_tableInfo.columns.empty()) { + throw invalid_argument("no schema found for director table '" + _tableInfo.name + "' of database '" + + _databaseInfo.name + "'"); + } + + // Find types required by the "director" index table's columns + + string const qservTransId = _hasTransactions ?
"qserv_trans_id" : string(); + string qservTransIdType; + string primaryKeyColumnType; + string subChunkIdColNameType; + + for (auto&& column : _tableInfo.columns) { + if (!qservTransId.empty() && column.name == qservTransId) + qservTransIdType = column.type; + else if (column.name == _tableInfo.directorTable.primaryKeyColumn()) + primaryKeyColumnType = column.type; + else if (column.name == lsst::qserv::SUB_CHUNK_COLUMN) + subChunkIdColNameType = column.type; + } + if ((!qservTransId.empty() && qservTransIdType.empty()) || primaryKeyColumnType.empty() || + subChunkIdColNameType.empty()) { + throw invalid_argument( + "column definitions for the Object identifier or sub-chunk identifier" + " columns are missing in the director table schema for table '" + + _tableInfo.name + "' of database '" + _databaseInfo.name + "'"); + } + + // NOTE: injecting the chunk number into each row of the result set because + // the chunk-id column is optional. + QueryGenerator const g(conn); + DoNotProcess const chunk = g.val(_chunk); + SqlId const sqlTableId = g.id(_databaseInfo.name, _tableInfo.name + "_" + to_string(_chunk)); + string query; + if (qservTransId.empty()) { + query = g.select(_tableInfo.directorTable.primaryKeyColumn(), chunk, lsst::qserv::SUB_CHUNK_COLUMN) + + g.from(sqlTableId) + g.orderBy(make_pair(_tableInfo.directorTable.primaryKeyColumn(), "")); + } else { + query = g.select(qservTransId, _tableInfo.directorTable.primaryKeyColumn(), chunk, + lsst::qserv::SUB_CHUNK_COLUMN) + + g.from(sqlTableId) + g.inPartition(g.partId(_transactionId)) + + g.orderBy(make_pair(qservTransId, ""), + make_pair(_tableInfo.directorTable.primaryKeyColumn(), "")); + } + return query + g.intoOutfile(_fileName); +} + +protocol::StatusExt WorkerDirectorIndexHttpRequest::_readFile(size_t offset) { + LOGS(_log, LOG_LVL_DEBUG, CONTEXT); + + // Open the file.
+ ifstream f(_fileName, ios::binary); + if (!f.good()) { + _error = "failed to open file '" + _fileName + "'"; + LOGS(_log, LOG_LVL_ERROR, CONTEXT << " " << _error); + return protocol::StatusExt::FILE_ROPEN; + } + + // Get the file size. + boost::system::error_code ec; + _fileSizeBytes = fs::file_size(_fileName, ec); + if (ec.value() != 0) { + _error = "failed to get file size '" + _fileName + "'"; + LOGS(_log, LOG_LVL_ERROR, CONTEXT << " " << _error); + return protocol::StatusExt::FILE_SIZE; + } + + // Validate a value of the offset and position indicator as requested. + if (offset == _fileSizeBytes) { + _removeFile(); + return protocol::StatusExt::NONE; + } else if (offset > _fileSizeBytes) { + _error = "attempted to read the file '" + _fileName + "' at the offset " + to_string(offset) + + " that is beyond the file size of " + to_string(_fileSizeBytes) + " bytes."; + LOGS(_log, LOG_LVL_ERROR, CONTEXT << " " << _error); + return protocol::StatusExt::INVALID_PARAM; + } else if (offset != 0) { + f.seekg(offset, ios::beg); + } + + // Resize the memory buffer for the efficiency of the following read. + size_t const recordSize = + std::min(_fileSizeBytes - offset, + serviceProvider()->config()->get("worker", "director-index-record-size")); + _data.resize(recordSize, ' '); + + // Read the specified number of bytes into the buffer. + protocol::StatusExt result = protocol::StatusExt::NONE; + f.read(&_data[0], recordSize); + if (f.bad()) { + _error = "failed to read " + to_string(recordSize) + " bytes from the file '" + _fileName + + "' at the offset " + to_string(offset) + "."; + LOGS(_log, LOG_LVL_ERROR, CONTEXT << " " << _error); + result = protocol::StatusExt::FILE_READ; + } + f.close(); + + // If this was the last record read from the file then delete the file. + if (offset + recordSize >= _fileSizeBytes) { + _removeFile(); + } + return result; +} + +void WorkerDirectorIndexHttpRequest::_removeFile() const { + // Make the best attempt to get rid of the temporary file.
Ignore any errors + // for now. Just report them. Note that 'remove_all' won't complain if the file + // didn't exist. + boost::system::error_code ec; + fs::remove_all(fs::path(_fileName), ec); + if (ec.value() != 0) { + LOGS(_log, LOG_LVL_WARN, CONTEXT << " failed to remove the temporary file '" << _fileName << "'"); + } +} + +} // namespace lsst::qserv::replica diff --git a/src/replica/worker/WorkerDirectorIndexHttpRequest.h b/src/replica/worker/WorkerDirectorIndexHttpRequest.h new file mode 100644 index 0000000000..1aa26715f7 --- /dev/null +++ b/src/replica/worker/WorkerDirectorIndexHttpRequest.h @@ -0,0 +1,149 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see .
+ */ +#ifndef LSST_QSERV_REPLICA_WORKERDIRECTORINDEXHTTPREQUEST_H +#define LSST_QSERV_REPLICA_WORKERDIRECTORINDEXHTTPREQUEST_H + +// System headers +#include +#include + +// Third party headers +#include "nlohmann/json.hpp" + +// Qserv headers +#include "replica/config/ConfigDatabase.h" +#include "replica/proto/Protocol.h" +#include "replica/util/Common.h" +#include "replica/worker/WorkerHttpRequest.h" + +// Forward declarations +namespace lsst::qserv::replica { +class ServiceProvider; +} // namespace lsst::qserv::replica + +namespace lsst::qserv::replica::database::mysql { +class Connection; +class ConnectionPool; +} // namespace lsst::qserv::replica::database::mysql + +// This header declarations +namespace lsst::qserv::replica { + +/** + * Class WorkerDirectorIndexHttpRequest queries a director table (the whole or just one MySQL + * partition, depending on parameters of the request) of a database + * to extract data to be loaded into the "director" index. + */ +class WorkerDirectorIndexHttpRequest : public WorkerHttpRequest { +public: + /** + * Static factory method is needed to prevent issues with the lifespan + * and memory management of instances created otherwise (as values or via + * low-level pointers). + * + * @param serviceProvider provider is needed to access the Configuration + * of a setup and for validating the input parameters + * @param worker the name of a worker. The name must match the worker which + * is going to execute the request.
+ * @param hdr request header (common parameters of the queued request) + * @param req the request object received from a client (request-specific parameters) + * @param onExpired request expiration callback function + * @param connectionPool a pool of connections to the MySQL/MariaDB server + * @return pointer to the created object + */ + static std::shared_ptr create( + std::shared_ptr const& serviceProvider, std::string const& worker, + protocol::QueuedRequestHdr const& hdr, nlohmann::json const& req, + ExpirationCallbackType const& onExpired, + std::shared_ptr const& connectionPool); + + WorkerDirectorIndexHttpRequest() = delete; + WorkerDirectorIndexHttpRequest(WorkerDirectorIndexHttpRequest const&) = delete; + WorkerDirectorIndexHttpRequest& operator=(WorkerDirectorIndexHttpRequest const&) = delete; + + ~WorkerDirectorIndexHttpRequest() override = default; + + bool execute() override; + +protected: + void getResult(nlohmann::json& result) const override; + +private: + WorkerDirectorIndexHttpRequest(std::shared_ptr const& serviceProvider, + std::string const& worker, protocol::QueuedRequestHdr const& hdr, + nlohmann::json const& req, ExpirationCallbackType const& onExpired, + std::shared_ptr const& connectionPool); + + /** + * The query generator uses parameters of a request to compose + * a desired query. + * + * @param conn a reference to the database connector is needed to process + * arguments to meet requirements of the database query processing engine. + * @return a query as per the input request + * @throws std::invalid_argument if the input parameters are not supported + */ + std::string _query(std::shared_ptr const& conn) const; + + /** + * Read the content of the file into memory starting from the given offset. + * @note The maximum number of bytes to read is set in the Configuration + * parameter (worker, director-index-record-size). + * @param offset A position of the first byte in the file to read.
+ * @return The completion status to be returned to the Controller. + */ + protocol::StatusExt _readFile(size_t offset); + + /// Get rid of the temporary file if it's still there. + void _removeFile() const; + + // Input parameters + DatabaseInfo const _databaseInfo; ///< Database descriptor obtained from the Configuration + TableInfo const _tableInfo; ///< Director table descriptor obtained from the Configuration + bool const _hasTransactions; ///< If true, query only the MySQL partition of _transactionId + TransactionId const _transactionId; ///< Selects the MySQL partition (used if _hasTransactions) + unsigned int const _chunk; ///< Chunk number (selects the chunk table to be queried) + std::size_t const _offset; ///< Offset (bytes) in the file to start reading from + std::shared_ptr const _connectionPool; ///< Provides connections to the worker database + + /// The path name of a temporary folder where the file will be stored. + /// The folder gets created before extracting data from the MySQL table + /// into the file. + std::string const _tmpDirName; + + /// The full path name of a temporary file into which the TSV/CSV dump will be made. + /// This file will get deleted when its whole content is sent to the Controller. + std::string const _fileName; + + /// The size of the file is determined each time before reading it. + size_t _fileSizeBytes = 0; + + /// Cached error to be sent to a client + std::string _error; + + /// In-memory storage for the content of the file upon a successful completion + /// of the data extraction query. + std::string _data; +}; + +} // namespace lsst::qserv::replica + +#endif // LSST_QSERV_REPLICA_WORKERDIRECTORINDEXHTTPREQUEST_H diff --git a/src/replica/worker/WorkerEchoHttpRequest.cc b/src/replica/worker/WorkerEchoHttpRequest.cc new file mode 100644 index 0000000000..9a699d2590 --- /dev/null +++ b/src/replica/worker/WorkerEchoHttpRequest.cc @@ -0,0 +1,97 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/).
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . + */ + +// Class header +#include "replica/worker/WorkerEchoHttpRequest.h" + +// System headers +#include +#include + +// Qserv headers +#include "util/BlockPost.h" + +// LSST headers +#include "lsst/log/Log.h" + +#define CONTEXT context("WorkerEchoHttpRequest", __func__) + +using namespace std; +using json = nlohmann::json; + +namespace { + +LOG_LOGGER _log = LOG_GET("lsst.qserv.replica.WorkerEchoHttpRequest"); + +} // namespace + +namespace lsst::qserv::replica { + +shared_ptr WorkerEchoHttpRequest::create( + shared_ptr const& serviceProvider, string const& worker, + protocol::QueuedRequestHdr const& hdr, json const& req, ExpirationCallbackType const& onExpired) { + auto ptr = shared_ptr( + new WorkerEchoHttpRequest(serviceProvider, worker, hdr, req, onExpired)); + ptr->init(); + return ptr; +} + +WorkerEchoHttpRequest::WorkerEchoHttpRequest(shared_ptr const& serviceProvider, + string const& worker, protocol::QueuedRequestHdr const& hdr, + json const& req, ExpirationCallbackType const& onExpired) + : WorkerHttpRequest(serviceProvider, worker, "TEST_ECHO", hdr, req, onExpired), + _delay(req.at("delay")), + _data(req.at("data")), + _delayLeft(_delay) { + if (_delay < 0) { + throw invalid_argument(CONTEXT + " invalid delay[ms]: " + to_string(_delay)); + } +} + +void WorkerEchoHttpRequest::getResult(json& result)
const { + // No locking is needed here since the method is called only after + // the request is completed. + result["data"] = _data; +} + +bool WorkerEchoHttpRequest::execute() { + LOGS(_log, LOG_LVL_DEBUG, CONTEXT << " delay[ms]: " << _delayLeft << " / " << _delay); + + replica::Lock lock(_mtx, CONTEXT); + checkIfCancelling(lock, CONTEXT); + + // Block the thread for a random number of milliseconds in the interval + // below. Then update the amount of time which is still left. + // The delay is in the range of [0..1] through [0..1000] milliseconds depending + // on the amount of time which is still left. + util::BlockPost blockPost(0, max(1, min(1000, _delayLeft))); + int const span = blockPost.wait(); + _delayLeft -= (span < _delayLeft) ? span : _delayLeft; // subtract at most what is left (never negative) + + // Done if the initial delay has been reached or exceeded + if (0 == _delayLeft) { + setStatus(lock, protocol::Status::SUCCESS); + return true; + } + return false; +} + +} // namespace lsst::qserv::replica diff --git a/src/replica/worker/WorkerEchoHttpRequest.h b/src/replica/worker/WorkerEchoHttpRequest.h new file mode 100644 index 0000000000..ef9b7be931 --- /dev/null +++ b/src/replica/worker/WorkerEchoHttpRequest.h @@ -0,0 +1,99 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program.
If not, + * see . + */ +#ifndef LSST_QSERV_REPLICA_WORKERECHOHTTPREQUEST_H +#define LSST_QSERV_REPLICA_WORKERECHOHTTPREQUEST_H + +// System headers +#include +#include +#include + +// Qserv headers +#include "replica/worker/WorkerHttpRequest.h" + +// Third party headers +#include "nlohmann/json.hpp" + +// Forward declarations + +namespace lsst::qserv::replica { +class ServiceProvider; +} // namespace lsst::qserv::replica + +namespace lsst::qserv::replica::protocol { +struct QueuedRequestHdr; +} // namespace lsst::qserv::replica::protocol + +// This header declarations +namespace lsst::qserv::replica { + +/** + * Class WorkerEchoHttpRequest implements test requests within the worker servers. + * Requests of this type don't have any side effects (in terms of modifying + * any files or databases). + */ +class WorkerEchoHttpRequest : public WorkerHttpRequest { +public: + /** + * Static factory method is needed to prevent issues with the lifespan + * and memory management of instances created otherwise (as values or via + * low-level pointers). + * + * @param serviceProvider provider is needed to access the Configuration + * of a setup and for validating the input parameters + * @param worker the name of a worker. The name must match the worker which + * is going to execute the request.
+ * @param hdr request header (common parameters of the queued request) + * @param req the request object received from a client (request-specific parameters) + * @param onExpired request expiration callback function + * @return pointer to the created object + */ + static std::shared_ptr create( + std::shared_ptr const& serviceProvider, std::string const& worker, + protocol::QueuedRequestHdr const& hdr, nlohmann::json const& req, + ExpirationCallbackType const& onExpired); + + WorkerEchoHttpRequest() = delete; + WorkerEchoHttpRequest(WorkerEchoHttpRequest const&) = delete; + WorkerEchoHttpRequest& operator=(WorkerEchoHttpRequest const&) = delete; + + ~WorkerEchoHttpRequest() override = default; + + bool execute() override; + +protected: + void getResult(nlohmann::json& result) const override; + +private: + WorkerEchoHttpRequest(std::shared_ptr const& serviceProvider, std::string const& worker, + protocol::QueuedRequestHdr const& hdr, nlohmann::json const& req, + ExpirationCallbackType const& onExpired); + + // Input parameters (extracted from the request object) + int const _delay; ///< The amount of the initial delay (milliseconds) + std::string const _data; ///< The message to be echoed back to the client + + int _delayLeft; ///< The amount of the initial delay which is still left (milliseconds) +}; + +} // namespace lsst::qserv::replica + +#endif // LSST_QSERV_REPLICA_WORKERECHOHTTPREQUEST_H diff --git a/src/replica/worker/WorkerFindAllReplicasHttpRequest.cc b/src/replica/worker/WorkerFindAllReplicasHttpRequest.cc new file mode 100644 index 0000000000..85094bdda1 --- /dev/null +++ b/src/replica/worker/WorkerFindAllReplicasHttpRequest.cc @@ -0,0 +1,157 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/).
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . + */ + +// Class header +#include "replica/worker/WorkerFindAllReplicasHttpRequest.h" + +// System headers +#include + +// Third party headers +#include "boost/filesystem.hpp" + +// Qserv headers +#include "replica/config/Configuration.h" +#include "replica/proto/Protocol.h" +#include "replica/util/FileUtils.h" +#include "replica/services/ServiceProvider.h" +#include "util/TimeUtils.h" + +// LSST headers +#include "lsst/log/Log.h" + +#define CONTEXT context("WorkerFindAllReplicasHttpRequest", __func__) + +using namespace std; +namespace fs = boost::filesystem; +using json = nlohmann::json; + +namespace { + +LOG_LOGGER _log = LOG_GET("lsst.qserv.replica.WorkerFindAllReplicasHttpRequest"); + +} // namespace + +namespace lsst::qserv::replica { + +shared_ptr WorkerFindAllReplicasHttpRequest::create( + shared_ptr const& serviceProvider, string const& worker, + protocol::QueuedRequestHdr const& hdr, json const& req, ExpirationCallbackType const& onExpired) { + auto ptr = shared_ptr( + new WorkerFindAllReplicasHttpRequest(serviceProvider, worker, hdr, req, onExpired)); + ptr->init(); + return ptr; +} + +WorkerFindAllReplicasHttpRequest::WorkerFindAllReplicasHttpRequest( + shared_ptr const& serviceProvider, string const& worker, + protocol::QueuedRequestHdr const& hdr, json const& req, ExpirationCallbackType const&
onExpired) + : WorkerHttpRequest(serviceProvider, worker, "FIND-ALL", hdr, req, onExpired), + _database(req.at("database")), + _databaseInfo(serviceProvider->config()->databaseInfo(_database)) {} + +void WorkerFindAllReplicasHttpRequest::getResult(json& result) const { + // No locking is needed here since the method is called only after + // the request is completed. + result["replica_info_many"] = json::array(); + for (auto const& replicaInfo : _replicaInfoCollection) { + result["replica_info_many"].push_back(replicaInfo.toJson()); + } +} + +bool WorkerFindAllReplicasHttpRequest::execute() { + LOGS(_log, LOG_LVL_DEBUG, CONTEXT << " database: " << _databaseInfo.name); + + replica::Lock lock(_mtx, CONTEXT); + checkIfCancelling(lock, CONTEXT); + + // Scan the data directory to find all files which match the expected pattern(s) + // and group them by their chunk number + WorkerHttpRequest::ErrorContext errorContext; + boost::system::error_code ec; + + map chunk2fileInfoCollection; + { + replica::Lock dataFolderLock(_mtxDataFolderOperations, CONTEXT); + fs::path const dataDir = + fs::path(_serviceProvider->config()->get("worker", "data-dir")) / _databaseInfo.name; + fs::file_status const stat = fs::status(dataDir, ec); + errorContext = errorContext or + reportErrorIf(stat.type() == fs::status_error, protocol::StatusExt::FOLDER_STAT, + "failed to check the status of directory: " + dataDir.string()) or + reportErrorIf(!fs::exists(stat), protocol::StatusExt::NO_FOLDER, + "the directory does not exists: " + dataDir.string()); + try { + for (fs::directory_entry& entry : fs::directory_iterator(dataDir)) { + tuple parsed; + if (FileUtils::parsePartitionedFile(parsed, entry.path().filename().string(), + _databaseInfo)) { + LOGS(_log, LOG_LVL_DEBUG, + CONTEXT << " database: " << _databaseInfo.name + << " file: " << entry.path().filename() << " table: " << get<0>(parsed) + << " chunk: " << get<1>(parsed) << " ext: " << get<2>(parsed)); + + uint64_t const size =
fs::file_size(entry.path(), ec); + errorContext = errorContext or + reportErrorIf(ec.value() != 0, protocol::StatusExt::FILE_SIZE, + "failed to read file size: " + entry.path().string()); + + time_t const mtime = fs::last_write_time(entry.path(), ec); + errorContext = errorContext or + reportErrorIf(ec.value() != 0, protocol::StatusExt::FILE_MTIME, + "failed to read file mtime: " + entry.path().string()); + + unsigned const chunk = get<1>(parsed); + chunk2fileInfoCollection[chunk].emplace_back(ReplicaInfo::FileInfo({ + entry.path().filename().string(), size, mtime, + "", /* cs is never computed for this type of requests */ + 0, /* beginTransferTime */ + 0, /* endTransferTime */ + size /* inSize */ + })); + } + } + } catch (fs::filesystem_error const& ex) { + errorContext = errorContext or reportErrorIf(true, protocol::StatusExt::FOLDER_READ, + "failed to read the directory: " + dataDir.string() + + ", error: " + string(ex.what())); + } + } + if (errorContext.failed) { + setStatus(lock, protocol::Status::FAILED, errorContext.extendedStatus); + return true; + } + + // Analyze results to see which chunks are complete using chunk 0 as an example + // of the total number of files which are normally associated with each chunk. + size_t const numFilesPerChunkRequired = FileUtils::partitionedFiles(_databaseInfo, 0).size(); + for (auto&& entry : chunk2fileInfoCollection) { + unsigned int const chunk = entry.first; + size_t const numFiles = entry.second.size(); + _replicaInfoCollection.emplace_back( + numFiles < numFilesPerChunkRequired ?
ReplicaInfo::INCOMPLETE : ReplicaInfo::COMPLETE, + worker(), _databaseInfo.name, chunk, util::TimeUtils::now(), entry.second); + } + setStatus(lock, protocol::Status::SUCCESS); + return true; +} + +} // namespace lsst::qserv::replica diff --git a/src/replica/worker/WorkerFindAllReplicasHttpRequest.h b/src/replica/worker/WorkerFindAllReplicasHttpRequest.h new file mode 100644 index 0000000000..649cdfc822 --- /dev/null +++ b/src/replica/worker/WorkerFindAllReplicasHttpRequest.h @@ -0,0 +1,101 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see .
+ */ +#ifndef LSST_QSERV_REPLICA_WORKERFINDALLREPLICASHTTPREQUEST_H +#define LSST_QSERV_REPLICA_WORKERFINDALLREPLICASHTTPREQUEST_H + +// System headers +#include +#include + +// Third party headers +#include "nlohmann/json.hpp" + +// Qserv headers +#include "replica/config/ConfigDatabase.h" +#include "replica/util/ReplicaInfo.h" +#include "replica/worker/WorkerHttpRequest.h" + +// Forward declarations +namespace lsst::qserv::replica { +class ServiceProvider; +} // namespace lsst::qserv::replica + +namespace lsst::qserv::replica::protocol { +struct QueuedRequestHdr; +} // namespace lsst::qserv::replica::protocol + +// This header declarations +namespace lsst::qserv::replica { + +/** + * Class WorkerFindAllReplicasHttpRequest represents a context and a state of replicas lookup + * requests within the worker servers. + */ +class WorkerFindAllReplicasHttpRequest : public WorkerHttpRequest { +public: + /** + * Static factory method is needed to prevent issues with the lifespan + * and memory management of instances created otherwise (as values or via + * low-level pointers). + * + * @param serviceProvider provider is needed to access the Configuration + * of a setup and for validating the input parameters + * @param worker the name of a worker. The name must match the worker which + * is going to execute the request.
+ * @param hdr request header (common parameters of the queued request) + * @param req the request object received from a client (request-specific parameters) + * @param onExpired request expiration callback function + * @return pointer to the created object + */ + static std::shared_ptr create( + std::shared_ptr const& serviceProvider, std::string const& worker, + protocol::QueuedRequestHdr const& hdr, nlohmann::json const& req, + ExpirationCallbackType const& onExpired); + + WorkerFindAllReplicasHttpRequest() = delete; + WorkerFindAllReplicasHttpRequest(WorkerFindAllReplicasHttpRequest const&) = delete; + WorkerFindAllReplicasHttpRequest& operator=(WorkerFindAllReplicasHttpRequest const&) = delete; + + ~WorkerFindAllReplicasHttpRequest() override = default; + + bool execute() override; + +protected: + void getResult(nlohmann::json& result) const override; + +private: + WorkerFindAllReplicasHttpRequest(std::shared_ptr const& serviceProvider, + std::string const& worker, protocol::QueuedRequestHdr const& hdr, + nlohmann::json const& req, ExpirationCallbackType const& onExpired); + + // Input parameters + std::string const _database; ///< The "database" attribute of the request + + /// Cached descriptor of the database obtained from the Configuration + DatabaseInfo const _databaseInfo; + + /// Result of the operation (one entry per chunk found in the data directory) + ReplicaInfoCollection _replicaInfoCollection; +}; + +} // namespace lsst::qserv::replica + +#endif // LSST_QSERV_REPLICA_WORKERFINDALLREPLICASHTTPREQUEST_H diff --git a/src/replica/worker/WorkerFindReplicaHttpRequest.cc b/src/replica/worker/WorkerFindReplicaHttpRequest.cc new file mode 100644 index 0000000000..c4e2d728b1 --- /dev/null +++ b/src/replica/worker/WorkerFindReplicaHttpRequest.cc @@ -0,0 +1,233 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/).
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . + */ + +// Class header +#include "replica/worker/WorkerFindReplicaHttpRequest.h" + +// System headers + +// Third party headers +#include "boost/filesystem.hpp" + +// Qserv headers +#include "replica/config/Configuration.h" +#include "replica/services/ServiceProvider.h" +#include "replica/util/FileUtils.h" +#include "util/TimeUtils.h" + +// LSST headers +#include "lsst/log/Log.h" + +#define CONTEXT context("WorkerFindReplicaHttpRequest", __func__) + +using namespace std; +namespace fs = boost::filesystem; +using json = nlohmann::json; + +namespace { + +LOG_LOGGER _log = LOG_GET("lsst.qserv.replica.WorkerFindReplicaHttpRequest"); + +} // namespace + +namespace lsst::qserv::replica { + +shared_ptr WorkerFindReplicaHttpRequest::create( + shared_ptr const& serviceProvider, string const& worker, + protocol::QueuedRequestHdr const& hdr, json const& req, ExpirationCallbackType const& onExpired) { + auto ptr = shared_ptr( + new WorkerFindReplicaHttpRequest(serviceProvider, worker, hdr, req, onExpired)); + ptr->init(); + return ptr; +} + +WorkerFindReplicaHttpRequest::WorkerFindReplicaHttpRequest(shared_ptr const& serviceProvider, + string const& worker, + protocol::QueuedRequestHdr const& hdr, + json const& req, + ExpirationCallbackType const& onExpired) + : WorkerHttpRequest(serviceProvider, worker, "FIND", hdr, req, 
onExpired), + _databaseInfo(serviceProvider->config()->databaseInfo(req.at("database"))), + _chunk(req.at("chunk")), + _computeCheckSum(req.at("compute_cs")) {} + +void WorkerFindReplicaHttpRequest::getResult(json& result) const { + // No locking is needed here since the method is called only after + // the request is completed. + result["replica_info"] = _replicaInfo.toJson(); +} + +bool WorkerFindReplicaHttpRequest::execute() { + LOGS(_log, LOG_LVL_DEBUG, CONTEXT << " database: " << _databaseInfo.name << " chunk: " << _chunk); + + replica::Lock lock(_mtx, CONTEXT); + checkIfCancelling(lock, CONTEXT); + + // There are two modes of operation of the code which would depend + // on a presence (or a lack of that) to calculate control/check sums + // for the found files. + // + // - if the control/check sum is NOT requested then the request will + // be executed immediately within this call. + // + // - otherwise the incremental approach will be used (which will require + // setting up the incremental engine if this is the first call to the method) + // + // Both methods are combined within the same code block to avoid + // code duplication. 
+ WorkerHttpRequest::ErrorContext errorContext; + boost::system::error_code ec; + + if (!_computeCheckSum or (_csComputeEnginePtr == nullptr)) { + // Check if the data directory exists and it can be read + replica::Lock dataFolderLock(_mtxDataFolderOperations, CONTEXT); + fs::path const dataDir = + fs::path(_serviceProvider->config()->get("worker", "data-dir")) / _databaseInfo.name; + fs::file_status const stat = fs::status(dataDir, ec); + errorContext = errorContext or + reportErrorIf(stat.type() == fs::status_error, protocol::StatusExt::FOLDER_STAT, + "failed to check the status of directory: " + dataDir.string()) or + reportErrorIf(!fs::exists(stat), protocol::StatusExt::NO_FOLDER, + "the directory does not exists: " + dataDir.string()); + if (errorContext.failed) { + setStatus(lock, protocol::Status::FAILED, errorContext.extendedStatus); + return true; + } + + // For each file associated with the chunk check if the file is present in + // the data directory. + // + // - not finding a file is not a failure for this operation. Just reporting + // those files which are present. + // + // - assume the request failure for any file system operation failure + // + // - assume the successful completion otherwise and adjust the replica + // information record accordingly, depending on the findings. 
+ ReplicaInfo::FileInfoCollection + fileInfoCollection; // file info if not using the incremental processing + vector files; // file paths registered for the incremental processing + + for (auto&& file : FileUtils::partitionedFiles(_databaseInfo, _chunk)) { + fs::path const path = dataDir / file; + fs::file_status const stat = fs::status(path, ec); + errorContext = errorContext or + reportErrorIf(stat.type() == fs::status_error, protocol::StatusExt::FILE_STAT, + "failed to check the status of file: " + path.string()); + if (fs::exists(stat)) { + if (!_computeCheckSum) { + // Get file size & mtime right away + uint64_t const size = fs::file_size(path, ec); + errorContext = + errorContext or reportErrorIf(ec.value() != 0, protocol::StatusExt::FILE_SIZE, + "failed to read file size: " + path.string()); + const time_t mtime = fs::last_write_time(path, ec); + errorContext = + errorContext or reportErrorIf(ec.value() != 0, protocol::StatusExt::FILE_MTIME, + "failed to read file mtime: " + path.string()); + fileInfoCollection.emplace_back(ReplicaInfo::FileInfo({ + file, size, mtime, "", /* cs */ + 0, /* beginTransferTime */ + 0, /* endTransferTime */ + size /* inSize */ + })); + } else { + // Register this file for the incremental processing + files.push_back(path.string()); + } + } + } + if (errorContext.failed) { + setStatus(lock, protocol::Status::FAILED, errorContext.extendedStatus); + return true; + } + + // If that's so then finalize the operation right away + if (!_computeCheckSum) { + ReplicaInfo::Status status = ReplicaInfo::Status::NOT_FOUND; + if (fileInfoCollection.size()) + status = + FileUtils::partitionedFiles(_databaseInfo, _chunk).size() == fileInfoCollection.size() + ? 
ReplicaInfo::Status::COMPLETE + : ReplicaInfo::Status::INCOMPLETE; + + // Fill in the info on the chunk before finishing the operation + _replicaInfo = ReplicaInfo(status, worker(), _databaseInfo.name, _chunk, util::TimeUtils::now(), + fileInfoCollection); + setStatus(lock, protocol::Status::SUCCESS); + return true; + } + + // Otherwise proceed with the incremental approach + _csComputeEnginePtr.reset(new MultiFileCsComputeEngine(files)); + } + + // Next (or the first) iteration in the incremental approach + bool finished = true; + try { + finished = _csComputeEnginePtr->execute(); + if (finished) { + // Extract statistics + ReplicaInfo::FileInfoCollection fileInfoCollection; + auto const fileNames = _csComputeEnginePtr->fileNames(); + for (auto&& file : fileNames) { + const fs::path path(file); + uint64_t const size = _csComputeEnginePtr->bytes(file); + time_t const mtime = fs::last_write_time(path, ec); + errorContext = errorContext or reportErrorIf(ec.value() != 0, protocol::StatusExt::FILE_MTIME, + "failed to read file mtime: " + path.string()); + fileInfoCollection.emplace_back(ReplicaInfo::FileInfo({ + path.filename().string(), size, mtime, to_string(_csComputeEnginePtr->cs(file)), + 0, /* beginTransferTime */ + 0, /* endTransferTime */ + size /* inSize */ + })); + } + if (errorContext.failed) { + setStatus(lock, protocol::Status::FAILED, errorContext.extendedStatus); + return true; + } + + // Finalize the operation + ReplicaInfo::Status status = ReplicaInfo::Status::NOT_FOUND; + if (fileInfoCollection.size()) + status = FileUtils::partitionedFiles(_databaseInfo, _chunk).size() == fileNames.size() + ? 
ReplicaInfo::Status::COMPLETE + : ReplicaInfo::Status::INCOMPLETE; + + // Fill in the info on the chunk before finishing the operation + _replicaInfo = ReplicaInfo(status, worker(), _databaseInfo.name, _chunk, util::TimeUtils::now(), + fileInfoCollection); + setStatus(lock, protocol::Status::SUCCESS); + } + } catch (exception const& ex) { + WorkerHttpRequest::ErrorContext errorContext; + errorContext = errorContext or reportErrorIf(true, protocol::StatusExt::FILE_READ, ex.what()); + setStatus(lock, protocol::Status::FAILED, errorContext.extendedStatus); + } + + // If done (either way) then get rid of the engine right away because + // it may still have allocated buffers + if (finished) _csComputeEnginePtr.reset(); + return finished; +} + +} // namespace lsst::qserv::replica diff --git a/src/replica/worker/WorkerFindReplicaHttpRequest.h b/src/replica/worker/WorkerFindReplicaHttpRequest.h new file mode 100644 index 0000000000..a0ad3c12f1 --- /dev/null +++ b/src/replica/worker/WorkerFindReplicaHttpRequest.h @@ -0,0 +1,104 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . 
+ */ +#ifndef LSST_QSERV_REPLICA_WORKERFINDREPLICAHTTPREQUEST_H +#define LSST_QSERV_REPLICA_WORKERFINDREPLICAHTTPREQUEST_H + +// System headers +#include +#include + +// Third party headers +#include "nlohmann/json.hpp" + +// Qserv headers +#include "replica/config/ConfigDatabase.h" +#include "replica/util/ReplicaInfo.h" +#include "replica/worker/WorkerHttpRequest.h" + +// Forward declarations +namespace lsst::qserv::replica { +class MultiFileCsComputeEngine; +class ServiceProvider; +} // namespace lsst::qserv::replica + +namespace lsst::qserv::replica::protocol { +struct QueuedRequestHdr; +} // namespace lsst::qserv::replica::protocol + +// This header declarations +namespace lsst::qserv::replica { + +/** + * Class WorkerFindReplicaHttpRequest represents a context and a state of replica lookup + * requests within the worker servers. + */ +class WorkerFindReplicaHttpRequest : public WorkerHttpRequest { +public: + /** + * Static factory method is needed to prevent issue with the lifespan + * and memory management of instances created otherwise (as values or via + * low-level pointers). + * + * @param serviceProvider provider is needed to access the Configuration + * of a setup and for validating the input parameters + * @param worker the name of a worker. The name must match the worker which + * is going to execute the request. 
+ * @param hdr request header (common parameters of the queued request) + * @param req the request object received from a client (request-specific parameters) + * @param onExpired request expiration callback function + * @return pointer to the created object + */ + static std::shared_ptr create( + std::shared_ptr const& serviceProvider, std::string const& worker, + protocol::QueuedRequestHdr const& hdr, nlohmann::json const& req, + ExpirationCallbackType const& onExpired); + + WorkerFindReplicaHttpRequest() = delete; + WorkerFindReplicaHttpRequest(WorkerFindReplicaHttpRequest const&) = delete; + WorkerFindReplicaHttpRequest& operator=(WorkerFindReplicaHttpRequest const&) = delete; + + ~WorkerFindReplicaHttpRequest() override = default; + + bool execute() override; + +protected: + void getResult(nlohmann::json& result) const override; + +private: + WorkerFindReplicaHttpRequest(std::shared_ptr const& serviceProvider, + std::string const& worker, protocol::QueuedRequestHdr const& hdr, + nlohmann::json const& req, ExpirationCallbackType const& onExpired); + + // Input parameters + DatabaseInfo const _databaseInfo; ///< Database descriptor obtained from the Configuration + unsigned int _chunk; + bool const _computeCheckSum; + + /// Result of the operation + ReplicaInfo _replicaInfo; + + /// The engine for incremental control sum calculation + std::unique_ptr _csComputeEnginePtr; +}; + +} // namespace lsst::qserv::replica + +#endif // LSST_QSERV_REPLICA_WORKERFINDREPLICAHTTPREQUEST_H diff --git a/src/replica/worker/WorkerHttpProcessor.cc b/src/replica/worker/WorkerHttpProcessor.cc new file mode 100644 index 0000000000..3952126378 --- /dev/null +++ b/src/replica/worker/WorkerHttpProcessor.cc @@ -0,0 +1,576 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . + */ + +// Class header +#include "replica/worker/WorkerHttpProcessor.h" + +// System headers +#include +#include +#include + +// Qserv headers +#include "replica/config/Configuration.h" +#include "replica/mysql/DatabaseMySQL.h" +#include "replica/services/ServiceProvider.h" +#include "replica/worker/WorkerHttpProcessorThread.h" +#include "replica/worker/WorkerHttpRequest.h" +#include "replica/worker/WorkerCreateReplicaHttpRequest.h" +#include "replica/worker/WorkerDeleteReplicaHttpRequest.h" +#include "replica/worker/WorkerDirectorIndexHttpRequest.h" +#include "replica/worker/WorkerEchoHttpRequest.h" +#include "replica/worker/WorkerFindReplicaHttpRequest.h" +#include "replica/worker/WorkerFindAllReplicasHttpRequest.h" +#include "replica/worker/WorkerSqlHttpRequest.h" +#include "util/BlockPost.h" +#include "util/TimeUtils.h" + +// LSST headers +#include "lsst/log/Log.h" + +using namespace std; +using namespace std::placeholders; +using json = nlohmann::json; + +namespace { +LOG_LOGGER _log = LOG_GET("lsst.qserv.replica.WorkerHttpProcessor"); +} // namespace + +namespace lsst::qserv::replica { + +bool WorkerHttpProcessor::PriorityQueueType::remove(string const& id) { + auto itr = find_if(c.begin(), c.end(), + [&id](shared_ptr const& ptr) { return ptr->id() == id; }); + if (itr != c.end()) { + c.erase(itr); + 
make_heap(c.begin(), c.end(), comp); + return true; + } + return false; +} + +shared_ptr WorkerHttpProcessor::create( + shared_ptr const& serviceProvider, string const& worker) { + return shared_ptr(new WorkerHttpProcessor(serviceProvider, worker)); +} + +WorkerHttpProcessor::WorkerHttpProcessor(shared_ptr const& serviceProvider, + string const& worker) + : _serviceProvider(serviceProvider), + _worker(worker), + _connectionPool(database::mysql::ConnectionPool::create( + Configuration::qservWorkerDbParams(), + serviceProvider->config()->get("database", "services-pool-size"))), + _state(protocol::ServiceState::SUSPENDED), + _startTime(util::TimeUtils::now()) {} + +void WorkerHttpProcessor::run() { + LOGS(_log, LOG_LVL_DEBUG, _context(__func__)); + replica::Lock lock(_mtx, _context(__func__)); + + if (_state == protocol::ServiceState::SUSPENDED) { + size_t const numThreads = + _serviceProvider->config()->get("worker", "num-svc-processing-threads"); + if (numThreads == 0) { + throw out_of_range(_classMethodContext(__func__) + + "invalid configuration parameter for the number of processing threads. " + "The value of the parameter must be greater than 0"); + } + + // Create threads if needed + if (_threads.empty()) { + auto const self = shared_from_this(); + for (size_t i = 0; i < numThreads; ++i) { + _threads.push_back(WorkerHttpProcessorThread::create(self)); + } + } + + // Tell each thread to run + for (auto&& t : _threads) { + t->run(); + } + _state = protocol::ServiceState::RUNNING; + } +} + +void WorkerHttpProcessor::stop() { + LOGS(_log, LOG_LVL_DEBUG, _context(__func__)); + replica::Lock lock(_mtx, _context(__func__)); + + if (_state == protocol::ServiceState::RUNNING) { + // Tell each thread to stop. + for (auto&& t : _threads) { + t->stop(); + } + + // Begin transitioning to the final state via this intermediate one. + // The transition will finish asynchronously when all threads report + // desired changes in their states. 
+ _state = protocol::ServiceState::SUSPEND_IN_PROGRESS; + } +} + +void WorkerHttpProcessor::drain() { + LOGS(_log, LOG_LVL_DEBUG, _context(__func__)); + replica::Lock lock(_mtx, _context(__func__)); + + // Collect identifiers of requests to be affected by the operation + list ids; + for (auto&& ptr : _newRequests) ids.push_back(ptr->id()); + for (auto&& entry : _inProgressRequests) ids.push_back(entry.first); + for (auto&& id : ids) _stopRequestImpl(lock, id); +} + +void WorkerHttpProcessor::reconfig() { + string const context = _context(__func__); + LOGS(_log, LOG_LVL_DEBUG, context); + replica::Lock lock(_mtx, context); + _serviceProvider->config()->reload(); +} + +json WorkerHttpProcessor::createReplica(protocol::QueuedRequestHdr const& hdr, json const& req) { + string const context = _context(__func__); + LOGS(_log, LOG_LVL_DEBUG, context << " id: " << hdr.id); + return _submit(replica::Lock(_mtx, context), context, hdr, req); +} + +json WorkerHttpProcessor::deleteReplica(protocol::QueuedRequestHdr const& hdr, json const& req) { + string const context = _context(__func__); + LOGS(_log, LOG_LVL_DEBUG, context << " id: " << hdr.id); + return _submit(replica::Lock(_mtx, context), context, hdr, req); +} + +json WorkerHttpProcessor::findReplica(protocol::QueuedRequestHdr const& hdr, json const& req) { + string const context = _context(__func__); + LOGS(_log, LOG_LVL_DEBUG, context << " id: " << hdr.id); + return _submit(replica::Lock(_mtx, context), context, hdr, req); +} + +json WorkerHttpProcessor::findAllReplicas(protocol::QueuedRequestHdr const& hdr, json const& req) { + string const context = _context(__func__); + LOGS(_log, LOG_LVL_DEBUG, context << " id: " << hdr.id); + return _submit(replica::Lock(_mtx, context), context, hdr, req); +} + +json WorkerHttpProcessor::echo(protocol::QueuedRequestHdr const& hdr, json const& req) { + string const context = _context(__func__); + LOGS(_log, LOG_LVL_DEBUG, context << " id: " << hdr.id); + return 
_submit(replica::Lock(_mtx, context), context, hdr, req); +} + +json WorkerHttpProcessor::sql(protocol::QueuedRequestHdr const& hdr, json const& req) { + string const context = _context(__func__); + LOGS(_log, LOG_LVL_DEBUG, context << " id: " << hdr.id); + return _submit(replica::Lock(_mtx, context), context, hdr, req); +} + +json WorkerHttpProcessor::index(protocol::QueuedRequestHdr const& hdr, json const& req) { + string const context = _context(__func__); + LOGS(_log, LOG_LVL_DEBUG, context << " id: " << hdr.id); + return _submit(replica::Lock(_mtx, context), context, hdr, req, + _connectionPool); +} + +json WorkerHttpProcessor::requestStatus(string const& id) { + string const context = _context(__func__); + LOGS(_log, LOG_LVL_DEBUG, context << " id: " << id); + + replica::Lock lock(_mtx, context); + + // Still waiting in the queue? + shared_ptr targetRequestPtr; + for (auto ptr : _newRequests) { + if (ptr->id() == id) { + targetRequestPtr = ptr; + break; + } + } + if (targetRequestPtr == nullptr) { + // Is it already being processed? + auto itrInProgress = _inProgressRequests.find(id); + if (itrInProgress != _inProgressRequests.end()) { + targetRequestPtr = itrInProgress->second; + } + if (targetRequestPtr == nullptr) { + // Has it finished? + auto itrFinished = _finishedRequests.find(id); + if (itrFinished != _finishedRequests.end()) { + targetRequestPtr = itrFinished->second; + } + // No such request? 
+ if (targetRequestPtr == nullptr) { + return json::object( + {{"status", protocol::Status::BAD}, + {"status_str", protocol::toString(protocol::Status::BAD)}, + {"status_ext", protocol::StatusExt::INVALID_ID}, + {"status_ext_str", protocol::toString(protocol::StatusExt::INVALID_ID)}}); + } + } + } + return targetRequestPtr->toJson(); +} + +json WorkerHttpProcessor::stopRequest(string const& id) { + string const context = _context(__func__); + LOGS(_log, LOG_LVL_DEBUG, context << " id: " << id); + + replica::Lock lock(_mtx, context); + + json response = json::object(); + auto const request = _stopRequestImpl(lock, id); + if (request == nullptr) { + response["status"] = protocol::Status::BAD; + response["status_str"] = protocol::toString(protocol::Status::BAD); + response["status_ext"] = protocol::StatusExt::INVALID_ID; + response["status_ext_str"] = protocol::toString(protocol::StatusExt::INVALID_ID); + } else { + response = request->toJson(); + } + return response; +} + +json WorkerHttpProcessor::trackRequest(string const& id) { + string const context = _context(__func__); + LOGS(_log, LOG_LVL_DEBUG, context << " id: " << id); + + replica::Lock lock(_mtx, context); + + json response = json::object(); + auto const request = _trackRequestImpl(lock, id); + if (request == nullptr) { + response["status"] = protocol::Status::BAD; + response["status_str"] = protocol::toString(protocol::Status::BAD); + response["status_ext"] = protocol::StatusExt::INVALID_ID; + response["status_ext_str"] = protocol::toString(protocol::StatusExt::INVALID_ID); + } else { + bool const includeResultIfFinished = true; + response = request->toJson(includeResultIfFinished); + } + return response; +} + +bool WorkerHttpProcessor::disposeRequest(string const& id) { + string const context = _context(__func__); + LOGS(_log, LOG_LVL_DEBUG, context << " id: " << id); + + replica::Lock lock(_mtx, context); + + // Note that only the finished requests are allowed to be disposed. 
+ if (auto itr = _finishedRequests.find(id); itr != _finishedRequests.end()) { + itr->second->dispose(); + _finishedRequests.erase(itr); + return true; + } + return false; +} + +size_t WorkerHttpProcessor::numNewRequests() const { + replica::Lock lock(_mtx, _context(__func__)); + return _newRequests.size(); +} + +size_t WorkerHttpProcessor::numInProgressRequests() const { + replica::Lock lock(_mtx, _context(__func__)); + return _inProgressRequests.size(); +} + +size_t WorkerHttpProcessor::numFinishedRequests() const { + replica::Lock lock(_mtx, _context(__func__)); + return _finishedRequests.size(); +} + +json WorkerHttpProcessor::toJson(protocol::Status status, bool includeRequests) { + string const context = _context(__func__); + LOGS(_log, LOG_LVL_DEBUG, context); + + replica::Lock lock(_mtx, context); + + json response; + response["status"] = status; + response["status_str"] = protocol::toString(status); + response["status_ext"] = protocol::StatusExt::NONE; + response["status_ext_str"] = protocol::toString(protocol::StatusExt::NONE); + response["service_state"] = state(); + response["service_state_str"] = protocol::toString(state()); + response["num_new_requests"] = _newRequests.size(); + response["num_in_progress_requests"] = _inProgressRequests.size(); + response["num_finished_requests"] = _finishedRequests.size(); + response["new_requests"] = json::array(); + response["in_progress_requests"] = json::array(); + response["finished_requests"] = json::array(); + + if (includeRequests) { + for (auto const& request : _newRequests) { + response["new_requests"].push_back(request->toJson()); + } + for (auto const& entry : _inProgressRequests) { + response["in_progress_requests"].push_back(entry.second->toJson()); + } + for (auto const& entry : _finishedRequests) { + response["finished_requests"].push_back(entry.second->toJson()); + } + } + return response; +} + +string WorkerHttpProcessor::_classMethodContext(string const& func) { return "WorkerHttpProcessor::" + 
func; } + +void WorkerHttpProcessor::_logError(string const& context, string const& message) const { + LOGS(_log, LOG_LVL_ERROR, context << " " << message); +} + +shared_ptr WorkerHttpProcessor::_stopRequestImpl(replica::Lock const& lock, + string const& id) { + LOGS(_log, LOG_LVL_DEBUG, _context(__func__) << " id: " << id); + + // Still waiting in the queue? + // + // ATTENTION: the loop variable is a copy of (not a reference to) a shared + // pointer to allow removing (if needed) the corresponding entry from the + // input collection while retaining a valid copy of the pointer to be placed + // into the next stage collection. + + for (auto ptr : _newRequests) { + if (ptr->id() == id) { + // Cancel it and move it into the final queue in case if a client + // won't be able to receive the desired status of the request due to + // a protocol failure, etc. + ptr->cancel(); + switch (ptr->status()) { + case protocol::Status::CANCELLED: { + _newRequests.remove(id); + _finishedRequests[ptr->id()] = ptr; + return ptr; + } + default: + throw logic_error(_classMethodContext(__func__) + " unexpected request status " + + protocol::toString(ptr->status()) + " in new requests"); + } + } + } + + // Is it already being processed? + auto itrInProgress = _inProgressRequests.find(id); + if (itrInProgress != _inProgressRequests.end()) { + auto ptr = itrInProgress->second; + // Tell the request to begin the cancelling protocol. The protocol + // will take care of moving the request into the final queue when + // the cancellation will finish. + // + // In the meantime we just notify the client about the cancellation status + // of the request and let it come back later to check the updated status. 
+ ptr->cancel(); + switch (ptr->status()) { + // These are the most typical states for request in this queue + case protocol::Status::CANCELLED: + case protocol::Status::IS_CANCELLING: + + // The following two states are also allowed here because + // in-progress requests are still allowed to progress to the completed + // states before reporting their new state via method: + // WorkerHttpProcessor::_processingFinished() + // Sometimes, the request just can't finish this in time due to + // replica::Lock lock(_mtx) held by the current method. We shouldn't worry + // about this situation here. The request will be moved into the next + // queue as soon as replica::Lock lock(_mtx) will be released. + case protocol::Status::SUCCESS: + case protocol::Status::FAILED: + return ptr; + default: + throw logic_error(_classMethodContext(__func__) + " unexpected request status " + + protocol::toString(ptr->status()) + " in in-progress requests"); + } + } + + // Has it finished? + auto itrFinished = _finishedRequests.find(id); + if (itrFinished != _finishedRequests.end()) { + auto ptr = itrFinished->second; + // There is nothing else we can do here other than just + // reporting the completion status of the request. It's up to a client + // to figure out what to do about this situation. + switch (ptr->status()) { + case protocol::Status::CANCELLED: + case protocol::Status::SUCCESS: + case protocol::Status::FAILED: + return ptr; + default: + throw logic_error(_classMethodContext(__func__) + " unexpected request status " + + protocol::toString(ptr->status()) + " in finished requests"); + } + } + + // No request found! + return nullptr; +} + +shared_ptr WorkerHttpProcessor::_trackRequestImpl(replica::Lock const& lock, + string const& id) { + LOGS(_log, LOG_LVL_DEBUG, _context(__func__) << " id: " << id); + + // Still waiting in the queue? 
+ for (auto&& ptr : _newRequests) { + if (ptr->id() == id) { + switch (ptr->status()) { + // This state requirement is strict for the non-active requests + case protocol::Status::CREATED: + return ptr; + default: + throw logic_error(_classMethodContext(__func__) + " unexpected request status " + + protocol::toString(ptr->status()) + " in new requests"); + } + } + } + + // Is it already being processed? + auto itrInProgress = _inProgressRequests.find(id); + if (itrInProgress != _inProgressRequests.end()) { + auto ptr = itrInProgress->second; + switch (ptr->status()) { + // These are the most typical states for request in this queue + case protocol::Status::IS_CANCELLING: + case protocol::Status::IN_PROGRESS: + + // The following three states are also allowed here because + // in-progress requests are still allowed to progress to the completed + // states before reporting their new state via method: + // WorkerHttpProcessor::_processingFinished() + // Sometimes, the request just can't finish this in time due to + // replica::Lock lock(_mtx) held by the current method. We shouldn't worry + // about this situation here. The request will be moved into the next + // queue as soon as replica::Lock lock(_mtx) will be released. + case protocol::Status::CANCELLED: + case protocol::Status::SUCCESS: + case protocol::Status::FAILED: + return ptr; + default: + throw logic_error(_classMethodContext(__func__) + " unexpected request status " + + protocol::toString(ptr->status()) + " in in-progress requests"); + } + } + + // Has it finished? 
+ auto itrFinished = _finishedRequests.find(id); + if (itrFinished != _finishedRequests.end()) { + auto ptr = itrFinished->second; + switch (ptr->status()) { + // This state requirement is strict for the completed requests + case protocol::Status::CANCELLED: + case protocol::Status::SUCCESS: + case protocol::Status::FAILED: + return ptr; + default: + throw logic_error(_classMethodContext(__func__) + " unexpected request status " + + protocol::toString(ptr->status()) + " in finished requests"); + } + } + + // No request found! + return nullptr; +} + +shared_ptr WorkerHttpProcessor::_fetchNextForProcessing( + shared_ptr const& processorThread, unsigned int timeoutMilliseconds) { + string const context = _context(__func__); + LOGS(_log, LOG_LVL_TRACE, + context << " thread: " << processorThread->id() << " timeout: " << timeoutMilliseconds); + + // For generating random intervals within the maximum range of seconds + // requested by a client. + // + // TODO: Re-implement this loop to use a condition variable instead. + // This will improve the performance of the processor which is limited + // by the half-latency of the wait interval. + util::BlockPost blockPost(0, min(10U, timeoutMilliseconds)); + + unsigned int totalElapsedTime = 0; + while (totalElapsedTime < timeoutMilliseconds) { + // IMPORTANT: make sure no wait is happening within the same + // scope where the thread safe block is defined. Otherwise + // the queue will be locked for all threads for the duration of + // the wait. + { + replica::Lock lock(_mtx, context); + if (!_newRequests.empty()) { + shared_ptr request = _newRequests.top(); + _newRequests.pop(); + request->start(); + _inProgressRequests[request->id()] = request; + return request; + } + } + totalElapsedTime += blockPost.wait(); + } + + // Return null pointer since nothing has been found within the specified + // timeout. 
+ return nullptr; +} + +void WorkerHttpProcessor::_processingRefused(shared_ptr const& request) { + string const context = _context(__func__); + LOGS(_log, LOG_LVL_DEBUG, context << " id: " << request->id()); + + replica::Lock lock(_mtx, context); + + // Note that disposed requests won't be found in any queue. + auto itr = _inProgressRequests.find(request->id()); + if (itr != _inProgressRequests.end()) { + // Update request's state before moving it back into + // the input queue. + itr->second->stop(); + _newRequests.push(itr->second); + _inProgressRequests.erase(itr); + } +} + +void WorkerHttpProcessor::_processingFinished(shared_ptr const& request) { + string const context = _context(__func__); + LOGS(_log, LOG_LVL_DEBUG, + context << " id: " << request->id() << " status: " << protocol::toString(request->status())); + + replica::Lock lock(_mtx, context); + + // Note that disposed requests won't be found in any queue. + auto itr = _inProgressRequests.find(request->id()); + if (itr != _inProgressRequests.end()) { + _finishedRequests[itr->first] = itr->second; + _inProgressRequests.erase(itr); + } +} + +void WorkerHttpProcessor::_processorThreadStopped( + shared_ptr const& processorThread) { + string const context = _context(__func__); + LOGS(_log, LOG_LVL_DEBUG, context << " thread: " << processorThread->id()); + + replica::Lock lock(_mtx, context); + + if (_state == protocol::ServiceState::SUSPEND_IN_PROGRESS) { + // Complete state transition if all threads are stopped + for (auto&& t : _threads) { + if (t->isRunning()) return; + } + _state = protocol::ServiceState::SUSPENDED; + } +} + +} // namespace lsst::qserv::replica diff --git a/src/replica/worker/WorkerHttpProcessor.h b/src/replica/worker/WorkerHttpProcessor.h new file mode 100644 index 0000000000..60b80a8d96 --- /dev/null +++ b/src/replica/worker/WorkerHttpProcessor.h @@ -0,0 +1,367 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project 
(http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . + */ +#ifndef LSST_QSERV_REPLICA_WORKERHTTPPROCESSOR_H +#define LSST_QSERV_REPLICA_WORKERHTTPPROCESSOR_H + +// System headers +#include +#include +#include +#include +#include +#include +#include +#include + +// Qserv headers +#include "replica/proto/Protocol.h" +#include "replica/util/Mutex.h" +#include "replica/worker/WorkerHttpRequest.h" + +// Third party headers +#include "nlohmann/json.hpp" + +// Forward declarations + +namespace lsst::qserv::replica { +class ServiceProvider; +class WorkerHttpProcessorThread; +} // namespace lsst::qserv::replica + +namespace lsst::qserv::replica::database::mysql { +class ConnectionPool; +} // namespace lsst::qserv::replica::database::mysql + +// This header declarations +namespace lsst::qserv::replica { + +/** + * Class WorkerHttpProcessor is a front-end interface for processing + * requests from remote clients within worker-side services. + */ +class WorkerHttpProcessor : public std::enable_shared_from_this { +public: + // The thread-based processor class is allowed to access the internal API + friend class WorkerHttpProcessorThread; + + /** + * Structure PriorityQueueType extends the standard priority queue for pointers + * to the new (unprocessed) requests. 
+ * + * Its design relies upon the inheritance to get access to the protected + * data members 'c' representing the internal container of the base queue + * in order to implement the iterator protocol. + */ + struct PriorityQueueType + : std::priority_queue, + std::vector>, WorkerHttpRequestCompare> { + /// @return iterator to the beginning of the container + decltype(c.begin()) begin() { return c.begin(); } + + /// @return iterator to the end of the container + decltype(c.end()) end() { return c.end(); } + + /** + * Remove a request from the queue by its identifier + * @param id an identifier of a request + * @return 'true' if the object was actually removed + */ + bool remove(std::string const& id); + }; + + /** + * The factory method for objects of the class + * + * @param serviceProvider provider is needed to access the Configuration of + * a setup in order to get a number of the processing threads to be launched + * by the processor. + * @param worker the name of a worker + * @return a pointer to the created object + */ + static std::shared_ptr create( + std::shared_ptr const& serviceProvider, std::string const& worker); + + WorkerHttpProcessor() = delete; + WorkerHttpProcessor(WorkerHttpProcessor const&) = delete; + WorkerHttpProcessor& operator=(WorkerHttpProcessor const&) = delete; + + ~WorkerHttpProcessor() = default; + + /// @return the state of the processor + protocol::ServiceState state() const { return _state; } + + /// Begin processing requests + void run(); + + /// Stop processing all requests, and stop all threads + void stop(); + + /// Drain (cancel) all queued and in-progress requests + void drain(); + + /// Reload Configuration + void reconfig(); + + /** + * Enqueue the replica creation request for processing + * @param hdr request header (common parameters of the queued request) + * @param req the request object received from a client (request-specific parameters) + * @return the response object to be sent back to a client + */ + nlohmann::json 
createReplica(protocol::QueuedRequestHdr const& hdr, nlohmann::json const& req); + + /** + * Enqueue the replica deletion request for processing + * @param hdr request header (common parameters of the queued request) + * @param req the request object received from a client (request-specific parameters) + * @return the response object to be sent back to a client + */ + nlohmann::json deleteReplica(protocol::QueuedRequestHdr const& hdr, nlohmann::json const& req); + + /** + * Enqueue the replica lookup request for processing + * @param hdr request header (common parameters of the queued request) + * @param req the request object received from a client (request-specific parameters) + * @return the response object to be sent back to a client + */ + nlohmann::json findReplica(protocol::QueuedRequestHdr const& hdr, nlohmann::json const& req); + + /** + * Enqueue the multi-replica lookup request for processing + * @param hdr request header (common parameters of the queued request) + * @param req the request object received from a client (request-specific parameters) + * @return the response object to be sent back to a client + */ + nlohmann::json findAllReplicas(protocol::QueuedRequestHdr const& hdr, nlohmann::json const& req); + + /** + * Enqueue the worker-side testing request for processing + * @param hdr request header (common parameters of the queued request) + * @param req the request object received from a client (request-specific parameters) + * @return the response object to be sent back to a client + */ + nlohmann::json echo(protocol::QueuedRequestHdr const& hdr, nlohmann::json const& req); + + /** + * Enqueue a request for querying the worker database + * @param hdr request header (common parameters of the queued request) + * @param req the request object received from a client (request-specific parameters) + * @return the response object to be sent back to a client + */ + nlohmann::json sql(protocol::QueuedRequestHdr const& hdr, nlohmann::json const& req); + + 
/** + * Enqueue a request for extracting the "director" index data from + * the director tables. + * @param hdr request header (common parameters of the queued request) + * @param req the request object received from a client (request-specific parameters) + * @return the response object to be sent back to a client + */ + nlohmann::json index(protocol::QueuedRequestHdr const& hdr, nlohmann::json const& req); + + /** + * Get a status of the request + * @param id an identifier of a request affected by the operation + * @return the response object to be sent back to a client + */ + nlohmann::json requestStatus(std::string const& id); + + /** + * Dequeue replication request + * @note If the request is not being processed yet then it will be simply removed + * from the ready-to-be-processed queue. If it's being processed an attempt + * to cancel processing will be made. If it has already been processed this will + * be reported. + * @param id an identifier of a request affected by the operation + * @return the response object to be sent back to a client + */ + nlohmann::json stopRequest(std::string const& id); + + /** + * Return the tracking info on the on-going request + * @param id an identifier of a request affected by the operation + * @return the response object to be sent back to a client + */ + nlohmann::json trackRequest(std::string const& id); + + /** + * Find the request in any queue, and "garbage collect" it to release resources + * associated with the request. If the request is still in the "in-progress" + * state then it will be "drained" before disposing. If the request isn't found + * in any queue then nothing will happen (no exception thrown, no side effects).
+ * + * @param id an identifier of a request affected by the operation + * @return 'true' if the request was found and actually removed from any queue + */ + bool disposeRequest(std::string const& id); + + size_t numNewRequests() const; + size_t numInProgressRequests() const; + size_t numFinishedRequests() const; + + /** + * Capture the processor's state and counters. + * @param status desired status to set in the response object + * @param includeRequests (optional) flag to return detailed info on all known requests + * @return the response object to be sent back to a client + */ + nlohmann::json toJson(protocol::Status status, bool includeRequests = false); + +private: + WorkerHttpProcessor(std::shared_ptr const& serviceProvider, std::string const& worker); + + static std::string _classMethodContext(std::string const& func); + + /** + * Submit a request for processing + * @param lock a lock on _mtx to be acquired before calling this method + * @param context the logging context (including the name of a function/method) + * @param hdr request header (common parameters of the queued request) + * @param req the request object received from a client (request-specific parameters) + * @return the response object to be sent back to a client + */ + template + nlohmann::json _submit(replica::Lock const& lock, std::string const& context, + protocol::QueuedRequestHdr const& hdr, nlohmann::json const& req, Args...
args) { + try { + auto const ptr = REQUEST_TYPE::create( + _serviceProvider, _worker, hdr, req, + [self = shared_from_this()](std::string const& id) { self->disposeRequest(id); }, + args...); + _newRequests.push(ptr); + return ptr->toJson(); + } catch (std::exception const& ec) { + _logError(context, ec.what()); + return nlohmann::json::object( + {{"status", protocol::Status::BAD}, + {"status_str", protocol::toString(protocol::Status::BAD)}, + {"status_ext", protocol::StatusExt::INVALID_PARAM}, + {"status_ext_str", protocol::toString(protocol::StatusExt::INVALID_PARAM)}}); + } + } + + /** + * Log the error message. + * @param context the logging context (including the name of a function/method) + * @param message the error message to be reported + */ + void _logError(std::string const& context, std::string const& message) const; + + /** + * Return the next request which is ready to be processed + * and, if one is found, assign it to the specified thread. The request + * will be removed from the ready-to-be-processed queue. + * + * If one is available within the specified timeout then such a request + * will be moved into the in-progress queue, assigned to the processor thread + * and returned to a caller. Otherwise an empty pointer (pointing to nullptr) + * will be returned. + * + * This method is supposed to be called by one of the processing threads + * when it becomes available. + * + * @note this method will block for a duration of time not exceeding + * the client-specified timeout unless it's set to 0. In the latter + * case the method will block indefinitely.
+ * @param processorThread reference to a thread which fetches the next request + * @param timeoutMilliseconds (optional) amount of time to wait before finishing if + * no suitable requests are available for processing + */ + std::shared_ptr _fetchNextForProcessing( + std::shared_ptr const& processorThread, + unsigned int timeoutMilliseconds = 0); + + /** + * Implement the operation for the specified identifier if such request + * is still known to the Processor. Return a reference to the request object + * whose state will be properly updated. + * @param lock a lock on _mtx to be acquired before calling this method + * @param id an identifier of a request + * @return the request object (if found) or nullptr otherwise + */ + std::shared_ptr _stopRequestImpl(replica::Lock const& lock, std::string const& id); + + /** + * Find and return a reference to the request object. + * @param lock a lock on _mtx to be acquired before calling this method + * @param id an identifier of a request + * @return the request object (if found) or nullptr otherwise + */ + std::shared_ptr _trackRequestImpl(replica::Lock const& lock, std::string const& id); + + /** + * Report a decision not to process a request + * + * This method is supposed to be called by one of the processing threads + * after it fetches the next ready-to-process request and then decides + * not to proceed with processing. Normally this should happen when + * the thread was asked to stop. In that case the request will be put + * back into the ready-to-be-processed queue and be picked up later + * by some other thread. + * + * @param request a pointer to the request + */ + void _processingRefused(std::shared_ptr const& request); + + /** + * Report a request which has been processed or cancelled. + * + * The method is called by a thread which was processing the request. + * The request will be moved into the corresponding queue. A proper + * completion status is expected to be stored within the request.
+ * + * @param request a pointer to the request + */ + void _processingFinished(std::shared_ptr const& request); + + /** + * For threads reporting their completion + * + * This method is used by threads to report a change in their state. + * It's meant to be used during the gradual and asynchronous state transition + * of this processor from the combined State::STATE_IS_STOPPING to + * State::STATE_IS_STOPPED. The latter is achieved when all threads are stopped. + * + * @param processorThread reference to the processing thread which finished + */ + void _processorThreadStopped(std::shared_ptr const& processorThread); + + std::string _context(std::string const& func = std::string()) const { return "PROCESSOR " + func; } + + std::shared_ptr const _serviceProvider; + std::string const _worker; + std::shared_ptr const _connectionPool; + + protocol::ServiceState _state; + uint64_t _startTime; /// When the processor started (milliseconds since UNIX Epoch) + + std::vector> _threads; + + mutable replica::Mutex _mtx; /// Mutex guarding the queues + + PriorityQueueType _newRequests; + std::map> _inProgressRequests; + std::map> _finishedRequests; +}; + +} // namespace lsst::qserv::replica + +#endif // LSST_QSERV_REPLICA_WORKERHTTPPROCESSOR_H diff --git a/src/replica/worker/WorkerHttpProcessorThread.cc b/src/replica/worker/WorkerHttpProcessorThread.cc new file mode 100644 index 0000000000..c2cd307d94 --- /dev/null +++ b/src/replica/worker/WorkerHttpProcessorThread.cc @@ -0,0 +1,121 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . + */ + +// Class header +#include "replica/worker/WorkerHttpProcessorThread.h" + +// System headers +#include + +// Qserv headers +#include "replica/proto/Protocol.h" +#include "replica/worker/WorkerHttpProcessor.h" +#include "replica/worker/WorkerHttpRequest.h" + +// LSST headers +#include "lsst/log/Log.h" + +using namespace std; + +namespace { + +LOG_LOGGER _log = LOG_GET("lsst.qserv.replica.WorkerHttpProcessorThread"); + +} // namespace + +namespace lsst::qserv::replica { + +shared_ptr WorkerHttpProcessorThread::create( + shared_ptr const& processor) { + static unsigned int id = 0; + return shared_ptr(new WorkerHttpProcessorThread(processor, id++)); +} + +WorkerHttpProcessorThread::WorkerHttpProcessorThread(shared_ptr const& processor, + unsigned int id) + : _processor(processor), _id(id), _stop(false) {} + +bool WorkerHttpProcessorThread::isRunning() const { return _thread != nullptr; } + +void WorkerHttpProcessorThread::run() { + if (isRunning()) return; + + _thread = make_unique([self = shared_from_this()]() { + LOGS(_log, LOG_LVL_DEBUG, self->context() << "start"); + while (!self->_stop) { + // Get the next request to process if any. This operation will block + // until either the next request is available (returned a valid pointer) + // or the specified timeout expires. In either case this thread has a chance + // to re-evaluate the stopping condition. 
+ auto const request = self->_processor->_fetchNextForProcessing(self, 1000); + if (self->_stop) { + if (request) self->_processor->_processingRefused(request); + continue; + } + if (request) { + LOGS(_log, LOG_LVL_DEBUG, + self->context() << "begin processing" + << " id: " << request->id()); + bool finished = false; // just to report the request completion + try { + while (!(finished = request->execute())) { + if (self->_stop) { + LOGS(_log, LOG_LVL_DEBUG, + self->context() << "rollback processing" + << " id: " << request->id()); + request->rollback(); + self->_processor->_processingRefused(request); + break; + } + } + } catch (WorkerHttpRequestCancelled const& ex) { + LOGS(_log, LOG_LVL_DEBUG, + self->context() << "cancel processing" + << " id: " << request->id()); + self->_processor->_processingFinished(request); + } + if (finished) { + LOGS(_log, LOG_LVL_DEBUG, + self->context() << "finish processing" + << " id: " << request->id() + << " status: " << protocol::toString(request->status())); + self->_processor->_processingFinished(request); + } + } + } + LOGS(_log, LOG_LVL_DEBUG, self->context() << "stop"); + + self->_stopped(); + }); + _thread->detach(); +} + +void WorkerHttpProcessorThread::stop() { + if (isRunning()) _stop = true; +} + +void WorkerHttpProcessorThread::_stopped() { + _stop = false; + _thread.reset(nullptr); + _processor->_processorThreadStopped(shared_from_this()); +} + +} // namespace lsst::qserv::replica diff --git a/src/replica/worker/WorkerHttpProcessorThread.h b/src/replica/worker/WorkerHttpProcessorThread.h new file mode 100644 index 0000000000..388a30fafb --- /dev/null +++ b/src/replica/worker/WorkerHttpProcessorThread.h @@ -0,0 +1,113 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . + */ +#ifndef LSST_QSERV_REPLICA_WORKERHTTPPROCESSORTHREAD_H +#define LSST_QSERV_REPLICA_WORKERHTTPPROCESSORTHREAD_H + +// System headers +#include +#include +#include +#include + +// Forward declarations +namespace lsst::qserv::replica { +class WorkerHttpProcessor; +} // namespace lsst::qserv::replica + +// This header declarations +namespace lsst::qserv::replica { + +/** + * Class WorkerHttpProcessorThread is a thread-based request processing engine + * for replication requests within worker-side services. + */ +class WorkerHttpProcessorThread : public std::enable_shared_from_this { +public: + /** + * Static factory method is needed to prevent issue with the lifespan + * and memory management of instances created otherwise (as values or via + * low-level pointers). + * + * @param processor A pointer to the processor which launched this thread. This pointer + * will be used for making call backs to the processor on the completed or rejected requests. 
+ * @return a pointer to the created object + */ + static std::shared_ptr create( + std::shared_ptr const& processor); + + WorkerHttpProcessorThread() = delete; + WorkerHttpProcessorThread(WorkerHttpProcessorThread const&) = delete; + WorkerHttpProcessorThread& operator=(WorkerHttpProcessorThread const&) = delete; + + ~WorkerHttpProcessorThread() = default; + + /// @return identifier of this thread object + unsigned int id() const { return _id; } + + /// @return 'true' if the processing thread is still running + bool isRunning() const; + + /** + * Create and run the thread (if none is still running) fetching + * and processing requests until method stop() is called. + */ + void run(); + + /** + * Tell the running thread to abort processing the current + * request (if any), put that request back into the input queue, + * stop fetching new requests and finish. The thread can be resumed + * later by calling method run(). + * + * @note This is an asynchronous operation. + */ + void stop(); + + /// @return context string for logs + std::string context() const { return "THREAD: " + std::to_string(_id) + " "; } + +private: + /// @see WorkerHttpProcessorThread::create() + WorkerHttpProcessorThread(std::shared_ptr const& processor, unsigned int id); + + /** + * Event handler called by the thread when it's about to stop + */ + void _stopped(); + + // Input parameters + + std::shared_ptr const _processor; + + /// The identifier of this thread object + unsigned int const _id; + + /// The processing thread is created on demand when calling method run() + std::unique_ptr _thread; + + /// The flag to be raised to tell the running thread to stop. + /// The thread will reset this flag when it finishes. 
+ std::atomic _stop; +}; + +} // namespace lsst::qserv::replica + +#endif // LSST_QSERV_REPLICA_WORKERHTTPPROCESSORTHREAD_H diff --git a/src/replica/worker/WorkerHttpRequest.cc b/src/replica/worker/WorkerHttpRequest.cc new file mode 100644 index 0000000000..153c0b257d --- /dev/null +++ b/src/replica/worker/WorkerHttpRequest.cc @@ -0,0 +1,277 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . 
+ */ + +// Class header +#include "replica/worker/WorkerHttpRequest.h" + +// System headers +#include + +// Third party headers +#include "boost/date_time/posix_time/posix_time.hpp" + +// Qserv headers +#include "replica/config/Configuration.h" +#include "replica/services/ServiceProvider.h" + +// LSST headers +#include "lsst/log/Log.h" + +#define CONTEXT context("WorkerHttpRequest", __func__) + +using namespace std; +using namespace std::placeholders; +using json = nlohmann::json; + +namespace { +LOG_LOGGER _log = LOG_GET("lsst.qserv.replica.WorkerHttpRequest"); +} // namespace + +namespace lsst::qserv::replica { + +replica::Mutex WorkerHttpRequest::_mtxDataFolderOperations; + +atomic WorkerHttpRequest::_numInstances{0}; + +WorkerHttpRequest::WorkerHttpRequest(shared_ptr const& serviceProvider, string const& worker, + string const& type, protocol::QueuedRequestHdr const& hdr, + json const& req, ExpirationCallbackType const& onExpired) + : _serviceProvider(serviceProvider), + _worker(worker), + _type(type), + _hdr(hdr), + _req(req), + _onExpired(onExpired), + _expirationTimeoutSec(hdr.timeout == 0 ? 
serviceProvider->config()->get( + "controller", "request-timeout-sec") + : hdr.timeout), + _expirationTimer(serviceProvider->io_service()), + _status(protocol::Status::CREATED), + _extendedStatus(protocol::StatusExt::NONE), + _performance() { + _numInstances++; + LOGS(_log, LOG_LVL_TRACE, CONTEXT << " numInstances: " << _numInstances); +} + +WorkerHttpRequest::~WorkerHttpRequest() { + _numInstances--; + LOGS(_log, LOG_LVL_TRACE, CONTEXT << " numInstances: " << _numInstances); + dispose(); +} + +void WorkerHttpRequest::checkIfCancelling(replica::Lock const& lock, string const& context_) { + switch (status()) { + case protocol::Status::IN_PROGRESS: + break; + case protocol::Status::IS_CANCELLING: + setStatus(lock, protocol::Status::CANCELLED); + throw WorkerHttpRequestCancelled(); + default: + throw logic_error(CONTEXT + " not allowed while in status: " + protocol::toString(status())); + } +} + +WorkerHttpRequest::ErrorContext WorkerHttpRequest::reportErrorIf(bool errorCondition, + protocol::StatusExt extendedStatus, + string const& errorMsg) { + WorkerHttpRequest::ErrorContext errorContext; + if (errorCondition) { + errorContext.failed = true; + errorContext.extendedStatus = extendedStatus; + LOGS(_log, LOG_LVL_ERROR, CONTEXT << " execute" << errorMsg); + } + return errorContext; +} + +void WorkerHttpRequest::init() { + LOGS(_log, LOG_LVL_TRACE, CONTEXT); + replica::Lock lock(_mtx, CONTEXT); + if (status() != protocol::Status::CREATED) return; + + // Start the expiration timer + if (_expirationTimeoutSec != 0) { + _expirationTimer.cancel(); + _expirationTimer.expires_from_now(boost::posix_time::seconds(_expirationTimeoutSec)); + _expirationTimer.async_wait(bind(&WorkerHttpRequest::_expired, shared_from_this(), _1)); + LOGS(_log, LOG_LVL_TRACE, + CONTEXT << " started timer with _expirationTimeoutSec: " << _expirationTimeoutSec); + } +} + +void WorkerHttpRequest::start() { + LOGS(_log, LOG_LVL_TRACE, CONTEXT); + replica::Lock lock(_mtx, CONTEXT); + switch (status()) { 
+ case protocol::Status::CREATED: + setStatus(lock, protocol::Status::IN_PROGRESS); + break; + default: + throw logic_error(CONTEXT + " not allowed while in status: " + protocol::toString(status())); + } +} + +void WorkerHttpRequest::cancel() { + LOGS(_log, LOG_LVL_TRACE, CONTEXT); + replica::Lock lock(_mtx, CONTEXT); + switch (status()) { + case protocol::Status::QUEUED: + case protocol::Status::CREATED: + case protocol::Status::CANCELLED: + setStatus(lock, protocol::Status::CANCELLED); + break; + case protocol::Status::IN_PROGRESS: + case protocol::Status::IS_CANCELLING: + setStatus(lock, protocol::Status::IS_CANCELLING); + break; + + // Nothing to be done to the completed requests + case protocol::Status::SUCCESS: + case protocol::Status::BAD: + case protocol::Status::FAILED: + break; + } +} + +void WorkerHttpRequest::rollback() { + LOGS(_log, LOG_LVL_TRACE, CONTEXT); + replica::Lock lock(_mtx, CONTEXT); + switch (status()) { + case protocol::Status::CREATED: + case protocol::Status::IN_PROGRESS: + setStatus(lock, protocol::Status::CREATED); + break; + case protocol::Status::IS_CANCELLING: + setStatus(lock, protocol::Status::CANCELLED); + throw WorkerHttpRequestCancelled(); + break; + default: + throw logic_error(CONTEXT + " not allowed while in status: " + protocol::toString(status())); + } +} + +void WorkerHttpRequest::stop() { + LOGS(_log, LOG_LVL_TRACE, CONTEXT); + replica::Lock lock(_mtx, CONTEXT); + setStatus(lock, protocol::Status::CREATED); +} + +void WorkerHttpRequest::dispose() noexcept { + LOGS(_log, LOG_LVL_TRACE, CONTEXT); + replica::Lock lock(_mtx, CONTEXT); + if (_expirationTimeoutSec != 0) { + try { + _expirationTimer.cancel(); + } catch (exception const& ex) { + LOGS(_log, LOG_LVL_WARN, + CONTEXT << " request expiration couldn't be cancelled, ex: " << ex.what()); + } + } +} + +json WorkerHttpRequest::toJson(bool includeResultIfFinished) const { + LOGS(_log, LOG_LVL_TRACE, CONTEXT); + + // IMPORTANT: the lock is not needed here because the data 
read by the method + // are safe to read w/o any synchronization. The only exception is the results + // which is not a problem since results are only read after the request is finished. + + json response = _hdr.toJson(); + response["req"] = _req; + response["type"] = _type; + response["status"] = _status.load(); + response["status_str"] = protocol::toString(_status.load()); + response["status_ext"] = _extendedStatus.load(); + response["status_ext_str"] = protocol::toString(_extendedStatus.load()); + response["expiration_timeout_sec"] = _expirationTimeoutSec; + response["performance"] = _performance.toJson(); + response["result"] = json::object(); + if (includeResultIfFinished && _status == protocol::Status::SUCCESS) { + getResult(response["result"]); + } + return response; +} + +string WorkerHttpRequest::context(string const& className, string const& func) const { + return id() + " " + type() + " " + protocol::toString(status()) + " " + className + "::" + func; +} + +void WorkerHttpRequest::setStatus(replica::Lock const& lock, protocol::Status status, + protocol::StatusExt extendedStatus) { + LOGS(_log, LOG_LVL_TRACE, + CONTEXT << " " << protocol::toString(_status, _extendedStatus) << " -> " + << protocol::toString(status, extendedStatus)); + switch (status) { + case protocol::Status::CREATED: + _performance.start_time = 0; + _performance.finish_time = 0; + break; + case protocol::Status::IN_PROGRESS: + _performance.setUpdateStart(); + _performance.finish_time = 0; + break; + case protocol::Status::IS_CANCELLING: + break; + case protocol::Status::CANCELLED: + + // Set the start time to some meaningful value in case if the request was + // cancelled while sitting in the input queue + if (0 == _performance.start_time) _performance.setUpdateStart(); + _performance.setUpdateFinish(); + break; + + case protocol::Status::SUCCESS: + case protocol::Status::FAILED: + _performance.setUpdateFinish(); + break; + default: + throw logic_error(CONTEXT + " unhandled status: " + 
protocol::toString(status)); + } + + // ATTENTION: the top-level status is the last to be modified in + // the state transition to ensure clients will see a consistent state + // of the object. + _extendedStatus = extendedStatus; + _status = status; +} + +void WorkerHttpRequest::_expired(boost::system::error_code const& ec) { + LOGS(_log, LOG_LVL_TRACE, + CONTEXT << (ec == boost::asio::error::operation_aborted ? " ** ABORTED **" : "")); + + replica::Lock lock(_mtx, CONTEXT); + + // Clearing the stored callback after finishing the up-stream notification + // has two purposes: + // + // 1. it guarantees no more than a one-time notification + // 2. it breaks the up-stream dependency on a caller object if a shared + // pointer to the object was captured in the lambda-function's closure + + // Ignore this event if the timer was aborted + if (ec != boost::asio::error::operation_aborted) { + if (_onExpired != nullptr) { + serviceProvider()->io_service().post(bind(move(_onExpired), _hdr.id)); + } + } + _onExpired = nullptr; +} + +} // namespace lsst::qserv::replica diff --git a/src/replica/worker/WorkerHttpRequest.h b/src/replica/worker/WorkerHttpRequest.h new file mode 100644 index 0000000000..6b9921e985 --- /dev/null +++ b/src/replica/worker/WorkerHttpRequest.h @@ -0,0 +1,352 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . + */ +#ifndef LSST_QSERV_REPLICA_WORKERHTTPREQUEST_H +#define LSST_QSERV_REPLICA_WORKERHTTPREQUEST_H + +// System headers +#include +#include +#include +#include +#include + +// Third party headers +#include "boost/asio.hpp" + +// Qserv headers +#include "replica/proto/Protocol.h" +#include "replica/util/Common.h" +#include "replica/util/Mutex.h" +#include "replica/util/Performance.h" + +// Forward declarations +namespace lsst::qserv::replica { +class ServiceProvider; +} // namespace lsst::qserv::replica + +// This header declarations +namespace lsst::qserv::replica { + +/** + * Class WorkerHttpRequestCancelled represents an exception thrown when + * a replication request is cancelled + */ +class WorkerHttpRequestCancelled : public std::exception { +public: + /// @return a short description of the exception + char const* what() const noexcept override { return "cancelled"; } +}; + +/** + * Class WorkerHttpRequest is the base class for a family of the worker-side + * requests which require non-deterministic interactions with the server's + * environment (network, disk I/O, etc.). Generally speaking, all requests + * which can't be implemented instantaneously fall into this category. + */ +class WorkerHttpRequest : public std::enable_shared_from_this { +public: + /// The function type for notifications on the expiration of the request + /// given its unique identifier. + typedef std::function ExpirationCallbackType; + + WorkerHttpRequest() = delete; + WorkerHttpRequest(WorkerHttpRequest const&) = delete; + WorkerHttpRequest& operator=(WorkerHttpRequest const&) = delete; + + /// Destructor (can't 'override' because the base class's one is not virtual) + /// Also, non-trivial destructor is needed to stop the request expiration + /// timer (if any was started by method init()).
+ virtual ~WorkerHttpRequest(); + + std::shared_ptr const& serviceProvider() const { return _serviceProvider; } + std::string const& worker() const { return _worker; } + std::string const& type() const { return _type; } + std::string const& id() const { return _hdr.id; } + int priority() const { return _hdr.priority; } + nlohmann::json const& req() const { return _req; } + protocol::Status status() const { return _status; } + protocol::StatusExt extendedStatus() const { return _extendedStatus; } + + WorkerPerformance const& performance() const { return _performance; } + + /** + * This method is called from the initial state protocol::Status::CREATED in order + * to start the request expiration timer. It's safe to call this operation + * multiple times. Each invocation of the method will result in cancelling + * the previously set timer (if any) and starting a new one. + */ + void init(); + + /** + * This method is called from the initial state protocol::Status::CREATED in order + * to prepare the request for processing (to respond to methods 'execute', + * 'cancel', 'rollback' or 'reset'. The final state upon the completion + * of the method should be protocol::Status::IN_PROGRESS. + */ + void start(); + + /** + * This method should be invoked (repeatedly) to execute the request until + * it returns 'true' or throws an exception. Note that returning 'true' + * may mean both success or failure, depending on the completion status + * of the request. + * + * This method is required to be called while the request state is protocol::Status::IN_PROGRESS. + * The method will throw custom exception WorkerHttpRequestCancelled when it detects a cancellation + * request. + * + * @return result of the operation as explained above + */ + virtual bool execute() = 0; + + /** + * Cancel execution of the request. + * + * The effect of the operation varies depending on the current state of + * the request. 
The default (the base class's implementation) assumes + * the following transitions: + * + * {protocol::Status::QUEUED,protocol::Status::CREATED,protocol::Status::CANCELLED} -> protocol::Status::CANCELLED + * {protocol::Status::IN_PROGRESS,protocol::Status::IS_CANCELLING} -> protocol::Status::IS_CANCELLING + * {protocol::Status::SUCCESS,protocol::Status::BAD,protocol::Status::FAILED} -> no change + */ + virtual void cancel(); + + /** + * Roll back the request into its initial state and cleanup partial results + * if possible. + * + * The effect of the operation varies depending on the current state of + * the request. The default (the base class's implementation) assumes + * the following transitions: + * + * {protocol::Status::CREATED, protocol::Status::IN_PROGRESS} -> protocol::Status::CREATED + * {protocol::Status::IS_CANCELLING} -> protocol::Status::CANCELLED -> throw WorkerHttpRequestCancelled + * {*} -> throw std::logic_error + */ + virtual void rollback(); + + /** + * This method is called from *ANY* initial state in order to turn + * the request back into the initial protocol::Status::CREATED. + */ + void stop(); + + /** + * This method should be used to cancel the request expiration timer. + * Normally this method is initiated during the external "garbage collection" + * of requests to ensure all resources (including a copy of a smart pointer onto + * objects of the request classes) held by timers get released. + * + * @note this method won't throw any exceptions so that it could + * be invoked from the destructor. All exceptions (should they + * occur during an execution of the method) will be intercepted + * and reported as warnings to the message logger.
+ * @param includeResultIfFinished (optional) flag to include results if the request has finished + */ + nlohmann::json toJson(bool includeResultIfFinished = false) const; + + /// @return the context string + std::string context(std::string const& className, std::string const& func) const; + +protected: + /** + * The normal constructor of the class + * + * @param serviceProvider provider is needed to access the Configuration of + * a setup and for validating the input parameters + * @param worker the name of a worker. It must be the same worker as the one + * where the request is going to be processed. + * @param type the type name of a request + * @param hdr request header (common parameters of the queued request) + * @param req the request object received from a client (request-specific parameters) + * @param onExpired request expiration callback function + * @throws std::invalid_argument if the worker is unknown + */ + WorkerHttpRequest(std::shared_ptr const& serviceProvider, std::string const& worker, + std::string const& type, protocol::QueuedRequestHdr const& hdr, + nlohmann::json const& req, ExpirationCallbackType const& onExpired); + + /** + * The method is used to check if the request has entered the cancellation state. + * The implementation assumes the following transitions: + * + * {protocol::Status::IN_PROGRESS} -> protocol::Status::IN_PROGRESS + * {protocol::Status::IS_CANCELLING} -> protocol::Status::CANCELLED -> throw WorkerHttpRequestCancelled + * {*} -> throw std::logic_error + * + * @param lock a lock on _mtx which was acquired before calling this method + * @param context_ a scope class/method from where the method was called + * @throws WorkerHttpRequestCancelled if the request is being cancelled. + * @throws std::logic_error if the state is not as expected.
+ */ + void checkIfCancelling(replica::Lock const& lock, std::string const& context_); + + /** Set the status + * + * @note this method needs to be called within a thread-safe context + * when moving requests between different queues. + * + * @param lock a lock which was acquired before calling this method + * @param status primary status to be set + * @param extendedStatus secondary status to be set + */ + void setStatus(replica::Lock const& lock, protocol::Status status, + protocol::StatusExt extendedStatus = protocol::StatusExt::NONE); + + /** + * Fill in the information object for the specified request based on its + * actual type. + * @param result an object to be filled + */ + virtual void getResult(nlohmann::json& result) const = 0; + + /** + * Structure ErrorContext is used for tracking errors reported by + * method 'reportErrorIf' + */ + struct ErrorContext { + // State of the object + bool failed; + protocol::StatusExt extendedStatus; + + ErrorContext() : failed(false), extendedStatus(protocol::StatusExt::NONE) {} + + /** + * Merge the context of another object into the current one. + * + * @note Only the first error code will be stored when an error condition + * is detected. An assumption is that the first error would usually cause + * a "chain reaction", hence only the first one typically matters. + * Other details could be found in the log files if needed. + * @param rhs input context to be merged with the current state + */ + ErrorContext& operator||(const ErrorContext& rhs) { + if (&rhs != this) { + if (rhs.failed and not failed) { + failed = true; + extendedStatus = rhs.extendedStatus; + } + } + return *this; + } + }; + + /** + * Check if the error condition is set and report the error. + * The error message will be sent to the corresponding logging + * stream.
+ * + * @param condition if set to 'true' then there is a error condition + * @param extendedStatus extended status corresponding to the condition + * (will be ignored if no error condition is present) + * @param errorMsg a message to be reported into the log stream + * @return the context object encapsulating values passed in parameters + * 'condition' and 'extendedStatus' + */ + ErrorContext reportErrorIf(bool condition, protocol::StatusExt extendedStatus, + std::string const& errorMsg); + + /// Return shared pointer of the desired subclass (no dynamic type checking) + template + std::shared_ptr shared_from_base() { + return std::static_pointer_cast(shared_from_this()); + } + + // Input parameters + + std::shared_ptr const _serviceProvider; + + std::string const _worker; + std::string const _type; + protocol::QueuedRequestHdr const _hdr; + nlohmann::json const _req; + + ExpirationCallbackType _onExpired; ///< The callback is reset when the request gets expired + /// or explicitly disposed. + unsigned int const _expirationTimeoutSec; + + /// This timer is used (if configured) to limit the total duration of time + /// a request could exist from its creation till termination. The timer + /// starts when the request gets created. And it's explicitly finished when + /// a request object gets destroyed. + /// + /// If the timer has a chance to expire then the request expiration callback + /// (if any) passed into the constructor will be invoked to notify WorkerProcessor + /// on the expiration event. + boost::asio::deadline_timer _expirationTimer; + + // 2-layer state of a request + + std::atomic _status; + std::atomic _extendedStatus; + + /// Performance counters + WorkerPerformance _performance; + + /// Mutex guarding API calls where it's needed + mutable replica::Mutex _mtx; + + /// Mutex guarding operations with the worker's data folder + static replica::Mutex _mtxDataFolderOperations; + +private: + /** + * Request expiration timer's handler.
The expiration interval (if any) + * is obtained from the Controller-side requests or obtained from + * the configuration service. When the request expires (and if the timer + * is not aborted due to request disposal) then an upstream callback + * is invoked. + * + * @param ec error code to be checked to see if the time was aborted + * by the explicit request disposal operation. + */ + void _expired(boost::system::error_code const& ec); + + // For memory usage monitoring and memory leak diagnostic. + static std::atomic _numInstances; +}; + +/** + * Structure WorkerHttpRequestCompare is a functor representing a comparison type + * for strict weak ordering required by std::priority_queue + */ +struct WorkerHttpRequestCompare { + /** + * Sort requests by their priorities + * @param lhs pointer to a request on the left side of a logical comparison + * @param rhs pointer to a request on the right side of a logical comparison + * @return 'true' if the priority of 'lhs' is strictly less than the one of 'rhs' + */ + bool operator()(std::shared_ptr const& lhs, + std::shared_ptr const& rhs) const { + return lhs->priority() < rhs->priority(); + } +}; + +} // namespace lsst::qserv::replica + +#endif // LSST_QSERV_REPLICA_WORKERHTTPREQUEST_H diff --git a/src/replica/worker/WorkerHttpSvc.cc b/src/replica/worker/WorkerHttpSvc.cc new file mode 100644 index 0000000000..249a2b9c55 --- /dev/null +++ b/src/replica/worker/WorkerHttpSvc.cc @@ -0,0 +1,149 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . + */ + +// Class header +#include "replica/worker/WorkerHttpSvc.h" + +// System headers +#include +#include + +// Qserv headers +#include "http/ChttpMetaModule.h" +#include "replica/config/Configuration.h" +#include "replica/services/ServiceProvider.h" +#include "replica/util/Common.h" +#include "replica/worker/WorkerHttpProcessor.h" +#include "replica/worker/WorkerHttpSvcMod.h" + +// LSST headers +#include "lsst/log/Log.h" + +// Third party headers +#include "httplib.h" +#include "nlohmann/json.hpp" + +using namespace nlohmann; +using namespace std; + +namespace { +string const context_ = "WORKER-HTTP-SVC "; +LOG_LOGGER _log = LOG_GET("lsst.qserv.worker.WorkerHttpSvc"); +} // namespace + +namespace lsst::qserv::replica { + +shared_ptr WorkerHttpSvc::create(shared_ptr const& serviceProvider, + string const& workerName) { + return shared_ptr(new WorkerHttpSvc(serviceProvider, workerName)); +} + +WorkerHttpSvc::WorkerHttpSvc(shared_ptr const& serviceProvider, string const& workerName) + : ChttpSvc(context_, serviceProvider, + serviceProvider->config()->get("worker", "http-svc-port"), + serviceProvider->config()->get("worker", "http-svc-max-queued-requests"), + serviceProvider->config()->get("worker", "num-http-svc-threads")), + _workerName(workerName), + _processor(WorkerHttpProcessor::create(serviceProvider, workerName)) { + // Start the processor to allow processing requests. 
+ _processor->run(); +} + +void WorkerHttpSvc::registerServices(unique_ptr const& server) { + throwIf(server == nullptr, context_ + "the server is not initialized"); + auto const self = shared_from_base(); + server->Get("/meta/version", [self](httplib::Request const& req, httplib::Response& resp) { + json const info = json::object({{"kind", "replication-worker-svc"}, + {"id", self->_workerName}, + {"instance_id", self->serviceProvider()->instanceId()}}); + http::ChttpMetaModule::process(context_, info, req, resp, "VERSION"); + }); + server->Post("/worker/echo", [self](httplib::Request const& req, httplib::Response& resp) { + WorkerHttpSvcMod::process(self->serviceProvider(), self->_processor, self->_workerName, req, resp, + "ECHO", http::AuthType::REQUIRED); + }); + server->Post("/worker/replica/create", [self](httplib::Request const& req, httplib::Response& resp) { + WorkerHttpSvcMod::process(self->serviceProvider(), self->_processor, self->_workerName, req, resp, + "REPLICA-CREATE", http::AuthType::REQUIRED); + }); + server->Post("/worker/replica/delete", [self](httplib::Request const& req, httplib::Response& resp) { + WorkerHttpSvcMod::process(self->serviceProvider(), self->_processor, self->_workerName, req, resp, + "REPLICA-DELETE", http::AuthType::REQUIRED); + }); + server->Post("/worker/replica/find", [self](httplib::Request const& req, httplib::Response& resp) { + WorkerHttpSvcMod::process(self->serviceProvider(), self->_processor, self->_workerName, req, resp, + "REPLICA-FIND", http::AuthType::REQUIRED); + }); + server->Post("/worker/replica/find-all", [self](httplib::Request const& req, httplib::Response& resp) { + WorkerHttpSvcMod::process(self->serviceProvider(), self->_processor, self->_workerName, req, resp, + "REPLICA-FIND-ALL", http::AuthType::REQUIRED); + }); + server->Post("/worker/index", [self](httplib::Request const& req, httplib::Response& resp) { + WorkerHttpSvcMod::process(self->serviceProvider(), self->_processor, self->_workerName, req, 
resp, + "INDEX", http::AuthType::REQUIRED); + }); + server->Post("/worker/sql", [self](httplib::Request const& req, httplib::Response& resp) { + WorkerHttpSvcMod::process(self->serviceProvider(), self->_processor, self->_workerName, req, resp, + "SQL", http::AuthType::REQUIRED); + }); + server->Get("/worker/request/track/:id", [self](httplib::Request const& req, httplib::Response& resp) { + WorkerHttpSvcMod::process(self->serviceProvider(), self->_processor, self->_workerName, req, resp, + "REQUEST-TRACK"); + }); + server->Get("/worker/request/status/:id", [self](httplib::Request const& req, httplib::Response& resp) { + WorkerHttpSvcMod::process(self->serviceProvider(), self->_processor, self->_workerName, req, resp, + "REQUEST-STATUS"); + }); + server->Put("/worker/request/stop/:id", [self](httplib::Request const& req, httplib::Response& resp) { + WorkerHttpSvcMod::process(self->serviceProvider(), self->_processor, self->_workerName, req, resp, + "REQUEST-STOP", http::AuthType::REQUIRED); + }); + server->Put("/worker/request/dispose", [self](httplib::Request const& req, httplib::Response& resp) { + WorkerHttpSvcMod::process(self->serviceProvider(), self->_processor, self->_workerName, req, resp, + "REQUEST-DISPOSE", http::AuthType::REQUIRED); + }); + server->Get("/worker/service/status", [self](httplib::Request const& req, httplib::Response& resp) { + WorkerHttpSvcMod::process(self->serviceProvider(), self->_processor, self->_workerName, req, resp, + "SERVICE-STATUS"); + }); + server->Get("/worker/service/requests", [self](httplib::Request const& req, httplib::Response& resp) { + WorkerHttpSvcMod::process(self->serviceProvider(), self->_processor, self->_workerName, req, resp, + "SERVICE-REQUESTS"); + }); + server->Put("/worker/service/suspend", [self](httplib::Request const& req, httplib::Response& resp) { + WorkerHttpSvcMod::process(self->serviceProvider(), self->_processor, self->_workerName, req, resp, + "SERVICE-SUSPEND", http::AuthType::REQUIRED); + }); + 
server->Put("/worker/service/resume", [self](httplib::Request const& req, httplib::Response& resp) { + WorkerHttpSvcMod::process(self->serviceProvider(), self->_processor, self->_workerName, req, resp, + "SERVICE-RESUME", http::AuthType::REQUIRED); + }); + server->Put("/worker/service/drain", [self](httplib::Request const& req, httplib::Response& resp) { + WorkerHttpSvcMod::process(self->serviceProvider(), self->_processor, self->_workerName, req, resp, + "SERVICE-DRAIN", http::AuthType::REQUIRED); + }); + server->Put("/worker/service/reconfig", [self](httplib::Request const& req, httplib::Response& resp) { + WorkerHttpSvcMod::process(self->serviceProvider(), self->_processor, self->_workerName, req, resp, + "SERVICE-RECONFIG", http::AuthType::REQUIRED); + }); +} + +} // namespace lsst::qserv::replica diff --git a/src/replica/worker/WorkerHttpSvc.h b/src/replica/worker/WorkerHttpSvc.h new file mode 100644 index 0000000000..0e204649e4 --- /dev/null +++ b/src/replica/worker/WorkerHttpSvc.h @@ -0,0 +1,84 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . 
+ */ +#ifndef LSST_QSERV_REPLICA_WORKERHTTPSVC_H +#define LSST_QSERV_REPLICA_WORKERHTTPSVC_H + +// System headers +#include +#include + +// Qserv headers +#include "replica/util/ChttpSvc.h" + +// Forward declarations +namespace lsst::qserv::replica { +class ServiceProvider; +class WorkerHttpProcessor; +} // namespace lsst::qserv::replica + +namespace httplib { +class Server; +} // namespace httplib + +// This header declarations +namespace lsst::qserv::replica { + +/** + * Class WorkerHttpSvc is the HTTP frontend to the Replication Worker Service. + * Each instance of this class will be running in its own thread. + */ +class WorkerHttpSvc : public ChttpSvc { +public: + /** + * Create an instance of the service. + * + * @param serviceProvider For configuration, etc. services. + * @param workerName The name of a worker this service is acting upon (used for + * checking consistency of the protocol). + * @return A pointer to the created object. + */ + static std::shared_ptr create(std::shared_ptr const& serviceProvider, + std::string const& workerName); + + WorkerHttpSvc() = delete; + WorkerHttpSvc(WorkerHttpSvc const&) = delete; + WorkerHttpSvc& operator=(WorkerHttpSvc const&) = delete; + + virtual ~WorkerHttpSvc() = default; + +protected: + /// @see HttpSvc::registerServices() + virtual void registerServices(std::unique_ptr const& server) override; + +private: + /// @see WorkerHttpSvc::create() + WorkerHttpSvc(std::shared_ptr const& serviceProvider, std::string const& workerName); + + // Input parameters + std::string const _workerName; + + /// The request processor. 
+ std::shared_ptr _processor; +}; + +} // namespace lsst::qserv::replica + +#endif // LSST_QSERV_REPLICA_WORKERHTTPSVC_H diff --git a/src/replica/worker/WorkerHttpSvcMod.cc b/src/replica/worker/WorkerHttpSvcMod.cc new file mode 100644 index 0000000000..2d8a856567 --- /dev/null +++ b/src/replica/worker/WorkerHttpSvcMod.cc @@ -0,0 +1,244 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . 
+ */ + +// Class header +#include "replica/worker/WorkerHttpSvcMod.h" + +// System headers +#include + +// Third-party headers +#include + +// Qserv header +#include "http/Method.h" +#include "replica/proto/Protocol.h" +#include "replica/worker/WorkerHttpProcessor.h" +#include "replica/services/ServiceProvider.h" + +using namespace std; +using json = nlohmann::json; + +namespace lsst::qserv::replica { + +void WorkerHttpSvcMod::process(shared_ptr const& serviceProvider, + shared_ptr const& processor, string const& workerName, + httplib::Request const& req, httplib::Response& resp, + string const& subModuleName, http::AuthType const authType) { + WorkerHttpSvcMod module(serviceProvider, processor, workerName, req, resp); + module.execute(subModuleName, authType); +} + +WorkerHttpSvcMod::WorkerHttpSvcMod(shared_ptr const& serviceProvider, + shared_ptr const& processor, string const& workerName, + httplib::Request const& req, httplib::Response& resp) + : http::ChttpModule(serviceProvider->authKey(), serviceProvider->adminAuthKey(), req, resp), + _serviceProvider(serviceProvider), + _processor(processor), + _workerName(workerName) {} + +string WorkerHttpSvcMod::context() const { return "WORKER-HTTP-SVC "; } + +json WorkerHttpSvcMod::executeImpl(string const& subModuleName) { + debug(__func__, "subModuleName: '" + subModuleName + "'"); + enforceInstanceId(__func__, _serviceProvider->instanceId()); + if (subModuleName == "ECHO") + return _echo(); + else if (subModuleName == "REPLICA-CREATE") + return _replicaCreate(); + else if (subModuleName == "REPLICA-DELETE") + return _replicaDelete(); + else if (subModuleName == "REPLICA-FIND") + return _replicaFind(); + else if (subModuleName == "REPLICA-FIND-ALL") + return _replicaFindAll(); + else if (subModuleName == "SQL") + return _sql(); + else if (subModuleName == "INDEX") + return _index(); + else if (subModuleName == "REQUEST-TRACK") + return _requestTrack(); + else if (subModuleName == "REQUEST-STATUS") + return 
_requestStatus(); + else if (subModuleName == "REQUEST-STOP") + return _requestStop(); + else if (subModuleName == "REQUEST-DISPOSE") + return _requestDispose(); + else if (subModuleName == "SERVICE-SUSPEND") + return _serviceSuspend(); + else if (subModuleName == "SERVICE-RESUME") + return _serviceResume(); + else if (subModuleName == "SERVICE-STATUS") + return _serviceStatus(); + else if (subModuleName == "SERVICE-REQUESTS") + return _serviceRequests(); + else if (subModuleName == "SERVICE-DRAIN") + return _serviceDrain(); + else if (subModuleName == "SERVICE-RECONFIG") + return _serviceReconfig(); + throw invalid_argument(context() + "::" + string(__func__) + " unsupported sub-module: '" + + subModuleName + "'"); +} + +protocol::QueuedRequestHdr WorkerHttpSvcMod::_parseHdr(string const& func) const { + protocol::QueuedRequestHdr const hdr(body().required("id"), body().optional("priority", 0), + body().optional("timeout", 0)); + debug(func, "id: '" + hdr.id + "'"); + debug(func, "priority: " + to_string(hdr.priority)); + debug(func, "timeout: " + to_string(hdr.timeout)); + return hdr; +} + +json WorkerHttpSvcMod::_echo() const { + debug(__func__); + checkApiVersion(__func__, 40); + return _processor->echo(_parseHdr(__func__), body().required("req")); +} + +json WorkerHttpSvcMod::_replicaCreate() { + debug(__func__); + checkApiVersion(__func__, 40); + return _processor->createReplica(_parseHdr(__func__), body().required("req")); +} + +json WorkerHttpSvcMod::_replicaDelete() { + debug(__func__); + checkApiVersion(__func__, 40); + return _processor->deleteReplica(_parseHdr(__func__), body().required("req")); +} + +json WorkerHttpSvcMod::_replicaFind() { + debug(__func__); + checkApiVersion(__func__, 40); + return _processor->findReplica(_parseHdr(__func__), body().required("req")); +} + +json WorkerHttpSvcMod::_replicaFindAll() { + debug(__func__); + checkApiVersion(__func__, 40); + return _processor->findAllReplicas(_parseHdr(__func__), body().required("req")); +} 
+ +json WorkerHttpSvcMod::_index() { + debug(__func__); + checkApiVersion(__func__, 40); + return _processor->index(_parseHdr(__func__), body().required("req")); +} + +json WorkerHttpSvcMod::_sql() { + debug(__func__); + checkApiVersion(__func__, 40); + return _processor->sql(_parseHdr(__func__), body().required("req")); +} + +json WorkerHttpSvcMod::_requestTrack() { + debug(__func__); + checkApiVersion(__func__, 40); + string const id = params().at("id"); + debug(__func__, "id: '" + id + "'"); + return _processor->trackRequest(id); +} + +json WorkerHttpSvcMod::_requestStatus() { + debug(__func__); + checkApiVersion(__func__, 40); + string const id = params().at("id"); + debug(__func__, "id: '" + id + "'"); + return _processor->requestStatus(id); +} + +json WorkerHttpSvcMod::_requestStop() { + debug(__func__); + checkApiVersion(__func__, 40); + string const id = params().at("id"); + debug(__func__, "id: '" + id + "'"); + return _processor->stopRequest(id); +} + +json WorkerHttpSvcMod::_requestDispose() { + debug(__func__); + checkApiVersion(__func__, 40); + auto const idsJson = body().required("ids"); + if (!idsJson.is_array()) + throw invalid_argument(context() + "::" + string(__func__) + " 'ids' is not an array"); + + json idsDisposedJson = json::object(); + for (auto const& idJson : idsJson) { + string const id = idJson.get(); + idsDisposedJson[id] = _processor->disposeRequest(id) ? 
1 : 0; + } + return json::object({{"status", protocol::Status::SUCCESS}, + {"status_str", protocol::toString(protocol::Status::SUCCESS)}, + {"status_ext", protocol::StatusExt::NONE}, + {"status_ext_str", protocol::toString(protocol::StatusExt::NONE)}, + {"ids_disposed", idsDisposedJson}}); +} + +json WorkerHttpSvcMod::_serviceSuspend() { + debug(__func__); + checkApiVersion(__func__, 40); + + // This operation is allowed to be asynchronous as it may take + // extra time for the processor's threads to finish on-going processing + _processor->stop(); + return _processor->toJson(_processor->state() == protocol::ServiceState::RUNNING + ? protocol::Status::FAILED + : protocol::Status::SUCCESS); +} + +json WorkerHttpSvcMod::_serviceResume() { + debug(__func__); + checkApiVersion(__func__, 40); + _processor->run(); + return _processor->toJson(_processor->state() == protocol::ServiceState::RUNNING + ? protocol::Status::SUCCESS + : protocol::Status::FAILED); +} + +json WorkerHttpSvcMod::_serviceStatus() { + debug(__func__); + checkApiVersion(__func__, 40); + return _processor->toJson(protocol::Status::SUCCESS); +} + +json WorkerHttpSvcMod::_serviceRequests() { + debug(__func__); + checkApiVersion(__func__, 40); + const bool includeRequests = true; + return _processor->toJson(protocol::Status::SUCCESS, includeRequests); +} + +json WorkerHttpSvcMod::_serviceDrain() { + debug(__func__); + checkApiVersion(__func__, 40); + _processor->drain(); + const bool includeRequests = true; + return _processor->toJson(protocol::Status::SUCCESS, includeRequests); +} + +json WorkerHttpSvcMod::_serviceReconfig() { + debug(__func__); + checkApiVersion(__func__, 40); + _processor->reconfig(); + return _processor->toJson(protocol::Status::SUCCESS); +} + +} // namespace lsst::qserv::replica diff --git a/src/replica/worker/WorkerHttpSvcMod.h b/src/replica/worker/WorkerHttpSvcMod.h new file mode 100644 index 0000000000..bf72ad0bd3 --- /dev/null +++ b/src/replica/worker/WorkerHttpSvcMod.h @@ -0,0 
+1,172 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . + */ +#ifndef LSST_QSERV_WORKERHTTPSVCMOD_H +#define LSST_QSERV_WORKERHTTPSVCMOD_H + +// System headers +#include + +// Third party headers +#include "nlohmann/json.hpp" + +// Qserv headers +#include "http/ChttpModule.h" + +// Forward declarations + +namespace lsst::qserv::replica { +class ServiceProvider; +class WorkerHttpProcessor; +} // namespace lsst::qserv::replica + +namespace lsst::qserv::replica::protocol { +struct QueuedRequestHdr; +} // namespace lsst::qserv::replica::protocol + +// This header declarations +namespace lsst::qserv::replica { + +/** + * Class WorkerHttpSvcMod processes the Replication Controller's requests. + * The class is used by the HTTP server built into the worker Replication service. + */ +class WorkerHttpSvcMod : public http::ChttpModule { +public: + WorkerHttpSvcMod() = delete; + WorkerHttpSvcMod(WorkerHttpSvcMod const&) = delete; + WorkerHttpSvcMod& operator=(WorkerHttpSvcMod const&) = delete; + + virtual ~WorkerHttpSvcMod() = default; + + /** + * Process a request. 
+ * + * Supported values for parameter 'subModuleName': + * + * ECHO for testing the worker-side framework + * REPLICA-CREATE for creating a replica of a chunk + * REPLICA-DELETE for deleting an existing replica of a chunk + * REPLICA-FIND for finding out if a replica is present, and reporting its state + * REPLICA-FIND-ALL for finding all replicas and reporting their states + * INDEX for extracting and returning a collection of the "director" index data + * SQL for executing various SQL statements against the worker's database + * REQUEST-TRACK for tracking status and retrieving results of the previously submitted request + * REQUEST-STATUS for checking the status of the previously submitted request + * REQUEST-STOP for stopping the previously submitted request + * REQUEST-DISPOSE for garbage collecting the request + * SERVICE-STATUS for checking the status of the worker replication service + * SERVICE-SUSPEND for suspending the worker replication service + * SERVICE-RESUME for resuming the worker replication service + * SERVICE-REQUESTS for listing the outstanding requests + * SERVICE-DRAIN for draining the worker replication service + * SERVICE-RECONFIG for reconfiguring the worker replication service + * + * @param serviceProvider The provider of services is needed to access + * the configuration and the database services. + * @param workerName The name of a worker this service is acting upon (used to pull + * worker-specific configuration options for the service). + * @param processor Request processor. + * @param req The HTTP request. + * @param resp The HTTP response channel. + * @param subModuleName The name of a submodule to be called.
+ * @param authType The authorization requirements for the module + * @throws std::invalid_argument for unknown values of parameter 'subModuleName' + */ + static void process(std::shared_ptr const& serviceProvider, + std::shared_ptr const& processor, std::string const& workerName, + httplib::Request const& req, httplib::Response& resp, + std::string const& subModuleName, + http::AuthType const authType = http::AuthType::NONE); + +protected: + virtual std::string context() const final; + virtual nlohmann::json executeImpl(std::string const& subModuleName) final; + +private: + WorkerHttpSvcMod(std::shared_ptr const& serviceProvider, + std::shared_ptr const& processor, std::string const& workerName, + httplib::Request const& req, httplib::Response& resp); + + /// Parse common parameters of the queued requests + /// @param func The name of the function to be used in the log messages + /// @return The parsed header + protocol::QueuedRequestHdr _parseHdr(std::string const& func) const; + + /// Process the ECHO request + nlohmann::json _echo() const; + + /// Process the REPLICA-CREATE request + nlohmann::json _replicaCreate(); + + /// Process the REPLICA-DELETE request + nlohmann::json _replicaDelete(); + + /// Process the REPLICA-FIND request + nlohmann::json _replicaFind(); + + /// Process the REPLICA-FIND-ALL request + nlohmann::json _replicaFindAll(); + + /// Process the INDEX request + nlohmann::json _index(); + + /// Process the SQL request + nlohmann::json _sql(); + + /// Process the REQUEST-TRACK request + nlohmann::json _requestTrack(); + + /// Process the REQUEST-STATUS request + nlohmann::json _requestStatus(); + + /// Process the REQUEST-STOP request + nlohmann::json _requestStop(); + + /// Process the REQUEST-DISPOSE request + nlohmann::json _requestDispose(); + + /// Process the SERVICE-SUSPEND request + nlohmann::json _serviceSuspend(); + + /// Process the SERVICE-RESUME request + nlohmann::json _serviceResume(); + + /// Process the SERVICE-STATUS request + 
nlohmann::json _serviceStatus(); + + /// Process the SERVICE-REQUESTS request + nlohmann::json _serviceRequests(); + + /// Process the SERVICE-DRAIN request + nlohmann::json _serviceDrain(); + + /// Process the SERVICE-RECONFIG request + nlohmann::json _serviceReconfig(); + + // Input parameters + std::shared_ptr const _serviceProvider; + std::shared_ptr _processor; + std::string const _workerName; +}; + +} // namespace lsst::qserv::replica + +#endif // LSST_QSERV_WORKERHTTPSVCMOD_H diff --git a/src/replica/worker/WorkerSqlHttpRequest.cc b/src/replica/worker/WorkerSqlHttpRequest.cc new file mode 100644 index 0000000000..de92e5c1a3 --- /dev/null +++ b/src/replica/worker/WorkerSqlHttpRequest.cc @@ -0,0 +1,425 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . 
+ */ + +// Class header +#include "replica/worker/WorkerSqlHttpRequest.h" + +// System headers +#include +#include + +// Qserv headers +#include "replica/config/Configuration.h" +#include "replica/mysql/DatabaseMySQLUtils.h" +#include "replica/services/ServiceProvider.h" +#include "replica/util/Performance.h" +#include "replica/util/Mutex.h" + +// LSST headers +#include "lsst/log/Log.h" + +using namespace std; +using json = nlohmann::json; + +#define CONTEXT context("WorkerSqlHttpRequest", __func__) + +namespace { + +LOG_LOGGER _log = LOG_GET("lsst.qserv.replica.WorkerSqlHttpRequest"); + +} // namespace + +namespace lsst::qserv::replica { + +using namespace database::mysql; + +shared_ptr WorkerSqlHttpRequest::create( + shared_ptr const& serviceProvider, string const& worker, + protocol::QueuedRequestHdr const& hdr, json const& req, ExpirationCallbackType const& onExpired) { + auto ptr = shared_ptr( + new WorkerSqlHttpRequest(serviceProvider, worker, hdr, req, onExpired)); + ptr->init(); + return ptr; +} + +WorkerSqlHttpRequest::WorkerSqlHttpRequest(shared_ptr const& serviceProvider, + string const& worker, protocol::QueuedRequestHdr const& hdr, + json const& req, ExpirationCallbackType const& onExpired) + : WorkerHttpRequest(serviceProvider, worker, + "SQL:" + protocol::toString(protocol::SqlRequestType(req.at("type"))), hdr, req, + onExpired), + _sqlRequestType(req.at("type")), + _user(req.at("user")), + _password(req.at("password")), + _databaseInfo(serviceProvider->config()->databaseInfo(req.at("database"))), + _maxRows(req.at("max_rows")), + _batchMode(req.at("batch_mode")), + _resultSets(json::array()) { + // Parse the request-specific parameters. 
+ switch (_sqlRequestType) { + case protocol::SqlRequestType::QUERY: + _query = req.at("query"); + break; + case protocol::SqlRequestType::CREATE_TABLE: + if (!_batchMode) _table = req.at("table"); + _engine = req.at("engine"); + _columns = replica::parseSqlColumns(req.at("columns")); + _partitionByColumn = req.at("partition_by_column"); + break; + case protocol::SqlRequestType::CREATE_TABLE_INDEX: + if (!_batchMode) _table = req.at("table"); + _index = SqlIndexDef(req.at("index")); + break; + case protocol::SqlRequestType::DROP_TABLE_PARTITION: + if (!_batchMode) _table = req.at("table"); + _transactionId = req.at("transaction_id"); + break; + case protocol::SqlRequestType::DROP_TABLE_INDEX: + if (!_batchMode) _table = req.at("table"); + _indexName = req.at("index_name"); + break; + case protocol::SqlRequestType::ALTER_TABLE: + if (!_batchMode) _table = req.at("table"); + _alterTableSpec = req.at("alter_spec"); + break; + case protocol::SqlRequestType::DROP_TABLE: + case protocol::SqlRequestType::REMOVE_TABLE_PARTITIONING: + case protocol::SqlRequestType::GET_TABLE_INDEX: + case protocol::SqlRequestType::TABLE_ROW_STATS: + // These table-scope requests have no extra parameters. Still, the name of + // the table is needed since _queries() passes _table into _generateQuery() + // for them when the request is not in the batch mode. + if (!_batchMode) _table = req.at("table"); + break; + default: + break; + } +} + +void WorkerSqlHttpRequest::getResult(json& result) const { + // No locking is needed here since the method is called only after + // the request is completed. + result["result_sets"] = _resultSets; +} + +bool WorkerSqlHttpRequest::execute() { + LOGS(_log, LOG_LVL_DEBUG, CONTEXT); + + replica::Lock lock(_mtx, CONTEXT); + checkIfCancelling(lock, __func__); + + try { + // Pre-create the default result-set message before any operations with + // the database service. This is needed to report errors in method _reportFailure. + _currentResultSet(lock, true); + + // Open the connection once and then manage transactions via + // the connection handlers down below to ensure no lingering transactions + // are left after the completion of the request's execution (whether it's + // successful or not). + auto const connection = _connector(); + + // Check if this is the "batch" request which involves executing + // a series of queries.
This kind of requests needs to be processed + // slightly differently since we need to intercept and properly handle + // a few known (and somewhat expected) MySQL errors w/o aborting + // the whole request. + if (_batchMode) { + // Count the number of failures for proper error reporting on + // the current request. + // NOTE(review): no code in this file assigns _tables - confirm where + // the list of tables is expected to be filled. + size_t numFailures = 0; + bool first = true; + for (string const& table : _tables) { + // If this is the very first iteration of the loop then use + // the default result set created earlier. Otherwise create + // a new one. The reference is declared at each iteration since + // a reference can't be re-seated: assigning to it would overwrite + // the first result set (and it may dangle after the array grows). + json& resultSet = + exchange(first, false) ? _currentResultSet(lock) : _currentResultSet(lock, true); + resultSet["scope"] = table; + try { + ConnectionHandler const h(connection); + h.conn->execute([&](decltype(h.conn) const& conn_) { + conn_->begin(); + auto const query = _generateQuery(conn_, table); + if (query.mutexName.empty()) { + conn_->execute(query.query); + } else { + replica::Lock const lock(serviceProvider()->getNamedMutex(query.mutexName), + CONTEXT); + conn_->execute(query.query); + } + _extractResultSet(lock, conn_); + conn_->commit(); + }); + } catch (database::mysql::ER_NO_SUCH_TABLE_ const& ex) { + ++numFailures; + resultSet["status_ext"] = protocol::StatusExt::NO_SUCH_TABLE; + resultSet["status_ext_str"] = protocol::toString(protocol::StatusExt::NO_SUCH_TABLE); + resultSet["error"] = string(ex.what()); + } catch (database::mysql::ER_PARTITION_MGMT_ON_NONPARTITIONED_ const& ex) { + ++numFailures; + resultSet["status_ext"] = protocol::StatusExt::NOT_PARTITIONED_TABLE; + resultSet["status_ext_str"] = + protocol::toString(protocol::StatusExt::NOT_PARTITIONED_TABLE); + resultSet["error"] = string(ex.what()); + } catch (database::mysql::ER_DUP_KEYNAME_ const& ex) { + ++numFailures; + resultSet["status_ext"] = protocol::StatusExt::DUPLICATE_KEY; + resultSet["status_ext_str"] = protocol::toString(protocol::StatusExt::DUPLICATE_KEY); + resultSet["error"] = string(ex.what()); + } catch (database::mysql::ER_CANT_DROP_FIELD_OR_KEY_
const& ex) { + ++numFailures; + resultSet["status_ext"] = protocol::StatusExt::CANT_DROP_KEY; + resultSet["status_ext_str"] = protocol::toString(protocol::StatusExt::CANT_DROP_KEY); + resultSet["error"] = string(ex.what()); + } + } + if (numFailures > 0) { + setStatus(lock, protocol::Status::FAILED, protocol::StatusExt::MULTIPLE); + } else { + setStatus(lock, protocol::Status::SUCCESS); + } + } else { + // TODO: the algorithm will only report a result set of the last query + // from the multi-query collections. The implementations of the corresponding + // requests should take this into account. + ConnectionHandler const h(connection); + h.conn->execute([&](decltype(h.conn) const& conn_) { + conn_->begin(); + for (auto const& query : _queries(conn_)) { + if (query.mutexName.empty()) { + conn_->execute(query.query); + } else { + replica::Lock const lock(serviceProvider()->getNamedMutex(query.mutexName), CONTEXT); + conn_->execute(query.query); + } + _extractResultSet(lock, conn_); + } + conn_->commit(); + }); + setStatus(lock, protocol::Status::SUCCESS); + } + } catch (database::mysql::ER_NO_SUCH_TABLE_ const& ex) { + _reportFailure(lock, protocol::StatusExt::NO_SUCH_TABLE, ex.what()); + } catch (database::mysql::ER_PARTITION_MGMT_ON_NONPARTITIONED_ const& ex) { + _reportFailure(lock, protocol::StatusExt::NOT_PARTITIONED_TABLE, ex.what()); + } catch (database::mysql::ER_DUP_KEYNAME_ const& ex) { + _reportFailure(lock, protocol::StatusExt::DUPLICATE_KEY, ex.what()); + } catch (database::mysql::ER_CANT_DROP_FIELD_OR_KEY_ const& ex) { + _reportFailure(lock, protocol::StatusExt::CANT_DROP_KEY, ex.what()); + } catch (database::mysql::Error const& ex) { + _reportFailure(lock, protocol::StatusExt::MYSQL_ERROR, ex.what()); + } catch (invalid_argument const& ex) { + _reportFailure(lock, protocol::StatusExt::INVALID_PARAM, ex.what()); + } catch (out_of_range const& ex) { + _reportFailure(lock, protocol::StatusExt::LARGE_RESULT, ex.what()); + } catch (exception const& ex) { + 
_reportFailure(lock, protocol::StatusExt::OTHER_EXCEPTION, ex.what()); + } + return true; +} + +Connection::Ptr WorkerSqlHttpRequest::_connector() const { + // A choice of credential for connecting to the database service depends + // on a type of the request. For the sake of greater security, arbitrary + // queries require a client to explicitly provide the credentials. + // Otherwise, using credentials from the worker's configuration. + bool const clientCredentials = _sqlRequestType == protocol::SqlRequestType::QUERY; + auto connectionParams = Configuration::qservWorkerDbParams(); + if (clientCredentials) { + connectionParams.user = _user; + connectionParams.password = _password; + } + return Connection::open(connectionParams); +} + +vector WorkerSqlHttpRequest::_queries(Connection::Ptr const& conn) const { + QueryGenerator const g(conn); + vector queries; + switch (_sqlRequestType) { + case protocol::SqlRequestType::QUERY: + queries.emplace_back(Query(_query)); + break; + case protocol::SqlRequestType::CREATE_DATABASE: { + bool const ifNotExists = true; + string const query = g.createDb(_databaseInfo.name, ifNotExists); + queries.emplace_back(Query(query)); + break; + } + case protocol::SqlRequestType::DROP_DATABASE: { + bool const ifExists = true; + string const query = g.dropDb(_databaseInfo.name, ifExists); + queries.emplace_back(Query(query)); + break; + } + case protocol::SqlRequestType::ENABLE_DATABASE: { + // Using REPLACE instead of INSERT to avoid hitting the DUPLICATE KEY error + // if such entry already exists in the table. 
+ string const query = g.replace("qservw_worker", "Dbs", _databaseInfo.name); + queries.emplace_back(Query(query)); + break; + } + case protocol::SqlRequestType::DISABLE_DATABASE: { + string const where = g.where(g.eq("db", _databaseInfo.name)); + queries.emplace_back(Query(g.delete_(g.id("qservw_worker", "Chunks")) + where)); + queries.emplace_back(Query(g.delete_(g.id("qservw_worker", "Dbs")) + where)); + break; + } + case protocol::SqlRequestType::GRANT_ACCESS: { + string const query = g.grant("ALL", _databaseInfo.name, _user, "localhost"); + queries.emplace_back(Query(query)); + break; + } + default: + // The remaining types of requests require the name of a table + // affected by the operation. + queries.emplace_back(_generateQuery(conn, _table)); + break; + } + return queries; +} + +Query WorkerSqlHttpRequest::_generateQuery(Connection::Ptr const& conn, string const& table) const { + QueryGenerator const g(conn); + SqlId const databaseTable = g.id(_databaseInfo.name, table); + switch (_sqlRequestType) { + case protocol::SqlRequestType::CREATE_TABLE: { + list const keys; + bool const ifNotExists = true; + string query = g.createTable(databaseTable, ifNotExists, _columns, keys, _engine); + + // If MySQL partitioning was requested for the table then configure partitioning + // parameters and add the initial partition corresponding to the default + // transaction identifier. The table will be partitioned based on values of + // the transaction identifiers in the specified column.
+ string const partitionByColumn = _partitionByColumn; + if (!partitionByColumn.empty()) { + TransactionId const defaultTransactionId = 0; + query += g.partitionByList(partitionByColumn) + g.partition(defaultTransactionId); + } + return Query(query, databaseTable.str); + } + case protocol::SqlRequestType::DROP_TABLE: { + bool const ifExists = true; + string const query = g.dropTable(databaseTable, ifExists); + return Query(query, databaseTable.str); + } + case protocol::SqlRequestType::DROP_TABLE_PARTITION: { + bool const ifExists = true; + string const query = g.alterTable(databaseTable) + g.dropPartition(_transactionId, ifExists); + return Query(query, databaseTable.str); + } + case protocol::SqlRequestType::REMOVE_TABLE_PARTITIONING: { + string const query = g.alterTable(databaseTable) + g.removePartitioning(); + return Query(query, databaseTable.str); + } + case protocol::SqlRequestType::CREATE_TABLE_INDEX: { + bool const ifNotExists = true; + string const query = g.createIndex(databaseTable, _index.name, _index.spec, _index.keys, + ifNotExists, _index.comment); + return Query(query, databaseTable.str); + } + case protocol::SqlRequestType::DROP_TABLE_INDEX: { + bool const ifExists = true; + string const query = g.dropIndex(databaseTable, _indexName, ifExists); + return Query(query, databaseTable.str); + } + case protocol::SqlRequestType::GET_TABLE_INDEX: { + return Query(g.showIndexes(databaseTable)); + } + case protocol::SqlRequestType::ALTER_TABLE: { + string const query = g.alterTable(databaseTable, _alterTableSpec); + return Query(query, databaseTable.str); + } + case protocol::SqlRequestType::TABLE_ROW_STATS: { + // The transaction identifier column is not required to be present in + // the legacy catalogs (ingested w/o super-transactions), or in (the narrow) tables + // in which the column was removed to save disk space. The query generator + // implemented below accounts for this scenario by consulting MySQL's + // information schema. 
If the column isn't present then the default transaction + // identifier 0 will be injected into the result set. + string query = g.select(Sql::COUNT_STAR) + + g.from(DoNotProcess(g.id("information_schema", "COLUMNS"))) + + g.where(g.eq("TABLE_SCHEMA", _databaseInfo.name), g.eq("TABLE_NAME", table), + g.eq("COLUMN_NAME", "qserv_trans_id")); + int count = 0; + selectSingleValue(conn, query, count); + if (count == 0) { + string const query = + g.select(g.as(g.val(0), "qserv_trans_id"), g.as(Sql::COUNT_STAR, "num_rows")) + + g.from(DoNotProcess(databaseTable)); + return Query(query); + } + query = g.select("qserv_trans_id", g.as(Sql::COUNT_STAR, "num_rows")) + + g.from(DoNotProcess(databaseTable)) + g.groupBy("qserv_trans_id"); + return Query(query); + } + default: + throw invalid_argument( + CONTEXT + " not the table-scope request type: " + protocol::toString(_sqlRequestType)); + } +} + +void WorkerSqlHttpRequest::_extractResultSet(replica::Lock const& lock, Connection::Ptr const& conn) { + LOGS(_log, LOG_LVL_DEBUG, CONTEXT); + + json& resultSet = _currentResultSet(lock); + + // Explicitly set the extended status of the current result set to NONE + // (result sets are created as empty JSON objects). + resultSet["status_ext"] = protocol::StatusExt::NONE; + resultSet["status_ext_str"] = protocol::toString(protocol::StatusExt::NONE); + + // Now carry over the actual result set (if any) + resultSet["char_set_name"] = conn->charSetName(); + resultSet["has_result"] = conn->hasResult() ?
1 : 0; + if (conn->hasResult()) { + resultSet["fields"] = conn->fieldsToJson(); + resultSet["rows"] = json::array(); + json& rowsJson = resultSet["rows"]; + size_t numRowsProcessed = 0; + Row row; + while (conn->next(row)) { + if (_maxRows != 0) { + if (numRowsProcessed >= _maxRows) { + throw out_of_range(CONTEXT + " max_rows=" + to_string(_maxRows) + " limit exceeded"); + } + ++numRowsProcessed; + } + rowsJson.push_back(row.toJson()); + } + } +} + +void WorkerSqlHttpRequest::_reportFailure(replica::Lock const& lock, protocol::StatusExt statusExt, + string const& error) { + LOGS(_log, LOG_LVL_ERROR, CONTEXT << " exception: " << error); + + // Note that the actual reason for a query to fail is recorded in its + // result set, while the final state of the whole request may vary + // depending on a kind of the request - if it's a simple or the "batch" + // request. + json& resultSet = _currentResultSet(lock); + resultSet["status_ext"] = statusExt; + resultSet["status_ext_str"] = protocol::toString(statusExt); + resultSet["error"] = error; + setStatus(lock, protocol::Status::FAILED, _batchMode ? statusExt : protocol::StatusExt::MULTIPLE); +} + +json& WorkerSqlHttpRequest::_currentResultSet(replica::Lock const& lock, bool create) { + if (create) _resultSets.push_back(json::object()); + if (_resultSets.size() != 0) return _resultSets.back(); + throw logic_error(CONTEXT + " the operation is not allowed in this state"); +} + +} // namespace lsst::qserv::replica diff --git a/src/replica/worker/WorkerSqlHttpRequest.h b/src/replica/worker/WorkerSqlHttpRequest.h new file mode 100644 index 0000000000..3db660d016 --- /dev/null +++ b/src/replica/worker/WorkerSqlHttpRequest.h @@ -0,0 +1,183 @@ +/* + * LSST Data Management System + * + * This product includes software developed by the + * LSST Project (http://www.lsst.org/). 
+ * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the LSST License Statement and + * the GNU General Public License along with this program. If not, + * see . + */ +#ifndef LSST_QSERV_REPLICA_WORKERSQLHTTPREQUEST_H +#define LSST_QSERV_REPLICA_WORKERSQLHTTPREQUEST_H + +// System headers +#include +#include +#include + +// Third party headers +#include "nlohmann/json.hpp" + +// Qserv headers +#include "replica/config/ConfigDatabase.h" +#include "replica/mysql/DatabaseMySQL.h" +#include "replica/proto/Protocol.h" +#include "replica/util/Common.h" +#include "replica/worker/WorkerHttpRequest.h" + +// Forward declarations +namespace lsst::qserv::replica { +class ServiceProvider; +} // namespace lsst::qserv::replica + +namespace lsst::qserv::replica::database::mysql { +class Connection; +} // namespace lsst::qserv::replica::database::mysql + +// This header declarations +namespace lsst::qserv::replica { + +/** + * Class WorkerSqlHttpRequest executes queries against the worker database + * and returns result sets (if any) back to a caller. + * + * @note Queries passed into this operation are supposed to be well formed. + * If a MySQL error would occur during an attempt to execute an incorrectly + * formed query then the corresponding MySQL error will be recorded + * and reported to a caller in the result sets which are returned + * by method WorkerSqlHttpRequest::getResult().
+ */ +class WorkerSqlHttpRequest : public WorkerHttpRequest { +public: + /** + * Static factory method is needed to prevent issue with the lifespan + * and memory management of instances created otherwise (as values or via + * low-level pointers). + * + * @param serviceProvider provider is needed to access the Configuration + * of a setup and for validating the input parameters + * @param worker the name of a worker. The name must match the worker which + * is going to execute the request. + * @param hdr request header (common parameters of the queued request) + * @param req the request object received from a client (request-specific parameters) + * @param onExpired request expiration callback function + * @return pointer to the created object + */ + static std::shared_ptr create( + std::shared_ptr const& serviceProvider, std::string const& worker, + protocol::QueuedRequestHdr const& hdr, nlohmann::json const& req, + ExpirationCallbackType const& onExpired); + + WorkerSqlHttpRequest() = delete; + WorkerSqlHttpRequest(WorkerSqlHttpRequest const&) = delete; + WorkerSqlHttpRequest& operator=(WorkerSqlHttpRequest const&) = delete; + + ~WorkerSqlHttpRequest() override = default; + + bool execute() override; + +protected: + void getResult(nlohmann::json& result) const override; + +private: + WorkerSqlHttpRequest(std::shared_ptr const& serviceProvider, std::string const& worker, + protocol::QueuedRequestHdr const& hdr, nlohmann::json const& req, + ExpirationCallbackType const& onExpired); + + /// @return A connector as per the input request + std::shared_ptr _connector() const; + + /** + * The query generator for simple requests uses parameters of a request + * to compose a collection of desired queries. + * @note this method is capable of generating a single or multiple queries + * as needed by the corresponding non-batch requests.
+ * @param conn A reference to the database connector is needed to process + * arguments to meet requirements of the database query processing engine. + * @return A collection of queries to be executed as per the input request. + * @throw std::invalid_argument For unsupported request types. + */ + std::vector _queries(std::shared_ptr const& conn) const; + + /** + * The query generator for queries which have a target table. + * @param conn A reference to the database connector is needed to process + * arguments to meet requirements of the database query processing engine. + * @param table The name of table affected by the query. + * @return A query as per the input request and the name of a table. + * @throw std::invalid_argument For unsupported request types. + */ + Query _generateQuery(std::shared_ptr const& conn, + std::string const& table) const; + + /** + * Extract a result set (if any) via the database connector into + * the current JSON result set. + * @param lock The lock must be held before calling the method since it's + * going to access a protected state of the object. + * @param conn a valid database connector for extracting a result set + * @throw std::out_of_range If the number of rows exceeds the non-zero + * 'max_rows' limit of the request. + */ + void _extractResultSet(replica::Lock const& lock, + std::shared_ptr const& conn); + + /** + * Report & record a failure + * + * @param lock The lock must be held before calling the method since it's + * going to modify a protected state of the object. + * @param statusExt An extended status to be reported to Controllers and + * set in the current (most recently processed query if any) result set. + * @param error A message to be logged and returned to Controllers. + * @throw std::logic_error Is thrown when the method is called before + * creating a result set.
+ */ + void _reportFailure(replica::Lock const& lock, protocol::StatusExt statusExt, std::string const& error); + + /// @param lock The lock must be held before calling the method since it's + /// going to modify a protected state of the object. + /// @param create A flag to indicate if a new result set should be created + /// @return A mutable reference to the current result set + /// @throw std::logic_error If no result set exists + nlohmann::json& _currentResultSet(replica::Lock const& lock, bool create = false); + + // Input parameters (mandatory) + + protocol::SqlRequestType const _sqlRequestType; ///< The type of the SQL request + std::string const _user; ///< The name of the MySQL user (queries or grants) + std::string const _password; ///< The MySQL password for the user account (queries only) + DatabaseInfo const _databaseInfo; ///< Database descriptor obtained from the Configuration + std::size_t const _maxRows; ///< The maximum number of rows to be returned in a result set + + // Input parameters (if the batch mode is requested) + // NOTE(review): the implementation file doesn't seem to assign _tables anywhere - confirm + // where the list of tables is expected to be filled. + bool const _batchMode; ///< A flag to indicate if the request is targeting many tables + std::vector _tables; ///< A list of tables to be affected by the request + + // Input parameters (request-specific, see the constructor for further details) + + std::string _query; ///< The query to be executed + std::string _table; ///< The name of the table to be affected by the request + std::list _columns; ///< The list of columns for a table to be created + std::string _partitionByColumn; ///< The name of the column to be used for partitioning + SqlIndexDef _index; ///< The index definition + std::string _engine; ///< The name of the table engine to be used + TransactionId _transactionId; ///< The transaction identifier + std::string _indexName; ///< The name of the index to be dropped + std::string _alterTableSpec; ///< The specification for the ALTER TABLE request + + /// Cached result to be sent to a client upon a request + nlohmann::json _resultSets; +}; + +} // namespace lsst::qserv::replica
+ +#endif // LSST_QSERV_REPLICA_WORKERSQLHTTPREQUEST_H From 58b46830cd8d535d45c0f5a49a407e3c2da26adc Mon Sep 17 00:00:00 2001 From: Igor Gaponenko Date: Wed, 18 Dec 2024 11:31:48 -0800 Subject: [PATCH 9/9] Documentation on the http-based Worker Replication service --- doc/dev/api/TODO.rst | 34 + doc/dev/api/index.rst | 17 + doc/dev/api/introduction.rst | 22 + doc/dev/api/repl-worker.rst | 828 ++++++++++++++++++++++ doc/dev/index.rst | 1 + doc/ingest/api/reference/rest/general.rst | 2 + 6 files changed, 904 insertions(+) create mode 100644 doc/dev/api/TODO.rst create mode 100644 doc/dev/api/index.rst create mode 100644 doc/dev/api/introduction.rst create mode 100644 doc/dev/api/repl-worker.rst diff --git a/doc/dev/api/TODO.rst b/doc/dev/api/TODO.rst new file mode 100644 index 0000000000..964421435d --- /dev/null +++ b/doc/dev/api/TODO.rst @@ -0,0 +1,34 @@ + +TODO +---- + + +Finish in a scope of the current ticket DM-42005 before the X-Mas break: + +- [**x**] Think about the locking mechanism of the method WorkerHttpRequest::toJson(). The method + acquires a lock on the mutex while the request may also hold a lock on the same mutex + while processing the request in WorkerHttpRequest::execute(). This may result in a deadlock. + Perhaps no locking is needed at all since the resulting data are not lock sensitive? +- [**x**] Finish implementing a hierarchy of the HTTP-based worker requests +- [**x**] Finish implementing the request processor for these requests +- [**x**] Add the new service to the Configuration and Registry to allow the Controller to send requests + to the worker via HTTP +- [**x**] Display connection parameters of the new service on the Web Dashboard +- [ ] Document the REST services in the documentation tree. +- [ ] Manually test the new implementation externally using ``curl`` or Python's ``requests`` module. + Think about the test cases to cover the new implementation. +- [ ] Extend the integration tests to cover the new implementation.
+ +Finish in a scope of a separate ticket during/after the X-Mas break: + +- [ ] Implement the MessengerHttp on the Controller side of the protocol. The class will + be providing the multiplexing API for the Controller to send requests to the worker. + The initial implementation will be based on the simple http::AsyncReq. +- [ ] Create a parallel hierarchy of the HTTP-based request & job classes on the Controller + side of the protocol. +- [ ] Test the new classes. +- [ ] Implement the MessengerHttp to reuse the socket connections for sending multiple requests + to the same worker. +- [ ] Test the new implementation to ensure it works the same way as the old one. +- [ ] Remove the old implementation of the Controller - Worker protocol. + diff --git a/doc/dev/api/index.rst b/doc/dev/api/index.rst new file mode 100644 index 0000000000..49c4f4fda9 --- /dev/null +++ b/doc/dev/api/index.rst @@ -0,0 +1,17 @@ +.. note:: + + Information in this guide corresponds to the version **40** of the Qserv REST API. Keep in mind + that each implementation of the API has a specific version. The version number will change + if any changes to the implementation or the API that might affect users will be made. + The current document will be kept updated to reflect the latest version of the API. + +############################## +The internal REST API of Qserv +############################## + +.. toctree:: + :maxdepth: 4 + + introduction + repl-worker + TODO diff --git a/doc/dev/api/introduction.rst b/doc/dev/api/introduction.rst new file mode 100644 index 0000000000..85a6f15f54 --- /dev/null +++ b/doc/dev/api/introduction.rst @@ -0,0 +1,22 @@ +.. _qserv-api-introduction: + +Introduction +============ + +The Qserv REST API is a collection of RESTful web services that provide access to various components of the Qserv system. +The API enforces a specific interaction model between the client and the server. 
The following highlights are worth mentioning: + +- All ``POST``, ``PUT`` and ``DELETE`` requests must be accompanied by a JSON payload. +- Responses of all but a few select services are in JSON format. Exceptions are documented in the API documentation. +- Schemas of the JSON requests and payloads are defined in the API documentation. +- The API is versioned. The version number is included in the URL query of the ``GET`` requests, and it's + included in the JSON payload of the ``POST``, ``PUT`` and ``DELETE`` requests. +- Critical API services are protected by an authentication mechanism. The client must provide a valid + authentication key in the JSON payload of the ``POST``, ``PUT`` and ``DELETE`` requests. + No authentication is required for the ``GET`` requests. + +The general information on the structure of the API can be found in the following document: + +- :ref:`ingest-general` + +The rest of the current document provides detailed information on the individual services that are available in the Qserv API. diff --git a/doc/dev/api/repl-worker.rst b/doc/dev/api/repl-worker.rst new file mode 100644 index 0000000000..a8c7bf0688 --- /dev/null +++ b/doc/dev/api/repl-worker.rst @@ -0,0 +1,828 @@ +.. _qserv-api-repl-worker: + +Replication Worker Services +=========================== + +Scope +----- + +This document describes the Replication worker services in Qserv. The worker services are responsible for +processing requests that are submitted by the Replication Controller. The protocol is based on HTTP/JSON. + + +Categories +---------- + +There are two general categories of requests depending on the request processing mechanism. The first group includes +requests that are processed *asynchronously*. Once a request of this type is validated and accepted by the service, +it's put into a priority queue of the Replication worker in an order which depends on the *priority* level specified +in the request body. These requests also have an expiration timeout.
If the request is not processed within the timeout, +it will be automatically cancelled by the server. Another important attribute of the queued requests is their unique identifier. +The identifier is generated by the Controller and it's used to track the request status and to retrieve the results of +the request once it's finished. The following request types belong to this category: + +- :ref:`qserv-api-repl-worker-tests` (echo, etc.) +- :ref:`qserv-api-repl-worker-replica-management` (find, create, delete replicas) +- :ref:`qserv-api-repl-worker-sql` (SQL operations) + +The second group includes requests processed immediately (*synchronously*) by the services: + +- :ref:`qserv-api-repl-worker-request-management` (inspect and manage queued requests) +- :ref:`qserv-api-repl-worker-service-management` (inspect and manage the worker server) + +Request parameters +------------------ + +Even though the parameters of the requests are specific to the request type, there are many attributes that are common +to all requests. These attributes are sent with all request types as explained in the subsections below. + +Attributes sent with all request types +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The following attributes are passed with any request type, regardless of the HTTP method: + +``instance_id`` : *string* + The mandatory identifier of a Qserv instance served by the Replication System. +``version`` : *number* = ``0`` + The optional version of the Qserv REST API. See :ref:`ingest-general-versioning` for more information on the API versioning. + +For the ``POST``, ``PUT`` and ``DELETE`` request types, values of the attributes are sent in the request body: + +.. code-block:: + + { + "instance_id" : , + "version" : + } + +For ``GET`` request types, parameters are specified in the URL query: + +.. 
code-block:: + + ?instance_id=&version= + +Authorization of the ``POST``, ``PUT`` and ``DELETE`` requests +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +All requests that may modify the persistent state of Qserv or affect the state of the worker service must use the key-based +authentication mechanism: + +``auth_key`` : *string* + The required authentication key to access the service. The key must match the key set in the configuration of + the target worker service. + +This attribute's value is sent in the request body: + +.. code-block:: + + { + "auth_key" : + } + +.. _qserv-api-repl-worker-queued-request-params: + +JSON body of the queued requests +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +All requests that are queued by the worker service before processed by the service are sent with the ``POST`` method. +The request body of each such request is a JSON object with the following **required** attributes: + +.. code-block:: + + { + "id" : , + "timeout" : , + "priority" : , + "req" : { + ... + } + } + +Where: + +``id`` : *string* + The unique ID of the request (generated by the Controller). +``timeout`` : *number* + The optional request expiration timeout. The timeout is meant for automatic cancelling/disposing requests + regardless of their statuses. The timeout is expressed in seconds since the *UNIX Epoch*. For requests where + the timeout is not set or where its value was set to ``0`` the worker-specific default value will be used. +``priority`` : *number* + The priority level of the request. Requests with higher priority levels are processed before the requests with + lower priority levels. +``req`` : *object* + The object containing the request-specific attributes. The schema of this object depends on the type of the request. + The payload of the object is documented in the relevant sections below. 
+ + +Response objects +---------------- + +Responses returned by all worker services have the following attributes which represent the common completion status +of the request: + +.. code-block:: + + { + "success" : , + "error" : , + "error_ext" : , + "warning" : , + + "status" : , + "status_str" : , + "status_ext" : , + "status_ext_str" : + } + +The first group represents the common attributes which are reported by all REST services in Qserv: + +``success`` : *number* + The completion status of the request. The value of ``1`` means that the request was successfully accepted + by the service after evaluating the input parameters and a context of the request. The value of ``0`` indicates + any problems with the operation. The error message will be provided in the ``error`` attribute. +``error`` : *string* + The error message in case of the failed request. +``error_ext`` : *object* + The extended error message in case of the failed request. +``warning`` : *string* + The optional warning message that may be posted in case of the successful request. + + +The second group of attributes represents the completion status codes which are specific to the worker services. These attributes +should be used only when ``success=1``: + +``status`` : *int* + The completion status of the operation. Values are defined in the C++ ``enum`` type ``protocol::Status``. +``status_str`` : *string* + The human readable representation of the above-defined completion status of the operation. +``status_ext`` : *int* + Extended status of this operation. Values are defined in the C++ ``enum`` type ``protocol::StatusExt::Status``. +``status_ext_str`` : *string* + The human readable representation of the above-defined extended status of the operation. + +Additional attributes returned by specific request types are documented in the relevant sections below. + +.. _qserv-api-repl-worker-response-queued: + +Responses of the queued requests +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. 
note:: + + Responses of :ref:`qserv-api-repl-worker-request-management` requests adhere to the same schema as the queued requests. + +All responses have the following schema: + +.. code-block:: + + { + "id" : , + "priority" : , + "timeout" : , + "req" : , + + "type" : , + "expiration_timeout_sec" : , + + "performance" : { + "receive_time" : , + "start_time" : , + "finish_time" : + }, + "result" : + } + + +The first group represents parameters of the original request (or a target request in case the request management services +were called on the previously submitted queued requests): + +``id`` : *string* + The unique ID of the request (generated by the Controller). +``priority`` : *int* + The priority level of the request. +``timeout`` : *int* + The request expiration timeout that was passed in the original request (applies to the queued requests only). + The timeout is meant for automatic cancelling/disposing requests regardless of their statuses. The timeout is expressed + in seconds since the UNIX Epoch. For requests where the timeout is not set or where its value was set to ``0`` + the worker-specific default value will be used. The adjusted (effective) value of the timeout is reported in + the ``expiration_timeout_sec`` attribute. +``req`` : *object* + The original request object as it was received by the worker. + +The following attributes are assigned to a request by the worker server based on the nature of the request and its processing +context: + +``type`` : *string* + The type of the request. +``expiration_timeout_sec`` : *int* + The effective expiration timeout of the request in seconds. + +The actual processing status of the request is reported in the following attributes: + +``performance`` : *object* + The current performance metrics of the request. Values of these parameters change during request processing + before the request is finished. 
There are three attributes in this object: + + - ``receive_time`` : *uint64_t* + When the request was received by a worker service (milliseconds since UNIX Epoch). A non-zero value + is guaranteed for all requests that were received by the worker service. + + - ``start_time`` : *uint64_t* + When the request was started by a worker service (milliseconds since UNIX Epoch). A value of ``0`` + means that the request is still in the processing queue. + + - ``finish_time`` : *uint64_t* + When the request was finished by a worker service (milliseconds since UNIX Epoch). A value of ``0`` + means that the request is still in the processing queue or it's still being processed. + +``result`` : *object* + The result of the request. The schema of this object depends on the type of the request. Also note that + the payload of the ``result`` object depends on the state of the request: + + - The object is empty for all newly submitted requests that ended up in the processing queue and for requests which + are still being processed. + - The object is filled with the relevant data for the requests that are finished or failed. + + + +.. _qserv-api-repl-worker-response-service: + + +Responses of the service management requests +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +TBC... + + + + +Request types +------------- + +.. _qserv-api-repl-worker-tests: + +Tests +^^^^^ + +.. note:: + + This group of requests belongs to the *queued* category. Requests of this type are processed by the worker service + asynchronously by a dedicated pool of the worker threads. Parameters of this request type are sent in the request + body as a JSON object. A schema of the request object is documented in :ref:`qserv-api-repl-worker-queued-request-params`. + A schema of the response object is documented in: :ref:`qserv-api-repl-worker-response-queued`. 
+ +Echo +~~~~ + +The Controller sends a POST request to the Replication worker to test the functionality of the worker processor +and to simulate the request submission/processing path. The ``echo`` request has no persistent side effects, such +as changes to the worker databases. Parameters of the request will be evaluated by the service. If all looks +okay then the request will be queued for processing. Otherwise, the service will return an error. + +.. code-block:: + + POST /worker/echo + +These are the request-specific attributes: + +.. code-block:: + + "req" : { + "delay" : , + "data" : + } + +Where: + +``delay`` : *int* + The delay in milliseconds before the response is sent back. +``data`` : *string* + The data to be echoed back. + +The schema of the ``result`` object in the responses of the successfully completed requests is presented below: + +.. code-block:: + + "result" : { + "data" : + } + +Example of the complete request object for the ``echo`` request that is meant to finish in 10000 milliseconds: + +.. code-block:: + + { + "instance_id": "qserv_proj", + "version": 40, + "auth_key": "replauthkey", + + "id": "1234567", + "req": { + "data": "abcdefg", + "delay": 10000 + } + } + +The request could be sent as: + +.. code-block:: + + curl 'http://127.0.0.1:25005/worker/echo' \ + -X POST \ + -H 'Content-Type: application/json' \ + -d'{"instance_id": "qserv_proj", "version": 40, ...}' + +The response object will be: + +.. 
code-block:: + + { + "success": 1, + "error": "", + "error_ext": {}, + "warning": "", + + "status": 0, + "status_ext": 0, + "status_ext_str": "NONE", + "status_str": "CREATED", + + "id": "1234567", + "timeout": 0, + + "req": { + "data": "abcdefg", + "delay": 10000 + }, + + "type": "TEST_ECHO", + "expiration_timeout_sec": 28800, + "performance": { + "receive_time": 1739331676130, + "start_time": 0, + "finish_time": 0 + }, + "priority": 0, + "result": {} + } + +Note that the result object is still empty at this point because the request is still being processed. +Results of the completed requests can be obtained by tracking the request with the ID ``1234567`` as +explained in: + +- **TODO**: link to the request tracking service + +.. _qserv-api-repl-worker-replica-management: + +Replica management +^^^^^^^^^^^^^^^^^^ + + + +.. note:: + + This group of requests belongs to the *queued* category. Requests of this type are processed by the worker service + asynchronously by a dedicated pool of the worker threads. Parameters of this request type are sent in the request + body as a JSON object. A schema of the request object is documented in :ref:`qserv-api-repl-worker-queued-request-params`. + A schema of the response object is documented in: :ref:`qserv-api-repl-worker-response-queued`. + +TBC + +.. _qserv-api-repl-worker-sql: + +Database management +^^^^^^^^^^^^^^^^^^^ + +.. note:: + + This group of requests belongs to the *queued* category. Requests of this type are processed by the worker service + asynchronously by a dedicated pool of the worker threads. Parameters of this request type are sent in the request + body as a JSON object. A schema of the request object is documented in :ref:`qserv-api-repl-worker-queued-request-params`. + A schema of the response object is documented in: :ref:`qserv-api-repl-worker-response-queued`. + +TBC... + + +.. _qserv-api-repl-worker-request-management: + +Request management +^^^^^^^^^^^^^^^^^^ + +.. 
note:: + + This group of the **synchronous** requests is meant to monitor and manage the corresponding *queued* requests (the "target" requests). + Requests of this type are processed by the worker service instantaneously. Depending on the HTTP method, parameters of this request type + are sent either in the request body as a JSON object or in the query string of the request URL. + A schema of the response object is documented in: :ref:`qserv-api-repl-worker-response-queued`. + +TBC... + +.. _qserv-api-repl-worker-service-management: + +Worker service management +^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. note:: + + This group of the **synchronous** requests is meant to monitor and manage the worker server itself. + Requests of this type are processed by the worker service instantaneously. Depending on the HTTP method, parameters of this request type + are sent either in the request body as a JSON object or in the query string of the request URL. + A schema of the response object is documented in: :ref:`qserv-api-repl-worker-response-service`. + +TBC... + + + + + + + + + + + + + + + + +Replica management/information requests +--------------------------------------- + +All requests of this category are queued and processed by a dedicated pool of the worker threads. +Once the request is submitted and the worker service indicated that the request looked good, the state +of the request can be further managed via: + +- TODO: link to the request tracking service +- TODO: link to the replica status service +- TODO: link to the replica cancel service + +The schema of the response object for a successfully completed request varies depending on the type of the request. + +Schemas for the single replica requests +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The object has the following attributes: + +.. 
code-block:: + + { + "result" : { + "replica_info" : { + // enum ReplicaStatus { + // NOT_FOUND = 0; + // CORRUPT = 1; + // INCOMPLETE = 2; + // COMPLETE = 3; + // } + "status" : , // The status of the replica. Values correspond to enums in class "ReplicaStatus" + "worker" : , // The worker ID + "database" : , + "chunk" : , + + // A collection of files + // + "file_info_many" : [ + + { + "name" : , // The name of a file + "size" : , // Size in bytes + "cs" : , // Control sum (if available) + "mtime" : , // The file content modification time in seconds (since UNIX Epoch) + + // The following parameters are set in the relevant contexts only. + // Otherwise they'll be set to some default value. + + "begin_transfer_time" : , // When the file migration started (where applies) [=0] + "end_transfer_time" : , // When the file migration finished (where applies) [=0] + "in_size" : // The size of an input file (where applies) [=0] + }, + ], + "verify_time" : // When the replica status was verified by a worker + } + } + } + +Schemas for the multi-replica requests +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. note:: + + Presently, the only multi-replica request in this category is the ``find-all`` request. + +The response object has the following attributes: + +.. code-block:: + + { + "result" : { + "replica_info_many" : [ + ... + ] + } + } + +Where each array entry is an object that has a single replica schema (``replica_info``) as described above for +the single-replica requests. + + +Create a new chunk replica +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The Controller sends a POST request to the Replication worker to initiate the replica creation +operation on the target worker. Parameters of the request will be evaluated by the service. If all looks +okay then the request will be queued for processing. Otherwise, the service will return an error. + +.. code-block:: + + POST /worker/replica/create + +The request-specific attributes: + +.. 
code-block:: + + { + "req" : { + "database" : , + "chunk" : , + "src_worker" : , // The source worker ID from where to pull the replica + "src_worker_host" : , // The source worker host (DNS or IP) + "src_worker_port" : // The source worker port + } + } + +Delete an existing chunk replica +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The Controller sends a POST request to the Replication worker to initiate the replica deletion +operation on the target worker. Parameters of the request will be evaluated by the service. If all looks +okay then the request will be queued for processing. Otherwise, the service will return an error. + +.. code-block:: + + POST /worker/replica/delete + +The request-specific attributes: + +.. code-block:: + + { + "req" : { + "database" : , + "chunk" : + } + } + +Find info on an existing chunk replica +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The Controller sends a POST request to the Replication worker to locate and report a status of a single chunk replica: + +.. code-block:: + + POST /worker/replica/find + +The request-specific attributes: + +.. code-block:: + + { + "req" : { + "database" : , + "chunk" : , + "compute_cs" : // Compute the control sum of the replica files if not 0 + } + } + +Find info on all existing chunk replicas of a database +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The Controller sends a POST request to the Replication worker to locate and report a status of all chunk replicas +in a given database: + +.. code-block:: + + POST /worker/replica/find-all + +The request-specific attributes: + +.. code-block:: + + { + "req" : { + "database" : + } + } + +Database management (SQL) Requests +---------------------------------- + + +Management requests +------------------- + +Tracking requests +^^^^^^^^^^^^^^^^^ + +The Controller sends a GET request to the Replication worker to track the status of the previously made +request and to retrieve results of the request if it's finished. 
The request URL should contain the unique +identifier ``id`` of the target request: + +.. code-block:: + + GET /worker/request/track/:id + +In case of the successful request completion, the response object will not be empty and it will contain +the results of the request: + +.. code-block:: + + { + "result" : { + ... + } + } + +Retrieving request status +^^^^^^^^^^^^^^^^^^^^^^^^^ + +The Controller sends a GET request to the Replication worker to get the status of the previously made +request. The request URL should contain the unique +identifier ``id`` of the target request: + +.. code-block:: + + GET /worker/request/status/:id + +Note that unlike the ``track`` request, the ``status`` request does not return the results of the request. +The result object will be present but it will be empty: + +.. code-block:: + + { + "result" : {} + } + +Stopping/cancelling requests +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The Controller sends a PUT request to the Replication worker to stop the previously made request: + +.. code-block:: + + PUT /worker/request/stop/:id + +There are no request-specific attributes in the request object. + +Note that unlike the ``track`` request, the ``stop`` request does not return the results of the request. +The result object will be present but it will be empty: + +.. code-block:: + + { + "result" : {} + } + +Disposing completed requests +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +There is a special request that's meant to be used by the Controller to dispose of the completed +requests from the worker's internal storage. The request is sent as a POST request: + +.. code-block:: + + POST /worker/request/dispose + +Where the request object is required to provide a collection (array) of the request IDs to be disposed: + +.. code-block:: + + { + "req" : { + "ids" : [ + , + ... + + ] + } + } + +The response object will have the completion status of the operation for each identifier mentioned in the request: + +.. 
code-block:: + + { + "result" : { + "ids_disposed" : { + : , + ... + : + } + } + } + +Where the value of the integer is the completion status of the operation. The value of ``1`` means that the request +was disposed successfully. The value of ``0`` means that the request was not found in a collection of the completed +requests. + +Worker service management requests +---------------------------------- + +Requests in this category are meant to provide the Controller with the information on the worker service itself. +There are the following requests in this category: + +- TODO: link to: Get the worker status +- TODO: link to: Get info on requests at various stages of processing +- TODO: link to: Suspend the worker service +- TODO: link to: Resume the worker service +- TODO: link to: Drain requests at the worker service +- TODO: link to: Reconfigure the worker service + +The request-specific attributes are not required for these requests. + +Response objects of all service management requests have the following schema: + +.. code-block:: + + { + "status" : , // The completion status of the operation. Values correspond to protocol::Status + "status_ext" : , // Extended status of this operation. Values correspond to protocol::StatusExt [=NONE] + + "service_state" : , // The state of the worker service as defined in protocol::ServiceState + + "num_new_requests" : , + "num_in_progress_requests" : , + "num_finished_requests" : , + + "new_requests" : [ + ... + ], + "in_progress_requests" : [ + ... + ], + "finished_requests" : [ + ... + ] + } + +.. note:: + + The ``new_requests``, ``in_progress_requests``, and ``finished_requests`` are arrays of the request objects + that are in the corresponding state. 
These collections are non-empty only for the following request types: + + - Get info on requests at various stages of processing + - Drain requests at the worker service + + The schema of the request descriptors is the same as the schema of the corresponding original request objects. + +Get the worker status +^^^^^^^^^^^^^^^^^^^^^ + +The Controller sends a GET request to the Replication worker to get the status of the worker service: + +.. code-block:: + + GET /worker/service/status + +Get info on requests at various stages of processing +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The Controller sends a GET request to the Replication worker to get the information on the requests: + +.. code-block:: + + GET /worker/service/requests + +Suspend the worker service +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The Controller sends a PUT request to the Replication worker to suspend the worker service: + +.. code-block:: + + PUT /worker/service/suspend + +Resume the worker service +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Drain requests at the worker service +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The Controller sends a PUT request to the Replication worker to drain (stop) all requests in the worker service: + +.. code-block:: + + PUT /worker/service/drain + +The operation affects requests that are already in the processing queue or requests that are still +in the input queue waiting to be processed. The finished requests are not affected by this operation. + + +Reconfigure the worker service +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The Controller sends a PUT request to the Replication worker to reconfigure the worker service: + +.. 
code-block:: + + PUT /worker/service/reconfig diff --git a/doc/dev/index.rst b/doc/dev/index.rst index 0fc6b0cc50..6db85a1a0b 100644 --- a/doc/dev/index.rst +++ b/doc/dev/index.rst @@ -10,3 +10,4 @@ Developer's Guide quick-start-devel doc scisql + api/index diff --git a/doc/ingest/api/reference/rest/general.rst b/doc/ingest/api/reference/rest/general.rst index 85603dd920..193f869a0c 100644 --- a/doc/ingest/api/reference/rest/general.rst +++ b/doc/ingest/api/reference/rest/general.rst @@ -1,3 +1,5 @@ +.. _ingest-general: + General guidelines ==================