Skip to content

Commit 0b0fe1e

Browse files
committed
Added family map option to not use chunk size for distribution.
1 parent ccaebac commit 0b0fe1e

File tree

86 files changed

+1016
-1393
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

86 files changed

+1016
-1393
lines changed

admin/tools/docker/base/Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ RUN dnf install -y 'dnf-command(config-manager)' \
5252
protobuf-devel \
5353
python3.12 \
5454
python3.12-devel \
55+
jemalloc \
5556
tree \
5657
vim \
5758
zip \

src/cconfig/CMakeLists.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,13 @@ target_sources(cconfig PRIVATE
44
CzarConfig.cc
55
)
66

7-
target_include_directories(cconfig PRIVATE
8-
${XROOTD_INCLUDE_DIRS}
9-
)
107

118
target_link_libraries(cconfig PUBLIC
129
log
13-
XrdSsiLib
10+
)
11+
12+
install(
13+
TARGETS cconfig
1414
)
1515

1616
install(

src/cconfig/CzarConfig.cc

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
#include <stdexcept>
2929

3030
// Third party headers
31-
#include "XrdSsi/XrdSsiLogger.hh"
3231

3332
// LSST headers
3433
#include "lsst/log/Log.h"
@@ -43,20 +42,6 @@ namespace {
4342

4443
LOG_LOGGER _log = LOG_GET("lsst.qserv.cconfig.CzarConfig");
4544

46-
void QservLogger(struct timeval const& mtime, unsigned long tID, const char* msg, int mlen) {
47-
static log4cxx::spi::LocationInfo xrdLoc(
48-
"client", log4cxx::spi::LocationInfo::calcShortFileName("client"), "<xrdssi>", 0);
49-
static LOG_LOGGER myLog = LOG_GET("lsst.qserv.xrdssi.msgs");
50-
51-
if (myLog.isInfoEnabled()) {
52-
while (mlen && msg[mlen - 1] == '\n') --mlen; // strip all trailing newlines
53-
std::string theMsg(msg, mlen);
54-
lsst::log::Log::MDC("LWP", std::to_string(tID));
55-
myLog.logMsg(log4cxx::Level::getInfo(), xrdLoc, theMsg);
56-
}
57-
}
58-
59-
bool dummy = XrdSsiLogger::SetMCB(QservLogger, XrdSsiLogger::mcbClient);
6045
} // namespace
6146

6247
namespace lsst::qserv::cconfig {

src/ccontrol/CMakeLists.txt

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ add_dependencies(ccontrol proto)
33

44
target_include_directories(ccontrol PRIVATE
55
${ANTLR4_INCLUDE_DIR}
6-
${XROOTD_INCLUDE_DIRS}
76
)
87

98
target_sources(ccontrol PRIVATE
@@ -27,11 +26,15 @@ target_sources(ccontrol PRIVATE
2726
target_link_libraries(ccontrol PUBLIC
2827
boost_regex
2928
cconfig
29+
css
3030
log
3131
parser
32-
replica
32+
proto
3333
sphgeom
34-
XrdCl
34+
)
35+
36+
install(
37+
TARGETS ccontrol
3538
)
3639

3740
install(
@@ -65,3 +68,6 @@ ccontrol_tests(
6568
testCControl
6669
testUserQueryType
6770
)
71+
72+
# set_tests_properties(testCControl PROPERTIES WILL_FAIL 1)
73+

src/ccontrol/MergingHandler.cc

Lines changed: 26 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@
3434

3535
// Third-party headers
3636
#include "curl/curl.h"
37-
#include "XrdCl/XrdClFile.hh"
3837

3938
// LSST headers
4039
#include "lsst/log/Log.h"
@@ -127,7 +126,7 @@ std::tuple<bool, bool> readHttpFileAndMergeHttp(
127126
uint32_t msgSizeBytes = 0;
128127
bool success = true;
129128
bool mergeHappened = false;
130-
int headerCount = 0;
129+
uint64_t headerCount = 0;
131130
uint64_t totalBytesRead = 0;
132131
try {
133132
auto exec = uberJob->getExecutive();
@@ -211,11 +210,9 @@ std::tuple<bool, bool> readHttpFileAndMergeHttp(
211210
}
212211

213212
// Parse and evaluate the message.
214-
//&&&mergeHappened = messageIsReady(msgBuf.get(), msgSizeBytes, last);
215213
mergeHappened = true;
216214
bool messageReadyResult = messageIsReady(msgBuf.get(), msgSizeBytes, last);
217215
totalBytesRead += msgSizeBytes;
218-
//&&&if (!mergeHappened) {
219216
if (!messageReadyResult) {
220217
success = false;
221218
throw runtime_error("message processing failed at offset " +
@@ -265,10 +262,23 @@ std::tuple<bool, bool> readHttpFileAndMergeHttp(
265262

266263
namespace lsst::qserv::ccontrol {
267264

268-
MergingHandler::MergingHandler(std::shared_ptr<rproc::InfileMerger> merger, std::string const& tableName)
269-
: _infileMerger{merger}, _tableName{tableName} {}
265+
shared_ptr<http::ClientConnPool> MergingHandler::_httpConnPool;
266+
mutex MergingHandler::_httpConnPoolMutex;
270267

271-
MergingHandler::~MergingHandler() { LOGS(_log, LOG_LVL_TRACE, __func__ << " " << _tableName); }
268+
shared_ptr<http::ClientConnPool> const& MergingHandler::_getHttpConnPool() {
269+
lock_guard<mutex> const lock(_httpConnPoolMutex);
270+
if (nullptr == _httpConnPool) {
271+
_httpConnPool = make_shared<http::ClientConnPool>(
272+
cconfig::CzarConfig::instance()->getResultMaxHttpConnections());
273+
}
274+
return _httpConnPool;
275+
}
276+
277+
MergingHandler::MergingHandler(std::shared_ptr<rproc::InfileMerger> const& merger,
278+
std::shared_ptr<qdisp::Executive> const& exec)
279+
: _infileMerger(merger), _executive(exec) {}
280+
281+
MergingHandler::~MergingHandler() { LOGS(_log, LOG_LVL_TRACE, __func__); }
272282

273283

274284
bool MergingHandler::flush(proto::ResponseSummary const& resp) {
@@ -317,35 +327,14 @@ bool MergingHandler::flush(proto::ResponseSummary const& resp) {
317327
}
318328

319329
void MergingHandler::errorFlush(std::string const& msg, int code) {
320-
_setError(code, msg);
330+
_setError(code, msg, util::ErrorCode::RESULT_IMPORT);
321331
// Might want more info from result service.
322332
// Do something about the error. FIXME.
323333
LOGS(_log, LOG_LVL_ERROR, "Error receiving result.");
324334
}
325335

326-
// Note that generally we always have an _infileMerger object except during
327-
// a unit test. I suppose we could try to figure out how to create one.
328-
//
329-
void MergingHandler::prepScrubResults(int jobId, int attemptCount) {
330-
if (_infileMerger) _infileMerger->prepScrub(jobId, attemptCount);
331-
}
332-
333336
std::ostream& MergingHandler::print(std::ostream& os) const {
334-
return os << "MergingRequester(" << _tableName << ", flushed=" << (_flushed ? "true)" : "false)");
335-
}
336-
337-
bool MergingHandler::_mergeHttp(shared_ptr<qdisp::UberJob> const& uberJob,
338-
proto::ResponseData const& responseData) {
339-
if (_flushed) {
340-
throw util::Bug(ERR_LOC, "already flushed");
341-
}
342-
bool const success = _infileMerger->mergeHttp(uberJob, responseData);
343-
if (!success) {
344-
LOGS(_log, LOG_LVL_WARN, __func__ << " failed");
345-
util::Error const& err = _infileMerger->getError();
346-
_setError(ccontrol::MSG_RESULT_ERROR, err.getMsg());
347-
}
348-
return success;
337+
return os << "MergingRequester(flushed=" << (_flushed ? "true)" : "false)");
349338
}
350339

351340
bool MergingHandler::_mergeHttp(shared_ptr<qdisp::UberJob> const& uberJob,
@@ -357,15 +346,16 @@ bool MergingHandler::_mergeHttp(shared_ptr<qdisp::UberJob> const& uberJob,
357346
if (!success) {
358347
LOGS(_log, LOG_LVL_WARN, __func__ << " failed");
359348
util::Error const& err = _infileMerger->getError();
360-
_setError(ccontrol::MSG_RESULT_ERROR, err.getMsg());
349+
_setError(ccontrol::MSG_RESULT_ERROR, err.getMsg(), util::ErrorCode::RESULT_IMPORT);
361350
}
362351
return success;
363352
}
364353

365-
void MergingHandler::_setError(int code, std::string const& msg) {
354+
void MergingHandler::_setError(int code, std::string const& msg, int errorState) {
366355
LOGS(_log, LOG_LVL_DEBUG, "_setError: code: " << code << ", message: " << msg);
367-
std::lock_guard<std::mutex> lock(_errorMutex);
368-
_error = Error(code, msg);
356+
auto exec = _executive.lock();
357+
if (exec == nullptr) return;
358+
exec->addMultiError(code, msg, errorState);
369359
}
370360

371361
tuple<bool, bool> MergingHandler::flushHttp(string const& fileUrl, uint64_t expectedRows,
@@ -415,10 +405,9 @@ tuple<bool, bool> MergingHandler::flushHttp(string const& fileUrl, uint64_t expe
415405
return {success, shouldCancel};
416406
}
417407

418-
void MergingHandler::flushHttpError(int errorCode, std::string const& errorMsg, int status) {
408+
void MergingHandler::flushHttpError(int errorCode, std::string const& errorMsg, int errState) {
419409
if (!_errorSet.exchange(true)) {
420-
_error = util::Error(errorCode, errorMsg, util::ErrorCode::MYSQLEXEC);
421-
_setError(ccontrol::MSG_RESULT_ERROR, _error.getMsg());
410+
_setError(errorCode, errorMsg, errState);
422411
}
423412
}
424413

src/ccontrol/MergingHandler.h

Lines changed: 6 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ class ResponseSummary;
3939
} // namespace lsst::qserv::proto
4040

4141
namespace lsst::qserv::qdisp {
42+
class Executive;
4243
class JobQuery;
4344
class UberJob;
4445
} // namespace lsst::qserv::qdisp
@@ -64,20 +65,8 @@ class MergingHandler : public qdisp::ResponseHandler {
6465
virtual ~MergingHandler();
6566

6667
/// @param merger downstream merge acceptor
67-
/// @param tableName target table for incoming data
68-
MergingHandler(std::shared_ptr<rproc::InfileMerger> merger, std::string const& tableName);
69-
70-
/// Process the response and read the result file if no error was reported by a worker.
71-
/// @return true if successful (no error)
72-
bool flush(proto::ResponseSummary const& resp) override;
73-
74-
/// @see ResponseHandler::flushHttp
75-
/// @see MerginHandler::_mergeHttp
76-
std::tuple<bool, bool> flushHttp(std::string const& fileUrl, uint64_t expectedRows,
77-
uint64_t& resultRows) override;
78-
79-
/// @see ResponseHandler::flushHttpError
80-
void flushHttpError(int errorCode, std::string const& errorMsg, int status) override;
68+
MergingHandler(std::shared_ptr<rproc::InfileMerger> const& merger,
69+
std::shared_ptr<qdisp::Executive> const& exec);
8170

8271
/// @see ResponseHandler::flushHttp
8372
/// @see MerginHandler::_mergeHttp
@@ -93,18 +82,12 @@ class MergingHandler : public qdisp::ResponseHandler {
9382
/// Print a string representation of the receiver to an ostream
9483
std::ostream& print(std::ostream& os) const override;
9584

96-
/// @return an error code and description
97-
Error getError() const override {
98-
std::lock_guard<std::mutex> lock(_errorMutex);
99-
return _error;
100-
}
101-
10285
private:
10386
/// Call InfileMerger to do the work of merging this data to the result.
10487
bool _mergeHttp(std::shared_ptr<qdisp::UberJob> const& uberJob, proto::ResponseData const& responseData);
10588

10689
/// Set error code and string.
107-
void _setError(int code, std::string const& msg);
90+
void _setError(int code, std::string const& msg, int errorState);
10891

10992
/// Check if the query is no longer active.
11093
/// This is used to prevent the query from being processed after it has been cancelled
@@ -114,12 +97,11 @@ class MergingHandler : public qdisp::ResponseHandler {
11497
bool _queryIsNoLongerActive(std::shared_ptr<qdisp::JobQuery> const& jobQuery) const;
11598

11699
std::shared_ptr<rproc::InfileMerger> _infileMerger; ///< Merging delegate
117-
std::string _tableName; ///< Target table name
118-
Error _error; ///< Error description
119100
std::atomic<bool> _errorSet{false}; ///< Set to true when an error is set.
120-
mutable std::mutex _errorMutex; ///< Protect readers from partial updates
121101
bool _flushed{false}; ///< flushed to InfileMerger?
122102
std::string _wName{"~"}; ///< worker name
103+
104+
std::weak_ptr<qdisp::Executive> _executive; ///< Weak pointer to the executive for errors.
123105
};
124106

125107
} // namespace lsst::qserv::ccontrol

src/ccontrol/UserQuerySelect.cc

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@
8686
#include "global/LogContext.h"
8787
#include "proto/worker.pb.h"
8888
#include "qdisp/Executive.h"
89-
#include "qdisp/MessageStore.h"
89+
#include "qdisp/JobQuery.h"
90+
#include "qmeta/MessageStore.h"
9091
#include "qmeta/QMeta.h"
9192
#include "qmeta/Exceptions.h"
9293
#include "qmeta/QMeta.h"
@@ -284,7 +285,6 @@ void UserQuerySelect::submit() {
284285
cs = _qSession->buildChunkQuerySpec(queryTemplates, chunkSpec, fillInChunkIdTag);
285286
chunks.push_back(cs->chunkId);
286287
}
287-
std::string chunkResultName = _ttn->make(cs->chunkId);
288288

289289
// This should only need to be set once as all jobs should have the same database name.
290290
if (cs->db != dbName) {
@@ -299,9 +299,8 @@ void UserQuerySelect::submit() {
299299

300300
ResourceUnit ru;
301301
ru.setAsDbChunk(cs->db, cs->chunkId);
302-
qdisp::JobDescription::Ptr jobDesc = qdisp::JobDescription::create(
303-
_qMetaCzarId, exec->getId(), sequence, ru,
304-
std::make_shared<MergingHandler>(_infileMerger, chunkResultName), cs, chunkResultName);
302+
qdisp::JobDescription::Ptr jobDesc =
303+
qdisp::JobDescription::create(_qMetaCzarId, exec->getId(), sequence, ru, cs);
305304
auto job = exec->add(jobDesc);
306305
++sequence;
307306
}
@@ -339,6 +338,14 @@ void UserQuerySelect::buildAndSendUberJobs() {
339338
return;
340339
}
341340

341+
if (exec->getCancelled() || exec->getSuperfluous()) {
342+
LOGS(_log, LOG_LVL_INFO, funcN << " executive cancelled.");
343+
}
344+
345+
if (exec->getSuperfluous()) {
346+
LOGS(_log, LOG_LVL_INFO, funcN << " executive superfluous, result already found.");
347+
}
348+
342349
// Only one thread should be generating UberJobs for this user query at any given time.
343350
lock_guard fcLock(_buildUberJobMtx);
344351
LOGS(_log, LOG_LVL_DEBUG, "UserQuerySelect::" << __func__ << " totalJobs=" << exec->getTotalJobs());
@@ -394,13 +401,15 @@ void UserQuerySelect::buildAndSendUberJobs() {
394401
map<string, WInfoAndUJPtr::Ptr> workerJobMap;
395402
vector<qdisp::Executive::ChunkIdType> missingChunks;
396403

404+
int attemptCountIncreased = 0;
397405
// unassignedChunksInQuery needs to be in numerical order so that UberJobs contain chunk numbers in
398406
// numerical order. The workers run shared scans in numerical order of chunkId numbers.
399407
// Numerical order keeps the number of partially complete UberJobs running on a worker to a minimum,
400408
// and should minimize the time for the first UberJob on the worker to complete.
401409
for (auto const& [chunkId, jqPtr] : unassignedChunksInQuery) {
402410
bool const increaseAttemptCount = true;
403411
jqPtr->getDescription()->incrAttemptCount(exec, increaseAttemptCount);
412+
attemptCountIncreased++;
404413

405414
// If too many workers are down, there will be a chunk that cannot be found.
406415
// Just continuing should leave jobs `unassigned` with their attempt count
@@ -462,7 +471,8 @@ void UserQuerySelect::buildAndSendUberJobs() {
462471
if (wInfUJ->uberJobPtr == nullptr) {
463472
auto ujId = _uberJobIdSeq++; // keep ujId consistent
464473
string uberResultName = _ttn->make(ujId);
465-
auto respHandler = make_shared<ccontrol::MergingHandler>(_infileMerger, uberResultName);
474+
auto respHandler =
475+
ccontrol::MergingHandler::Ptr(new ccontrol::MergingHandler(_infileMerger, exec));
466476
auto uJob = qdisp::UberJob::create(exec, respHandler, exec->getId(), ujId, _qMetaCzarId,
467477
targetWorker);
468478
uJob->setWorkerContactInfo(wInfUJ->wInf);
@@ -489,6 +499,11 @@ void UserQuerySelect::buildAndSendUberJobs() {
489499
LOGS(_log, LOG_LVL_ERROR, errStr);
490500
}
491501

502+
if (attemptCountIncreased > 0) {
503+
LOGS(_log, LOG_LVL_WARN,
504+
funcN << " increased attempt count for " << attemptCountIncreased << " Jobs");
505+
}
506+
492507
// Queue unqued UberJobs, these have less than the max number of jobs.
493508
for (auto const& [wIdKey, winfUjPtr] : workerJobMap) {
494509
if (winfUjPtr != nullptr) {

src/ccontrol/UserQuerySelect.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,8 @@ class UserQuerySelect : public UserQuery {
9595
UserQuerySelect(UserQuerySelect const&) = delete;
9696
UserQuerySelect& operator=(UserQuerySelect const&) = delete;
9797

98+
~UserQuerySelect() override = default;
99+
98100
/**
99101
* @param resultLocation: Result location, if empty use result table with unique
100102
* name generated from query ID.

0 commit comments

Comments
 (0)