Skip to content

Commit 32084a7

Browse files
committed
Added family map option to not use chunk size for distribution.
1 parent 8307937 commit 32084a7

File tree

89 files changed

+939
-1405
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

89 files changed

+939
-1405
lines changed

admin/tools/docker/base/Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ RUN dnf install -y 'dnf-command(config-manager)' \
5252
protobuf-devel \
5353
python3.12 \
5454
python3.12-devel \
55+
jemalloc \
5556
tree \
5657
vim \
5758
zip \

src/cconfig/CMakeLists.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,13 @@ target_sources(cconfig PRIVATE
44
CzarConfig.cc
55
)
66

7-
target_include_directories(cconfig PRIVATE
8-
${XROOTD_INCLUDE_DIRS}
9-
)
107

118
target_link_libraries(cconfig PUBLIC
129
log
13-
XrdSsiLib
10+
)
11+
12+
install(
13+
TARGETS cconfig
1414
)
1515

1616
install(

src/cconfig/CzarConfig.cc

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
#include <stdexcept>
2929

3030
// Third party headers
31-
#include "XrdSsi/XrdSsiLogger.hh"
3231

3332
// LSST headers
3433
#include "lsst/log/Log.h"
@@ -42,20 +41,6 @@ namespace {
4241

4342
LOG_LOGGER _log = LOG_GET("lsst.qserv.cconfig.CzarConfig");
4443

45-
void QservLogger(struct timeval const& mtime, unsigned long tID, const char* msg, int mlen) {
46-
static log4cxx::spi::LocationInfo xrdLoc(
47-
"client", log4cxx::spi::LocationInfo::calcShortFileName("client"), "<xrdssi>", 0);
48-
static LOG_LOGGER myLog = LOG_GET("lsst.qserv.xrdssi.msgs");
49-
50-
if (myLog.isInfoEnabled()) {
51-
while (mlen && msg[mlen - 1] == '\n') --mlen; // strip all trailing newlines
52-
std::string theMsg(msg, mlen);
53-
lsst::log::Log::MDC("LWP", std::to_string(tID));
54-
myLog.logMsg(log4cxx::Level::getInfo(), xrdLoc, theMsg);
55-
}
56-
}
57-
58-
bool dummy = XrdSsiLogger::SetMCB(QservLogger, XrdSsiLogger::mcbClient);
5944
} // namespace
6045

6146
namespace lsst::qserv::cconfig {

src/ccontrol/CMakeLists.txt

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ add_dependencies(ccontrol proto)
33

44
target_include_directories(ccontrol PRIVATE
55
${ANTLR4_INCLUDE_DIR}
6-
${XROOTD_INCLUDE_DIRS}
76
)
87

98
target_sources(ccontrol PRIVATE
@@ -29,11 +28,15 @@ target_sources(ccontrol PRIVATE
2928
target_link_libraries(ccontrol PUBLIC
3029
boost_regex
3130
cconfig
31+
css
3232
log
3333
parser
34-
replica
34+
proto
3535
sphgeom
36-
XrdCl
36+
)
37+
38+
install(
39+
TARGETS ccontrol
3740
)
3841

3942
install(
@@ -67,3 +70,6 @@ ccontrol_tests(
6770
testCControl
6871
testUserQueryType
6972
)
73+
74+
# set_tests_properties(testCControl PROPERTIES WILL_FAIL 1)
75+

src/ccontrol/MergingHandler.cc

Lines changed: 14 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@
3333

3434
// Third-party headers
3535
#include "curl/curl.h"
36-
#include "XrdCl/XrdClFile.hh"
3736

3837
// LSST headers
3938
#include "lsst/log/Log.h"
@@ -126,7 +125,7 @@ std::tuple<bool, bool> readHttpFileAndMergeHttp(
126125
uint32_t msgSizeBytes = 0;
127126
bool success = true;
128127
bool mergeHappened = false;
129-
int headerCount = 0;
128+
uint64_t headerCount = 0;
130129
uint64_t totalBytesRead = 0;
131130
try {
132131
auto exec = uberJob->getExecutive();
@@ -210,11 +209,9 @@ std::tuple<bool, bool> readHttpFileAndMergeHttp(
210209
}
211210

212211
// Parse and evaluate the message.
213-
//&&&mergeHappened = messageIsReady(msgBuf.get(), msgSizeBytes, last);
214212
mergeHappened = true;
215213
bool messageReadyResult = messageIsReady(msgBuf.get(), msgSizeBytes, last);
216214
totalBytesRead += msgSizeBytes;
217-
//&&&if (!mergeHappened) {
218215
if (!messageReadyResult) {
219216
success = false;
220217
throw runtime_error("message processing failed at offset " +
@@ -276,41 +273,21 @@ shared_ptr<http::ClientConnPool> const& MergingHandler::_getHttpConnPool() {
276273
return _httpConnPool;
277274
}
278275

279-
MergingHandler::MergingHandler(std::shared_ptr<rproc::InfileMerger> merger, std::string const& tableName)
280-
: _infileMerger{merger}, _tableName{tableName} {}
276+
MergingHandler::MergingHandler(std::shared_ptr<rproc::InfileMerger> const& merger,
277+
std::shared_ptr<qdisp::Executive> const& exec)
278+
: _infileMerger(merger), _executive(exec) {}
281279

282-
MergingHandler::~MergingHandler() { LOGS(_log, LOG_LVL_TRACE, __func__ << " " << _tableName); }
280+
MergingHandler::~MergingHandler() { LOGS(_log, LOG_LVL_TRACE, __func__); }
283281

284282
void MergingHandler::errorFlush(std::string const& msg, int code) {
285-
_setError(code, msg);
283+
_setError(code, msg, util::ErrorCode::RESULT_IMPORT);
286284
// Might want more info from result service.
287285
// Do something about the error. FIXME.
288286
LOGS(_log, LOG_LVL_ERROR, "Error receiving result.");
289287
}
290288

291-
// Note that generally we always have an _infileMerger object except during
292-
// a unit test. I suppose we could try to figure out how to create one.
293-
//
294-
void MergingHandler::prepScrubResults(int jobId, int attemptCount) {
295-
if (_infileMerger) _infileMerger->prepScrub(jobId, attemptCount);
296-
}
297-
298289
std::ostream& MergingHandler::print(std::ostream& os) const {
299-
return os << "MergingRequester(" << _tableName << ", flushed=" << (_flushed ? "true)" : "false)");
300-
}
301-
302-
bool MergingHandler::_mergeHttp(shared_ptr<qdisp::UberJob> const& uberJob,
303-
proto::ResponseData const& responseData) {
304-
if (_flushed) {
305-
throw util::Bug(ERR_LOC, "already flushed");
306-
}
307-
bool const success = _infileMerger->mergeHttp(uberJob, responseData);
308-
if (!success) {
309-
LOGS(_log, LOG_LVL_WARN, __func__ << " failed");
310-
util::Error const& err = _infileMerger->getError();
311-
_setError(ccontrol::MSG_RESULT_ERROR, err.getMsg());
312-
}
313-
return success;
290+
return os << "MergingRequester(flushed=" << (_flushed ? "true)" : "false)");
314291
}
315292

316293
bool MergingHandler::_mergeHttp(shared_ptr<qdisp::UberJob> const& uberJob,
@@ -322,15 +299,16 @@ bool MergingHandler::_mergeHttp(shared_ptr<qdisp::UberJob> const& uberJob,
322299
if (!success) {
323300
LOGS(_log, LOG_LVL_WARN, __func__ << " failed");
324301
util::Error const& err = _infileMerger->getError();
325-
_setError(ccontrol::MSG_RESULT_ERROR, err.getMsg());
302+
_setError(ccontrol::MSG_RESULT_ERROR, err.getMsg(), util::ErrorCode::RESULT_IMPORT);
326303
}
327304
return success;
328305
}
329306

330-
void MergingHandler::_setError(int code, std::string const& msg) {
307+
void MergingHandler::_setError(int code, std::string const& msg, int errorState) {
331308
LOGS(_log, LOG_LVL_DEBUG, "_setError: code: " << code << ", message: " << msg);
332-
std::lock_guard<std::mutex> lock(_errorMutex);
333-
_error = Error(code, msg);
309+
auto exec = _executive.lock();
310+
if (exec == nullptr) return;
311+
exec->addMultiError(code, msg, errorState);
334312
}
335313

336314
tuple<bool, bool> MergingHandler::flushHttp(string const& fileUrl, uint64_t expectedRows,
@@ -380,10 +358,9 @@ tuple<bool, bool> MergingHandler::flushHttp(string const& fileUrl, uint64_t expe
380358
return {success, shouldCancel};
381359
}
382360

383-
void MergingHandler::flushHttpError(int errorCode, std::string const& errorMsg, int status) {
361+
void MergingHandler::flushHttpError(int errorCode, std::string const& errorMsg, int errState) {
384362
if (!_errorSet.exchange(true)) {
385-
_error = util::Error(errorCode, errorMsg, util::ErrorCode::MYSQLEXEC);
386-
_setError(ccontrol::MSG_RESULT_ERROR, _error.getMsg());
363+
_setError(errorCode, errorMsg, errState);
387364
}
388365
}
389366

src/ccontrol/MergingHandler.h

Lines changed: 6 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ class ResponseSummary;
4343
} // namespace lsst::qserv::proto
4444

4545
namespace lsst::qserv::qdisp {
46+
class Executive;
4647
class JobQuery;
4748
class UberJob;
4849
} // namespace lsst::qserv::qdisp
@@ -68,16 +69,8 @@ class MergingHandler : public qdisp::ResponseHandler {
6869
virtual ~MergingHandler();
6970

7071
/// @param merger downstream merge acceptor
71-
/// @param tableName target table for incoming data
72-
MergingHandler(std::shared_ptr<rproc::InfileMerger> merger, std::string const& tableName);
73-
74-
/// @see ResponseHandler::flushHttp
75-
/// @see MerginHandler::_mergeHttp
76-
std::tuple<bool, bool> flushHttp(std::string const& fileUrl, uint64_t expectedRows,
77-
uint64_t& resultRows) override;
78-
79-
/// @see ResponseHandler::flushHttpError
80-
void flushHttpError(int errorCode, std::string const& errorMsg, int status) override;
72+
MergingHandler(std::shared_ptr<rproc::InfileMerger> const& merger,
73+
std::shared_ptr<qdisp::Executive> const& exec);
8174

8275
/// @see ResponseHandler::flushHttp
8376
/// @see MerginHandler::_mergeHttp
@@ -93,18 +86,12 @@ class MergingHandler : public qdisp::ResponseHandler {
9386
/// Print a string representation of the receiver to an ostream
9487
std::ostream& print(std::ostream& os) const override;
9588

96-
/// @return an error code and description
97-
Error getError() const override {
98-
std::lock_guard<std::mutex> lock(_errorMutex);
99-
return _error;
100-
}
101-
10289
private:
10390
/// Call InfileMerger to do the work of merging this data to the result.
10491
bool _mergeHttp(std::shared_ptr<qdisp::UberJob> const& uberJob, proto::ResponseData const& responseData);
10592

10693
/// Set error code and string.
107-
void _setError(int code, std::string const& msg);
94+
void _setError(int code, std::string const& msg, int errorState);
10895

10996
// All instances of the HTTP client class are members of the same pool. This allows
11097
// connection reuse and a significant reduction of the kernel memory pressure.
@@ -115,12 +102,11 @@ class MergingHandler : public qdisp::ResponseHandler {
115102
static std::mutex _httpConnPoolMutex;
116103

117104
std::shared_ptr<rproc::InfileMerger> _infileMerger; ///< Merging delegate
118-
std::string _tableName; ///< Target table name
119-
Error _error; ///< Error description
120105
std::atomic<bool> _errorSet{false}; ///< Set to true when an error is set.
121-
mutable std::mutex _errorMutex; ///< Protect readers from partial updates
122106
bool _flushed{false}; ///< flushed to InfileMerger?
123107
std::string _wName{"~"}; ///< worker name
108+
109+
std::weak_ptr<qdisp::Executive> _executive; ///< Weak pointer to the executive for errors.
124110
};
125111

126112
} // namespace lsst::qserv::ccontrol

src/ccontrol/UserQuerySelect.cc

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@
8686
#include "global/LogContext.h"
8787
#include "proto/worker.pb.h"
8888
#include "qdisp/Executive.h"
89-
#include "qdisp/MessageStore.h"
89+
#include "qdisp/JobQuery.h"
90+
#include "qmeta/MessageStore.h"
9091
#include "qmeta/QMeta.h"
9192
#include "qmeta/Exceptions.h"
9293
#include "qproc/geomAdapter.h"
@@ -285,7 +286,6 @@ void UserQuerySelect::submit() {
285286
cs = _qSession->buildChunkQuerySpec(queryTemplates, chunkSpec, fillInChunkIdTag);
286287
chunks.push_back(cs->chunkId);
287288
}
288-
std::string chunkResultName = _ttn->make(cs->chunkId);
289289

290290
// This should only need to be set once as all jobs should have the same database name.
291291
if (cs->db != dbName) {
@@ -300,9 +300,8 @@ void UserQuerySelect::submit() {
300300

301301
ResourceUnit ru;
302302
ru.setAsDbChunk(cs->db, cs->chunkId);
303-
qdisp::JobDescription::Ptr jobDesc = qdisp::JobDescription::create(
304-
_qMetaCzarId, exec->getId(), sequence, ru,
305-
std::make_shared<MergingHandler>(_infileMerger, chunkResultName), cs, chunkResultName);
303+
qdisp::JobDescription::Ptr jobDesc =
304+
qdisp::JobDescription::create(_qMetaCzarId, exec->getId(), sequence, ru, cs);
306305
auto job = exec->add(jobDesc);
307306
++sequence;
308307
}
@@ -340,6 +339,14 @@ void UserQuerySelect::buildAndSendUberJobs() {
340339
return;
341340
}
342341

342+
if (exec->getCancelled() || exec->getSuperfluous()) {
343+
LOGS(_log, LOG_LVL_INFO, funcN << " executive cancelled.");
344+
}
345+
346+
if (exec->getSuperfluous()) {
347+
LOGS(_log, LOG_LVL_INFO, funcN << " executive superfluous, result already found.");
348+
}
349+
343350
// Only one thread should be generating UberJobs for this user query at any given time.
344351
lock_guard fcLock(_buildUberJobMtx);
345352
LOGS(_log, LOG_LVL_DEBUG, "UserQuerySelect::" << __func__ << " totalJobs=" << exec->getTotalJobs());
@@ -395,13 +402,15 @@ void UserQuerySelect::buildAndSendUberJobs() {
395402
map<string, WInfoAndUJPtr::Ptr> workerJobMap;
396403
vector<qdisp::Executive::ChunkIdType> missingChunks;
397404

405+
int attemptCountIncreased = 0;
398406
// unassignedChunksInQuery needs to be in numerical order so that UberJobs contain chunk numbers in
399407
// numerical order. The workers run shared scans in numerical order of chunkId numbers.
400408
// Numerical order keeps the number of partially complete UberJobs running on a worker to a minimum,
401409
// and should minimize the time for the first UberJob on the worker to complete.
402410
for (auto const& [chunkId, jqPtr] : unassignedChunksInQuery) {
403411
bool const increaseAttemptCount = true;
404412
jqPtr->getDescription()->incrAttemptCount(exec, increaseAttemptCount);
413+
attemptCountIncreased++;
405414

406415
// If too many workers are down, there will be a chunk that cannot be found.
407416
// Just continuing should leave jobs `unassigned` with their attempt count
@@ -463,7 +472,8 @@ void UserQuerySelect::buildAndSendUberJobs() {
463472
if (wInfUJ->uberJobPtr == nullptr) {
464473
auto ujId = _uberJobIdSeq++; // keep ujId consistent
465474
string uberResultName = _ttn->make(ujId);
466-
auto respHandler = make_shared<ccontrol::MergingHandler>(_infileMerger, uberResultName);
475+
auto respHandler =
476+
ccontrol::MergingHandler::Ptr(new ccontrol::MergingHandler(_infileMerger, exec));
467477
auto uJob = qdisp::UberJob::create(exec, respHandler, exec->getId(), ujId, _qMetaCzarId,
468478
targetWorker);
469479
uJob->setWorkerContactInfo(wInfUJ->wInf);
@@ -490,6 +500,11 @@ void UserQuerySelect::buildAndSendUberJobs() {
490500
LOGS(_log, LOG_LVL_ERROR, errStr);
491501
}
492502

503+
if (attemptCountIncreased > 0) {
504+
LOGS(_log, LOG_LVL_WARN,
505+
funcN << " increased attempt count for " << attemptCountIncreased << " Jobs");
506+
}
507+
493508
// Queue unqued UberJobs, these have less than the max number of jobs.
494509
for (auto const& [wIdKey, winfUjPtr] : workerJobMap) {
495510
if (winfUjPtr != nullptr) {

src/ccontrol/UserQuerySelect.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,8 @@ class UserQuerySelect : public UserQuery {
100100
UserQuerySelect(UserQuerySelect const&) = delete;
101101
UserQuerySelect& operator=(UserQuerySelect const&) = delete;
102102

103+
~UserQuerySelect() override = default;
104+
103105
/**
104106
* @param resultLocation: Result location, if empty use result table with unique
105107
* name generated from query ID.

0 commit comments

Comments
 (0)