Skip to content

Commit c0a2a2c

Browse files
committed
Integration tests working.
1 parent c2263f7 commit c0a2a2c

11 files changed

+90
-28
lines changed

core/modules/ccontrol/MergingHandler.cc

+17-1
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,10 @@ bool MergingHandler::flush(int bLen, BufPtr const& bufPtr, bool& last, bool& lar
9797
throw Bug("MergingHandler invalid blen=" + to_string(bLen) + " from " + _wName);
9898
}
9999

100+
LOGS(_log, LOG_LVL_INFO, "&&& MH::flush a");
100101
switch(_state) {
101102
case MsgState::HEADER_WAIT:
103+
LOGS(_log, LOG_LVL_INFO, "&&& MH::flush b");
102104
_response->headerSize = static_cast<unsigned char>((*bufPtr)[0]);
103105
if (!proto::ProtoHeaderWrap::unwrap(_response, *bufPtr)) {
104106
std::string sErr = "From:" + _wName + "Error decoding proto header for " + getStateStr(_state);
@@ -119,23 +121,29 @@ bool MergingHandler::flush(int bLen, BufPtr const& bufPtr, bool& last, bool& lar
119121
<< " endNoData=" << endNoData);
120122

121123
_state = MsgState::RESULT_WAIT;
124+
LOGS(_log, LOG_LVL_INFO, "&&& MH::flush c");
122125
if (endNoData || nextBufSize == 0) {
126+
LOGS(_log, LOG_LVL_INFO, "&&& MH::flush d");
123127
if (!endNoData || nextBufSize != 0 ) {
124128
throw Bug("inconsistent msg termination endNoData=" + std::to_string(endNoData)
125129
+ " nextBufSize=" + std::to_string(nextBufSize));
126130
}
131+
LOGS(_log, LOG_LVL_INFO, "&&& MH::flush e");
127132
// Nothing to merge, but some bookkeeping needs to be done.
128133
_infileMerger->mergeCompleteFor(_jobIds);
134+
LOGS(_log, LOG_LVL_INFO, "&&& MH::flush f");
129135
last = true;
130136
_state = MsgState::RESULT_RECV;
131137
}
132138
}
133139
return true;
134140
case MsgState::RESULT_WAIT:
135141
{
142+
LOGS(_log, LOG_LVL_INFO, "&&& MH::flush g");
136143
nextBufSize = proto::ProtoHeaderWrap::getProtoHeaderSize();
137144
auto job = getJobBase().lock();
138145
if (!_verifyResult(bufPtr, bLen)) { return false; }
146+
LOGS(_log, LOG_LVL_INFO, "&&& MH::flush h");
139147
if (!_setResult(bufPtr, bLen)) { // This sets _response->result
140148
LOGS(_log, LOG_LVL_WARN, "setResult failure " << _wName);
141149
return false;
@@ -147,7 +155,9 @@ bool MergingHandler::flush(int bLen, BufPtr const& bufPtr, bool& last, bool& lar
147155
_jobIds.insert(jobId);
148156
LOGS(_log, LOG_LVL_DEBUG, "Flushed last=" << last << " for tableName=" << _tableName);
149157

158+
LOGS(_log, LOG_LVL_INFO, "&&& MH::flush i");
150159
auto success = _merge();
160+
LOGS(_log, LOG_LVL_INFO, "&&& MH::flush j");
151161
_response.reset(new WorkerResponse());
152162
return success;
153163
}
@@ -218,18 +228,24 @@ void MergingHandler::_initState() {
218228
}
219229

220230
bool MergingHandler::_merge() {
221-
if (auto job = getJobBase().lock()) {
231+
LOGS(_log, LOG_LVL_INFO, "&&& MH::_merge a");
232+
auto job = getJobBase().lock();
233+
if (job != nullptr) {
234+
LOGS(_log, LOG_LVL_INFO, "&&& MH::_merge b");
222235
if (_flushed) {
223236
throw Bug("MergingRequester::_merge : already flushed");
224237
}
238+
LOGS(_log, LOG_LVL_INFO, "&&& MH::_merge c");
225239
bool success = _infileMerger->merge(_response);
240+
LOGS(_log, LOG_LVL_INFO, "&&& MH::_merge d");
226241
if (!success) {
227242
LOGS(_log, LOG_LVL_WARN, "_merge() failed");
228243
rproc::InfileMergerError const& err = _infileMerger->getError();
229244
_setError(ccontrol::MSG_RESULT_ERROR, err.getMsg());
230245
_state = MsgState::RESULT_ERR;
231246
}
232247
_response.reset();
248+
LOGS(_log, LOG_LVL_INFO, "&&& MH::_merge end");
233249
return success;
234250
}
235251
LOGS(_log, LOG_LVL_ERROR, "MergingHandler::_merge() failed, jobQuery was NULL");

core/modules/ccontrol/UserQuerySelect.cc

+10-3
Original file line numberDiff line numberDiff line change
@@ -414,12 +414,12 @@ void UserQuerySelect::submit() {
414414
std::shared_ptr<ChunkMsgReceiver> cmr = ChunkMsgReceiver::newInstance(uberJobId, _messageStore);
415415
auto respHandler = std::make_shared<MergingHandler>(cmr, _infileMerger, uberResultName);
416416

417+
string workerResourceName = workerIter->first;
418+
deque<int>& dq = workerIter->second;
417419
auto uJob = qdisp::UberJob::create(_executive, respHandler, _qMetaQueryId,
418-
uberJobId++, _qMetaCzarId);
420+
uberJobId++, _qMetaCzarId, workerResourceName);
419421

420422
int chunksInUber = 0;
421-
deque<int>& dq = workerIter->second;
422-
423423
while (!dq.empty() && !chunksInQuery.empty() && chunksInUber < maxChunksPerUber) {
424424
int chunkIdWorker = dq.front();
425425
dq.pop_front();
@@ -459,15 +459,22 @@ void UserQuerySelect::submit() {
459459
// If any chunks in the query were not found on a worker's list, run them individually.
460460
//&&&_executive->startRemainingJobs(chunksInQuery); //&&& delete func in Executive.
461461
for (auto& ciq:chunksInQuery) {
462+
LOGS(_log, LOG_LVL_INFO, "&&& submit q1");
462463
qdisp::JobQuery* jqRaw = ciq.second;
464+
LOGS(_log, LOG_LVL_INFO, "&&& submit q2");
463465
qdisp::JobQuery::Ptr job = _executive->getSharedPtrForRawJobPtr(jqRaw);
466+
LOGS(_log, LOG_LVL_INFO, "&&& submit q3");
464467
std::function<void(util::CmdData*)> funcBuildJob =
465468
[this, job{move(job)}](util::CmdData*) { // references in captures cause races
466469
QSERV_LOGCONTEXT_QUERY(_qMetaQueryId);
470+
LOGS(_log, LOG_LVL_INFO, "&&& submit q run1");
467471
job->runJob();
472+
LOGS(_log, LOG_LVL_INFO, "&&& submit q run2");
468473
};
469474
auto cmd = std::make_shared<qdisp::PriorityCommand>(funcBuildJob);
475+
LOGS(_log, LOG_LVL_INFO, "&&& submit q4");
470476
_executive->queueJobStart(cmd);
477+
LOGS(_log, LOG_LVL_INFO, "&&& submit q5");
471478
}
472479

473480
LOGS(_log, LOG_LVL_INFO, "&&& submit r");

core/modules/czar/WorkerResources.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ map<string, deque<int>> WorkerResources::getDequesFor(string const& dbName) {
108108

109109

110110
void WorkerResources::setMonoNodeTest() {
111-
string wName("/worker/worker");
111+
string wName("/worker/5257fbab-c49c-11eb-ba7a-1856802308a2");
112112
std::lock_guard<std::mutex> lg(_workerMapMtx);
113113
_insertWorker(wName);
114114
auto iter = _workers.find(wName);

core/modules/qdisp/Executive.cc

+2-1
Original file line numberDiff line numberDiff line change
@@ -614,7 +614,8 @@ bool Executive::startUberJob(UberJob::Ptr const& uJob) {
614614
//&&&XrdSsiResource jobResource(jobQuery->getDescription()->resource().path(), "", jobQuery->getIdStr(), "", 0, affinity);
615615
// Affinity should be meaningless here as there should only be one instance of each worker.
616616
XrdSsiResource::Affinity affinity = XrdSsiResource::Affinity::Default;
617-
XrdSsiResource uJobResource(uJob->workerResource, "", uJob->getIdStr(), "", 0, affinity);
617+
LOGS(_log, LOG_LVL_INFO, "&&& uJob->workerResource=" << uJob->getWorkerResource());
618+
XrdSsiResource uJobResource(uJob->getWorkerResource(), "", uJob->getIdStr(), "", 0, affinity);
618619

619620
// Now construct the actual query request and tie it to the jobQuery. The
620621
// shared pointer is used by QueryRequest to keep itself alive, sloppy design.

core/modules/qdisp/QueryRequest.cc

+21-10
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,7 @@ QueryRequest::~QueryRequest() {
222222
// content of request data
223223
char* QueryRequest::GetRequest(int& requestLength) {
224224
QSERV_LOGCONTEXT_QUERY_JOB(_qid, _jobid);
225+
LOGS(_log, LOG_LVL_INFO, "&&& QueryRequest::GetRequest");
225226
lock_guard<mutex> lock(_finishStatusMutex);
226227
auto jq = _job;
227228
if (_finishStatus != ACTIVE || jq == nullptr) {
@@ -448,7 +449,7 @@ void QueryRequest::_processData(JobBase::Ptr const& jq, int blen, bool xrdLast)
448449
ResponseHandler::BufPtr bufPtr = _askForResponseDataCmd->getBufPtr();
449450
_askForResponseDataCmd.reset(); // No longer need it, and don't want the destructor calling _errorFinish().
450451

451-
452+
LOGS(_log, LOG_LVL_INFO, "&&&QueryRequest::_processData a");
452453
int const protoHeaderSize = proto::ProtoHeaderWrap::getProtoHeaderSize();
453454
ResponseHandler::BufPtr nextHeaderBufPtr;
454455

@@ -463,19 +464,20 @@ void QueryRequest::_processData(JobBase::Ptr const& jq, int blen, bool xrdLast)
463464
// - The first (bytes = blen - ProtoHeaderWrap::getProtheaderSize())
464465
// is the result associated with the previously received header.
465466
// - The second is the header for the next message.
466-
467467
int respSize = blen - protoHeaderSize;
468468
nextHeaderBufPtr = make_shared<vector<char>>(bufPtr->begin() + respSize, bufPtr->end());
469-
469+
LOGS(_log, LOG_LVL_INFO, "&&&QueryRequest::_processData b");
470470
// Read the result
471471
/* &&& rebase
472472
flushOk = jq->getDescription()->respHandler()->flush(respSize, bufPtr, last,
473473
largeResult, nextBufSize);
474474
*/
475-
bool largeResult = false;
476-
int nextBufSize = 0;
477-
bool last = false;
478-
bool flushOk = jq->getRespHandler()->flush(respSize, bufPtr, last, largeResult, nextBufSize);
475+
//&&& bool largeResult = false;
476+
//&&& int nextBufSize = 0;
477+
//&&& bool last = false;
478+
// Values for last, largeResult, and nextBufSize filled in by flush
479+
flushOk = jq->getRespHandler()->flush(respSize, bufPtr, last, largeResult, nextBufSize);
480+
LOGS(_log, LOG_LVL_INFO, "&&&QueryRequest::_processData c");
479481
if (last) {
480482
// Last should only be true when the header is read, not the result.
481483
throw Bug("_processData result had 'last' true, which cannot be allowed.");
@@ -486,7 +488,7 @@ void QueryRequest::_processData(JobBase::Ptr const& jq, int blen, bool xrdLast)
486488
throw Bug("Unexpected header size from flush(result) call QID="
487489
+ to_string(_qid) + "#" + to_string(_jobid));
488490
}
489-
491+
LOGS(_log, LOG_LVL_INFO, "&&&QueryRequest::_processData d");
490492
if (!flushOk) {
491493
_flushError(jq);
492494
return;
@@ -497,11 +499,14 @@ void QueryRequest::_processData(JobBase::Ptr const& jq, int blen, bool xrdLast)
497499
// Values for largeResult, last, and nextBufSize will be filled in by flush().
498500
flushOk = jq->getDescription()->respHandler()->flush(protoHeaderSize, nextHeaderBufPtr, last,
499501
largeResult, nextBufSize);
500-
*/
502+
501503
largeResult = false;
502504
nextBufSize = 0;
505+
*/
506+
// Values for last, largeResult, and nextBufSize filled in by flush
507+
LOGS(_log, LOG_LVL_INFO, "&&&QueryRequest::_processData e");
503508
flushOk = jq->getRespHandler()->flush(protoHeaderSize, nextHeaderBufPtr, last, largeResult, nextBufSize);
504-
509+
LOGS(_log, LOG_LVL_INFO, "&&&QueryRequest::_processData f");
505510
if (largeResult) {
506511
if (!_largeResult) LOGS(_log, LOG_LVL_DEBUG, "holdState largeResult set to true");
507512
_largeResult = true; // Once the worker indicates it's a large result, it stays that way.
@@ -512,22 +517,28 @@ void QueryRequest::_processData(JobBase::Ptr const& jq, int blen, bool xrdLast)
512517
LOGS(_log, LOG_LVL_DEBUG, "processData disagreement between last=" << last
513518
<< " and xrdLast=" << xrdLast);
514519
}
520+
LOGS(_log, LOG_LVL_INFO, "&&&QueryRequest::_processData g");
515521
if (last) {
522+
LOGS(_log, LOG_LVL_INFO, "&&&QueryRequest::_processData h");
516523
jq->getStatus()->updateInfo(_jobIdStr, JobStatus::COMPLETE);
517524
_finish();
518525
// At this point all blocks for this job have been read, there's no point in
519526
// having XrdSsi wait for anything.
520527
return;
521528
} else {
529+
LOGS(_log, LOG_LVL_INFO, "&&&QueryRequest::_processData i");
522530
_askForResponseDataCmd = make_shared<AskForResponseDataCmd>(shared_from_this(), jq, nextBufSize);
523531
LOGS(_log, LOG_LVL_DEBUG, "queuing askForResponseDataCmd bufSize=" << nextBufSize);
532+
LOGS(_log, LOG_LVL_INFO, "&&&QueryRequest::_processData j");
524533
_queueAskForResponse(_askForResponseDataCmd, jq, false);
525534
}
526535
} else {
536+
LOGS(_log, LOG_LVL_INFO, "&&&QueryRequest::_processData k");
527537
LOGS(_log, LOG_LVL_WARN, "flushOk = false");
528538
_flushError(jq);
529539
return;
530540
}
541+
LOGS(_log, LOG_LVL_INFO, "&&&QueryRequest::_processData end");
531542
return;
532543
}
533544

core/modules/qdisp/UberJob.cc

+9-5
Original file line numberDiff line numberDiff line change
@@ -49,15 +49,17 @@ namespace qdisp {
4949

5050
UberJob::Ptr UberJob::create(Executive::Ptr const& executive,
5151
std::shared_ptr<ResponseHandler> const& respHandler,
52-
int queryId, int uberJobId, qmeta::CzarId czarId) {
53-
UberJob::Ptr uJob(new UberJob(executive, respHandler, queryId, uberJobId, czarId));
52+
int queryId, int uberJobId, qmeta::CzarId czarId, string const& workerResource) {
53+
UberJob::Ptr uJob(new UberJob(executive, respHandler, queryId, uberJobId, czarId, workerResource));
54+
uJob->_setup();
5455
return uJob;
5556
}
5657

5758
UberJob::UberJob(Executive::Ptr const& executive,
5859
std::shared_ptr<ResponseHandler> const& respHandler,
59-
int queryId, int uberJobId, qmeta::CzarId czarId)
60-
: JobBase(), _executive(executive), _respHandler(respHandler), _queryId(queryId), _uberJobId(uberJobId),
60+
int queryId, int uberJobId, qmeta::CzarId czarId, string const& workerResource)
61+
: JobBase(), _workerResource(workerResource), _executive(executive),
62+
_respHandler(respHandler), _queryId(queryId), _uberJobId(uberJobId),
6163
_czarId(czarId), _idStr("QID=" + to_string(_queryId) + ":uber=" + to_string(uberJobId)) {
6264
_qdispPool = executive->getQdispPool();
6365
_jobStatus = make_shared<JobStatus>();
@@ -182,13 +184,15 @@ void UberJob::callMarkCompleteFunc(bool success) {
182184
throw Bug("&&&NEED_CODE may need code to properly handle failed uberjob");
183185
}
184186
for (auto&& job:_jobs) {
187+
string idStr = job->getIdStr();
188+
job->getStatus()->updateInfo(idStr, JobStatus::COMPLETE);
185189
job->callMarkCompleteFunc(success);
186190
}
187191
}
188192

189193

190194
std::ostream& UberJob::dumpOS(std::ostream &os) const {
191-
os << "(workerResource=" << workerResource
195+
os << "(workerResource=" << _workerResource
192196
<< " jobs sz=" << _jobs.size() << "(";
193197
for (auto const& job:_jobs) {
194198
JobDescription::Ptr desc = job->getDescription();

core/modules/qdisp/UberJob.h

+11-5
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ class UberJob : public JobBase {
4343

4444
static Ptr create(Executive::Ptr const& executive,
4545
std::shared_ptr<ResponseHandler> const& respHandler,
46-
int queryId, int uberJobId, qmeta::CzarId czarId);
46+
int queryId, int uberJobId, qmeta::CzarId czarId, std::string const& workerResource);
4747
UberJob() = delete;
4848
UberJob(UberJob const&) = delete;
4949
UberJob& operator=(UberJob const&) = delete;
@@ -73,17 +73,22 @@ class UberJob : public JobBase {
7373

7474
bool verifyPayload() const;
7575

76+
std::string getWorkerResource() { return _workerResource; }
77+
7678
/// &&& TODO:UJ may not need,
7779
void prepScrubResults();
7880

79-
std::string workerResource; // TODO:UJ make private
80-
8181
std::ostream& dumpOS(std::ostream &os) const override;
8282

8383
private:
8484
UberJob(Executive::Ptr const& executive,
8585
std::shared_ptr<ResponseHandler> const& respHandler,
86-
int queryId, int uberJobId, qmeta::CzarId czarId);
86+
int queryId, int uberJobId, qmeta::CzarId czarId, std::string const& workerResource);
87+
88+
void _setup() {
89+
JobBase::Ptr jbPtr = shared_from_this();
90+
_respHandler->setJobQuery(jbPtr);
91+
}
8792

8893
std::vector<JobQuery*> _jobs;
8994
std::atomic<bool> _started{false};
@@ -93,7 +98,8 @@ class UberJob : public JobBase {
9398
std::shared_ptr<QueryRequest> _queryRequestPtr;
9499
std::mutex _qrMtx;
95100

96-
std::string _payload; ///< XrdSsi message to be sent to the worker resource.
101+
std::string const _workerResource;
102+
std::string _payload; ///< XrdSsi message to be sent to the _workerResource.
97103

98104
std::weak_ptr<Executive> _executive;
99105
std::shared_ptr<ResponseHandler> _respHandler;

core/modules/wbase/SendChannelShared.cc

+4
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,10 @@ void SendChannelShared::setTaskCount(int taskCount) {
6363
_taskCount = taskCount;
6464
}
6565

66+
void SendChannelShared::incrTaskCountBy(int partialCount) {
67+
_taskCount += partialCount;
68+
}
69+
6670

6771
bool SendChannelShared::transmitTaskLast(StreamGuard sLock, bool inLast) {
6872
/// _caller must have locked _streamMutex before calling this.

core/modules/wbase/SendChannelShared.h

+7-1
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ namespace wbase {
4343
/// A class that provides a SendChannel object with synchronization so it can be
4444
/// shared across multiple threads. Due to what may be sent, the synchronization locking
4545
/// needs to be available outside of the class.
46+
/// Note: Tasks on a SendChannelShared cannot start processing until the total number of
47+
/// Tasks using the SendChannelShared is known. Otherwise, there is a race condition
48+
/// which could close the channel too soon.
4649
class SendChannelShared {
4750
public:
4851
using Ptr = std::shared_ptr<SendChannelShared>;
@@ -95,6 +98,9 @@ class SendChannelShared {
9598
/// This should not be changed once set.
9699
void setTaskCount(int taskCount);
97100

101+
/// All of the tasks that use this SendChannel must be added
102+
/// to the scheduler queue at the same time or it risks a race condition.
103+
void incrTaskCountBy(int subCount);
98104

99105
/// Try to transmit the data in tData.
100106
/// If the queue already has at least 2 TransmitData objects, addTransmit
@@ -151,7 +157,7 @@ class SendChannelShared {
151157
/// metadata buffer. Once set, it cannot change until after Finish() has been called.
152158
std::string _metadataBuf;
153159

154-
int _taskCount = 0; ///< The number of tasks to be sent over this SendChannel.
160+
std::atomic<int> _taskCount{0}; ///< The number of tasks to be sent over this SendChannel.
155161
int _lastCount = 0; ///< Then number of 'last' buffers received.
156162
std::atomic<bool> _lastRecvd{false}; ///< The truly 'last' transmit message is in the queue.
157163
std::atomic<bool> _firstTransmit{true}; ///< True until the first transmit has been sent.

core/modules/wbase/Task.cc

+4-1
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ std::vector<Task::Ptr> Task::createTasks(proto::TaskMsg const& taskMsg,
166166
vect.push_back(task);
167167
}
168168
} else {
169+
LOGS(_log, LOG_LVL_INFO, "&&& Task::createTasks queryStr=" << queryStr);
169170
auto task = std::make_shared<wbase::Task>(taskMsg, queryStr, fragNum, sendChannel, gArena, rmLock);
170171
//TODO: Maybe? Is it better to move fragment info from
171172
// ChunkResource getResourceFragment(int i) to here???
@@ -176,7 +177,9 @@ std::vector<Task::Ptr> Task::createTasks(proto::TaskMsg const& taskMsg,
176177

177178
}
178179
}
179-
sendChannel->setTaskCount(vect.size());
180+
LOGS(_log, LOG_LVL_INFO, "&&& Task::createTasks vect.size=" << vect.size());
181+
//&&&sendChannel->setTaskCount(vect.size());
182+
sendChannel->incrTaskCountBy(vect.size());
180183
return vect;
181184
}
182185

0 commit comments

Comments
 (0)