
Commit fcd3698

jgates108 authored and fritzm committed
Added memory/disk hybrid for transferring csv files.
1 parent 84107b2 commit fcd3698

40 files changed: +869 −377 lines changed

deploy/compose/docker-compose.yml

Lines changed: 8 additions & 0 deletions
@@ -36,6 +36,7 @@ volumes:
   volume_czar_xrootd:
   volume_czar_home:
   volume_czar_cfg:
+  volume_czar_transfer:
 
   volume_czar_mariadb_data:
   volume_czar_mariadb_cfg:
@@ -268,6 +269,10 @@ services:
       - type: volume
         source: volume_czar_mariadb_run
         target: /qserv/mariadb/run
+      - type: volume
+        source: volume_czar_transfer
+        target: /tmp
+
       - << : *log-volume
     expose:
       - "3306" # for czar-mariadb
@@ -304,6 +309,9 @@ services:
       - type: volume
         source: volume_czar_cfg
         target: /config-etc
+      - type: volume
+        source: volume_czar_transfer
+        target: /tmp
       - type: volume
         source: volume_czar_home
         target: /home/qserv
Lines changed: 18 additions & 13 deletions
@@ -1,14 +1,19 @@
--- -------------------------------------------------------------------
--- Rename table QStatsTmp into QProgress to reflect its purpose
--- and add a foreign key constraint to QInfo table.
--- This table tracks chunk processing progress of the running queries.
--- -------------------------------------------------------------------
-ALTER TABLE QStatsTmp RENAME AS QProgress;
-ALTER TABLE QProgress ADD CONSTRAINT `fk_queryId` FOREIGN KEY (`queryId`) REFERENCES `QInfo` (`queryId`) ON DELETE CASCADE ON UPDATE CASCADE;
-ALTER TABLE QProgress COMMENT = 'Table to track chunk processing progress of the running queries.';
+-- -----------------------------------------------------
+-- Table `chunkMap`
+-- -----------------------------------------------------
+CREATE TABLE IF NOT EXISTS `chunkMap` (
+    `worker` VARCHAR(256) NOT NULL COMMENT 'A unique identifier of a worker hosting the chunk replica',
+    `database` VARCHAR(256) NOT NULL COMMENT 'The name of a database',
+    `table` VARCHAR(256) NOT NULL COMMENT 'The name of a table',
+    `chunk` INT UNSIGNED NOT NULL COMMENT 'The number of a chunk',
+    `size` BIGINT UNSIGNED NOT NULL COMMENT 'The size of a chunk')
+    ENGINE = InnoDB
+    COMMENT = 'Chunk disposition across workers';
 
--- -------------------------------------------------------------------
--- Drop the QWorker table as it is no longer needed.
--- This table was used to track worker nodes and their statuses.
--- -------------------------------------------------------------------
-DROP TABLE IF EXISTS QWorker;
+-- -----------------------------------------------------
+-- Table `chunkMapStatus`
+-- -----------------------------------------------------
+CREATE TABLE IF NOT EXISTS `chunkMapStatus` (
+    `update_time` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'The most recent update time of the map')
+    ENGINE = InnoDB
+    COMMENT = 'Status info on the chunk map';

src/admin/templates/proxy/etc/qserv-czar.cnf.jinja

Lines changed: 17 additions & 0 deletions
@@ -25,12 +25,29 @@ port = {{ czar_db_port }}
 # Any table in resultdb that hasn't been updated in this many days is deleted.
 oldestResultKeptDays = 7
 
+# Either this should be changed to a high performance docker volume directory
+# or /tmp should be mounted as a high performance docker volume directory
+# to avoid using limited docker memory to store the contents.
+transferDir = /tmp
+
 # maximum number of connection retries to SQL database (per connection attempt)
 maxsqlconnectionattempts = 10
 
 # maximum user query result size in MB
 maxtablesize_mb = 5100
 
+# maximum number of MB of concurrent csv transfer files allowed to be kept in
+# memory; after this point they will be temporarily written to disk.
+# 0 is used for testing. 10000 is usually reasonable.
+maxTransferMemMB = 0
+
+# minimum number of MB for each csv transfer file to be kept in memory
+# before possibly going to disk.
+# 0 for testing, up to 10 should be reasonable.
+transferMinMBInMem = 0
+
+
 
 # database connection for QMeta database
 [qmeta]

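The three new keys above (transferDir, maxTransferMemMB, transferMinMBInMem) control where transferred csv data lives while a result is being merged. The sketch below is illustrative only and is not the qserv implementation: the helper keepInMemory() and the czar-wide byte counter are hypothetical names, used here just to show how a per-transfer minimum and a global memory cap could decide when a transfer spills to a file under transferDir.

// Minimal sketch, not the actual qserv implementation: illustrates how the
// per-transfer minimum (transferMinMBInMem) and the czar-wide cap
// (maxTransferMemMB) from the config above could decide whether the next
// block of csv data stays in memory or spills to a file under transferDir.
#include <atomic>
#include <cstddef>

namespace sketch {

// Hypothetical czar-wide counter of bytes currently buffered in memory.
std::atomic<std::size_t> totalMemBytes{0};

// Hypothetical helper: 'inMemBytes' is what this transfer already holds in memory.
bool keepInMemory(std::size_t inMemBytes, std::size_t blockSize,
                  std::size_t maxTransferMemMB, std::size_t transferMinMBInMem) {
    std::size_t const minPerTransfer = transferMinMBInMem * 1024 * 1024;
    std::size_t const czarWideCap = maxTransferMemMB * 1024 * 1024;
    // Every transfer is allowed at least its configured minimum in memory.
    if (inMemBytes + blockSize <= minPerTransfer) return true;
    // Beyond that, stay in memory only while the czar-wide cap is not exceeded.
    return totalMemBytes.load() + blockSize <= czarWideCap;
}

}  // namespace sketch
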
src/cconfig/CzarConfig.h

Lines changed: 17 additions & 8 deletions
@@ -130,8 +130,16 @@ class CzarConfig {
     /// Getters for result aggregation options.
     int getMaxTableSizeMB() const { return _maxTableSizeMB->getVal(); }
     int getMaxSqlConnectionAttempts() const { return _maxSqlConnectionAttempts->getVal(); }
+    unsigned int getMaxTransferMemMB() const { return _resultMaxTransferMemMB->getVal(); }
+    /// Return the transfer directory. This is customizable to allow for a
+    /// high performance volume.
+    std::string getTransferDir() const { return _resultTransferDir->getVal(); }
 
-    /// The size of the TCP connection pool witin the client API that is used
+    /// Return the minimum amount of memory per UberJob to keep in memory. This much transfer
+    /// data will be stored in memory regardless of other conditions.
+    unsigned int getTransferMinMBInMem() const { return _resultTransferMinMBInMem->getVal(); }
+
+    /// The size of the TCP connection pool within the client API that is used
     /// by the merger to pool result files from workers via the HTTP protocol.
     int getResultMaxHttpConnections() const { return _resultMaxHttpConnections->getVal(); }
 
@@ -169,13 +177,6 @@
     /// the method then the monitoring will be disabled.
     unsigned int czarStatsUpdateIvalSec() const { return _czarStatsUpdateIvalSec->getVal(); }
 
-    /// @return The maximum retain period for keeping in memory the relevant metrics
-    /// captured by the Czar monitoring system. If 0 is returned by the method then
-    /// query history archiving will be disabled.
-    /// @note Setting the limit too high may be potentially result in runing onto
-    /// the OOM situation.
-    unsigned int czarStatsRetainPeriodSec() const { return _czarStatsRetainPeriodSec->getVal(); }
-
     /// A worker is considered fully ALIVE if the last update from the worker has been
     /// heard in less than _activeWorkerTimeoutAliveSecs seconds.
     int getActiveWorkerTimeoutAliveSecs() const { return _activeWorkerTimeoutAliveSecs->getVal(); }
@@ -306,6 +307,14 @@
     CVTIntPtr _oldestAsyncResultKeptSeconds = util::ConfigValTInt::create(
             _configValMap, "resultdb", "oldestAsyncResultKeptSeconds", notReq, 3600);
 
+    // This must be larger than _maxTableSizeMB when using the "memory" TransferMethod
+    CVTUIntPtr _resultMaxTransferMemMB =
+            util::ConfigValTUInt::create(_configValMap, "resultdb", "maxTransferMemMB", notReq, 10000);
+    CVTStrPtr _resultTransferDir =
+            util::ConfigValTStr::create(_configValMap, "resultdb", "transferDir", notReq, "/tmp");
+    CVTUIntPtr _resultTransferMinMBInMem =
+            util::ConfigValTUInt::create(_configValMap, "resultdb", "transferMinMBInMem", notReq, 10);
+
     /// Get all the elements in the css section.
     CVTStrPtr _cssTechnology =
             util::ConfigValTStr::create(_configValMap, "css", "technology", notReq, "mysql");

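The comment on _resultMaxTransferMemMB notes that it must exceed _maxTableSizeMB when transfers are kept in memory. Below is a hedged sketch of a startup sanity check built on the new getters; validateTransferConfig() is hypothetical (not part of this commit), and it assumes CzarConfig is reachable in namespace lsst::qserv::cconfig and has already been loaded by the caller.

// Hypothetical startup check (not part of this commit) showing how the new
// getters above could be used to catch a mis-configured czar early.
#include <filesystem>
#include <stdexcept>
#include <string>

#include "cconfig/CzarConfig.h"

void validateTransferConfig(lsst::qserv::cconfig::CzarConfig const& cfg) {
    // Per the comment on _resultMaxTransferMemMB, the in-memory transfer cap
    // should exceed the maximum result table size (0 is the testing value).
    if (cfg.getMaxTransferMemMB() != 0 &&
        cfg.getMaxTransferMemMB() <= static_cast<unsigned int>(cfg.getMaxTableSizeMB())) {
        throw std::invalid_argument("maxTransferMemMB should be larger than maxtablesize_mb");
    }
    // Overflow files are written under transferDir, so it must exist.
    if (!std::filesystem::is_directory(cfg.getTransferDir())) {
        throw std::invalid_argument("transferDir is not a directory: " + cfg.getTransferDir());
    }
}
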
src/ccontrol/MergingHandler.cc

Lines changed: 15 additions & 69 deletions
@@ -39,17 +39,17 @@
 #include "lsst/log/Log.h"
 
 // Qserv headers
+#include "cconfig/CzarConfig.h"
 #include "ccontrol/msgCode.h"
 #include "global/clock_defs.h"
 #include "global/debugUtil.h"
 #include "http/Client.h"
 #include "http/ClientConnPool.h"
 #include "http/Method.h"
-#include "mysql/CsvBuffer.h"
+#include "mysql/CsvMemDisk.h"
 #include "qdisp/CzarStats.h"
 #include "qdisp/Executive.h"
 #include "qdisp/JobQuery.h"
-#include "qdisp/QueryRequest.h"
 #include "qdisp/UberJob.h"
 #include "rproc/InfileMerger.h"
 #include "util/Bug.h"
@@ -84,7 +84,6 @@ lsst::qserv::TimeCountTracker<double>::CALLBACKFUNC const reportFileRecvRate =
     }
 };
 
-
 string readHttpFileAndMerge(lsst::qserv::qdisp::UberJob::Ptr const& uberJob, string const& httpUrl,
                             size_t fileSize, function<void(char const*, uint32_t)> const& messageIsReady,
                             shared_ptr<http::ClientConnPool> const& httpConnPool) {
@@ -180,51 +179,6 @@ MergingHandler::MergingHandler(std::shared_ptr<rproc::InfileMerger> const& merge
 
 MergingHandler::~MergingHandler() { LOGS(_log, LOG_LVL_TRACE, __func__); }
 
-
-bool MergingHandler::flush(proto::ResponseSummary const& resp) {
-    _wName = resp.wname();
-
-    // This is needed to ensure the job query would be staying alive for the duration
-    // of the operation to prevent inconsistency within the application.
-    auto const jobQuery = getJobQuery().lock();
-    if (jobQuery == nullptr) {
-        LOGS(_log, LOG_LVL_ERROR, __func__ << " failed, jobQuery was NULL");
-        return false;
-    }
-    auto const jobQuery = std::dynamic_pointer_cast<qdisp::JobQuery>(jobBase);
-
-    LOGS(_log, LOG_LVL_TRACE,
-         "MergingHandler::" << __func__ << " jobid=" << resp.jobid() << " transmitsize="
-                            << resp.transmitsize() << " rowcount=" << resp.rowcount() << " rowSize="
-                            << " attemptcount=" << resp.attemptcount() << " errorcode=" << resp.errorcode()
-                            << " errormsg=" << resp.errormsg());
-
-    if (resp.errorcode() != 0 || !resp.errormsg().empty()) {
-        _error = util::Error(resp.errorcode(), resp.errormsg(), util::ErrorCode::MYSQLEXEC);
-        _setError(ccontrol::MSG_RESULT_ERROR, _error.getMsg());
-        LOGS(_log, LOG_LVL_ERROR,
-             "MergingHandler::" << __func__ << " error from worker:" << resp.wname() << " error: " << _error);
-        // This way we can track if the worker has reported this error. The current implementation
-        // requires the large result size to be reported as an error via the InfileMerger regardless
-        // of an origin of the error (Czar or the worker). Note that large results can be produced
-        // by the Czar itself, e.g., when the aggregate result of multiple worker queries is too large
-        // or by the worker when the result set of a single query is too large.
-        // The error will be reported to the Czar as a part of the response summary.
-        if (resp.errorcode() == util::ErrorCode::WORKER_RESULT_TOO_LARGE) {
-            _infileMerger->setResultSizeLimitExceeded();
-        }
-        return false;
-    }
-
-    bool const success = _merge(resp, jobQuery);
-    if (success) {
-        _infileMerger->mergeCompleteFor(resp.jobid());
-        qdisp::CzarStats::get()->addTotalRowsRecv(resp.rowcount());
-        qdisp::CzarStats::get()->addTotalBytesRecv(resp.transmitsize());
-    }
-    return success;
-}
-
 void MergingHandler::errorFlush(std::string const& msg, int code) {
     _setError(code, msg, util::ErrorCode::RESULT_IMPORT);
     // Might want more info from result service.
@@ -243,13 +197,8 @@ qdisp::MergeEndStatus MergingHandler::_mergeHttp(qdisp::UberJob::Ptr const& uber
     }
 
     if (fileSize == 0) return qdisp::MergeEndStatus(true);
-
-    // Read from the http stream and push records into the CSV stream in a separate thread.
-    // Note the fixed capacity of the stream which allows up to 2 records to be buffered
-    // in the stream. This is enough to hide the latency of the HTTP connection and
-    // the time needed to read the file.
-    auto csvStream = mysql::CsvStream::create(2);
-    _csvStream = csvStream;
+    auto csvMemDisk = mysql::CsvMemDisk::create(fileSize, uberJob->getQueryId(), uberJob->getUjId());
+    _csvMemDisk = csvMemDisk;
 
     // This must be after setting _csvStream to avoid cancelFileMerge()
     // race issues, and it needs to be before the thread starts.
@@ -259,46 +208,46 @@ qdisp::MergeEndStatus MergingHandler::_mergeHttp(qdisp::UberJob::Ptr const& uber
     }
 
     string fileReadErrorMsg;
-    thread csvThread([uberJob, csvStream, fileUrl, fileSize, &fileReadErrorMsg]() {
+    auto transferFunc = [&]() {
         size_t bytesRead = 0;
         fileReadErrorMsg = ::readHttpFileAndMerge(
                 uberJob, fileUrl, fileSize,
-                [uberJob, csvStream, fileSize, &bytesRead](char const* buf, uint32_t size) {
+                [&](char const* buf, uint32_t size) {
                    bool last = false;
                    if (buf == nullptr || size == 0) {
                        last = true;
                    } else {
-                       csvStream->push(buf, size);
+                       csvMemDisk->push(buf, size);
                        bytesRead += size;
                        last = bytesRead >= fileSize;
                    }
                    if (last) {
-                       csvStream->push(nullptr, 0);
+                       csvMemDisk->push(nullptr, 0);
                    }
                 },
                 MergingHandler::_getHttpConnPool());
        // Push the stream terminator to indicate the end of the stream.
        // It may be needed to unblock the table merger which may be still attempting to read
        // from the CSV stream.
        if (!fileReadErrorMsg.empty()) {
-           csvStream->push(nullptr, 0);
+           csvMemDisk->push(nullptr, 0);
        }
-    });
+    };
+    csvMemDisk->transferDataFromWorker(transferFunc);
 
     // Attempt the actual merge.
-    bool fileMergeSuccess = _infileMerger->mergeHttp(uberJob, fileSize, csvStream);
+    bool fileMergeSuccess = _infileMerger->mergeHttp(uberJob, fileSize, csvMemDisk);
     if (!fileMergeSuccess) {
         LOGS(_log, LOG_LVL_WARN, __func__ << " merge failed");
         util::Error const& err = _infileMerger->getError();
         _setError(ccontrol::MSG_RESULT_ERROR, err.getMsg(), util::ErrorCode::RESULT_IMPORT);
     }
-    if (csvStream->getContaminated()) {
+    if (csvMemDisk->getContaminated()) {
         LOGS(_log, LOG_LVL_ERROR, __func__ << " merge stream contaminated");
         fileMergeSuccess = false;
         _setError(ccontrol::MSG_RESULT_ERROR, "merge stream contaminated", util::ErrorCode::RESULT_IMPORT);
     }
 
-    csvThread.join();
     if (!fileReadErrorMsg.empty()) {
         LOGS(_log, LOG_LVL_WARN, __func__ << " result file read failed");
         _setError(ccontrol::MSG_HTTP_RESULT, fileReadErrorMsg, util::ErrorCode::RESULT_IMPORT);
@@ -309,14 +258,14 @@ qdisp::MergeEndStatus MergingHandler::_mergeHttp(qdisp::UberJob::Ptr const& uber
     if (!mergeEStatus.success) {
         // This error check needs to come after the csvThread.join() to ensure writing
         // is finished. If any bytes were written, the result table is ruined.
-        mergeEStatus.contaminated = csvStream->getBytesWritten() > 0;
+        mergeEStatus.contaminated = csvMemDisk->getBytesFetched() > 0;
     }
 
     return mergeEStatus;
 }
 
 void MergingHandler::cancelFileMerge() {
-    auto csvStrm = _csvStream.lock();
+    auto csvStrm = _csvMemDisk.lock();
     if (csvStrm != nullptr) {
         csvStrm->cancel();
     }
@@ -342,9 +291,6 @@ qdisp::MergeEndStatus MergingHandler::flushHttp(string const& fileUrl, uint64_t
             "MergingHandler::" << __func__ << " uberJob=" << uberJob->getIdStr() << " fileUrl=" << fileUrl);
 
     qdisp::MergeEndStatus mergeStatus = _mergeHttp(uberJob, fileUrl, fileSize);
-    if (mergeStatus.success) {
-        _infileMerger->mergeCompleteFor(uberJob->getUjId());
-    }
     return mergeStatus;
 }

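The dedicated csvThread from the old code is gone: the transfer work is wrapped in transferFunc and handed to csvMemDisk->transferDataFromWorker(), so CsvMemDisk controls how the transfer runs and whether pushed data stays in memory or overflows to disk. The class below is a simplified stand-in, not the real mysql::CsvMemDisk; it only shows the shape of the push()/transferDataFromWorker()/getBytesFetched() interface used above, and its memory-limit and temporary-file behavior are assumptions.

// Simplified stand-in (not the real mysql::CsvMemDisk) showing the interface
// shape used in _mergeHttp(): push() buffers data in memory until an assumed
// per-transfer limit is reached, then spills to a temporary file, and
// transferDataFromWorker() runs the caller-supplied transfer function.
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <functional>
#include <string>
#include <utility>
#include <vector>

class CsvMemDiskSketch {
public:
    CsvMemDiskSketch(std::size_t memLimitBytes, std::string tmpPath)
            : _memLimit(memLimitBytes), _tmpPath(std::move(tmpPath)) {}

    // Append a block of csv data; a null buffer marks the end of the stream.
    void push(char const* buf, std::uint32_t size) {
        if (buf == nullptr || size == 0) return;  // stream terminator
        if (_mem.size() + size <= _memLimit) {
            _mem.insert(_mem.end(), buf, buf + size);  // stays in memory
        } else {
            if (!_file.is_open()) _file.open(_tmpPath, std::ios::binary);
            _file.write(buf, size);  // overflow goes to disk
        }
        _bytesFetched += size;
    }

    // Run the transfer on the calling thread (the real class may defer or
    // throttle transfers based on czar-wide memory use).
    void transferDataFromWorker(std::function<void()> const& transferFunc) { transferFunc(); }

    std::size_t getBytesFetched() const { return _bytesFetched; }

private:
    std::size_t _memLimit;
    std::string _tmpPath;
    std::vector<char> _mem;
    std::ofstream _file;
    std::size_t _bytesFetched = 0;
};
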
src/ccontrol/MergingHandler.h

Lines changed: 10 additions & 9 deletions
@@ -37,7 +37,7 @@ class ClientConnPool;
 } // namespace lsst::qserv::http
 
 namespace lsst::qserv::mysql {
-class CsvStream;
+class CsvMemDisk;
 } // namespace lsst::qserv::mysql
 
 namespace lsst::qserv::qdisp {
@@ -91,20 +91,21 @@ class MergingHandler : public qdisp::ResponseHandler {
     /// Set error code and string.
     void _setError(int code, std::string const& msg, int errorState);
 
-    /// Check if the query is no longer active.
-    /// This is used to prevent the query from being processed after it has been cancelled
-    /// or finished for any reason.
-    /// @param jobQuery the query to check
-    /// @return true if the query is no longer active
-    bool _queryIsNoLongerActive(std::shared_ptr<qdisp::JobQuery> const& jobQuery) const;
+    // All instances of the HTTP client class are members of the same pool. This allows
+    // connection reuse and a significant reduction of the kernel memory pressure.
+    // Note that the pool gets instantiated at the very first call to method _getHttpConnPool()
+    // because the instantiation depends on the availability of the Czar configuration.
+    static std::shared_ptr<http::ClientConnPool> const& _getHttpConnPool();
+    static std::shared_ptr<http::ClientConnPool> _httpConnPool;
+    static std::mutex _httpConnPoolMutex;
 
     std::shared_ptr<rproc::InfileMerger> _infileMerger;  ///< Merging delegate
     std::atomic<bool> _errorSet{false};                  ///< Set to true when an error is set.
     bool _flushed{false};                                ///< flushed to InfileMerger?
     std::string _wName{"~"};                             ///< worker name
 
-    std::weak_ptr<qdisp::Executive> _executive;  ///< Weak pointer to the executive for errors.
-    std::weak_ptr<mysql::CsvStream> _csvStream;  ///< Weak pointer to cancel infile merge.
+    std::weak_ptr<qdisp::Executive> _executive;    ///< Weak pointer to the executive for errors.
+    std::weak_ptr<mysql::CsvMemDisk> _csvMemDisk;  ///< Weak pointer to cancel infile merge.
 };
 
 } // namespace lsst::qserv::ccontrol

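The new static members document a lazily created, process-wide HTTP connection pool: it is built on the first call to _getHttpConnPool() because its size comes from the Czar configuration, which may not be loaded during static initialization. The snippet below is a generic sketch of that pattern with illustrative names (PoolSketch, makePoolFromConfig, getSharedPool); the actual definition lives in MergingHandler.cc and uses the class statics declared above rather than function-local statics.

// Generic sketch of a lazily-initialized shared pool guarded by a mutex.
// All names here are illustrative, not the actual qserv types.
#include <memory>
#include <mutex>

struct PoolSketch {
    explicit PoolSketch(int maxConnections) : maxConnections(maxConnections) {}
    int maxConnections;
};

std::shared_ptr<PoolSketch> makePoolFromConfig() {
    // Placeholder: the real code would size the pool from the czar configuration,
    // e.g. something like getResultMaxHttpConnections().
    return std::make_shared<PoolSketch>(8);
}

std::shared_ptr<PoolSketch> const& getSharedPool() {
    static std::mutex mtx;
    static std::shared_ptr<PoolSketch> pool;
    std::lock_guard<std::mutex> const lock(mtx);
    if (pool == nullptr) {
        pool = makePoolFromConfig();  // deferred until the configuration is available
    }
    return pool;
}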