
Commit fcd3698

jgates108 authored and fritzm committed
Added memory/disk hybrid for transferring csv files.
1 parent 84107b2 commit fcd3698

40 files changed: +869 −377 lines changed

deploy/compose/docker-compose.yml

Lines changed: 8 additions & 0 deletions
@@ -36,6 +36,7 @@ volumes:
   volume_czar_xrootd:
   volume_czar_home:
   volume_czar_cfg:
+  volume_czar_transfer:
 
   volume_czar_mariadb_data:
   volume_czar_mariadb_cfg:
@@ -268,6 +269,10 @@ services:
       - type: volume
         source: volume_czar_mariadb_run
         target: /qserv/mariadb/run
+      - type: volume
+        source: volume_czar_transfer
+        target: /tmp
+
       - << : *log-volume
     expose:
       - "3306" # for czar-mariadb
@@ -304,6 +309,9 @@ services:
       - type: volume
         source: volume_czar_cfg
         target: /config-etc
+      - type: volume
+        source: volume_czar_transfer
+        target: /tmp
       - type: volume
         source: volume_czar_home
         target: /home/qserv
Lines changed: 18 additions & 13 deletions
@@ -1,14 +1,19 @@
--- -------------------------------------------------------------------
--- Rename table QStatsTmp into QProgress to reflect its purpose
--- and add a foreign key constraint to QInfo table.
--- This table tracks chunk processing progress of the running queries.
--- -------------------------------------------------------------------
-ALTER TABLE QStatsTmp RENAME AS QProgress;
-ALTER TABLE QProgress ADD CONSTRAINT `fk_queryId` FOREIGN KEY (`queryId`) REFERENCES `QInfo` (`queryId`) ON DELETE CASCADE ON UPDATE CASCADE;
-ALTER TABLE QProgress COMMENT = 'Table to track chunk processing progress of the running queries.';
+-- -----------------------------------------------------
+-- Table `chunkMap`
+-- -----------------------------------------------------
+CREATE TABLE IF NOT EXISTS `chunkMap` (
+    `worker` VARCHAR(256) NOT NULL COMMENT 'A unique identifier of a worker hosting the chunk replica',
+    `database` VARCHAR(256) NOT NULL COMMENT 'The name of a database',
+    `table` VARCHAR(256) NOT NULL COMMENT 'The name of a table',
+    `chunk` INT UNSIGNED NOT NULL COMMENT 'The number of a chunk',
+    `size` BIGINT UNSIGNED NOT NULL COMMENT 'The size of a chunk')
+    ENGINE = InnoDB
+    COMMENT = 'Chunk disposition across workers';
 
--- -------------------------------------------------------------------
--- Drop the QWorker table as it is no longer needed.
--- This table was used to track worker nodes and their statuses.
--- -------------------------------------------------------------------
-DROP TABLE IF EXISTS QWorker;
+-- -----------------------------------------------------
+-- Table `chunkMapStatus`
+-- -----------------------------------------------------
+CREATE TABLE IF NOT EXISTS `chunkMapStatus` (
+    `update_time` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'The most recent update time of the map')
+    ENGINE = InnoDB
+    COMMENT = 'Status info on the chunk map';

src/admin/templates/proxy/etc/qserv-czar.cnf.jinja

Lines changed: 17 additions & 0 deletions
@@ -25,12 +25,29 @@ port = {{ czar_db_port }}
 # Any table in resultdb that hasn't been updated in this many days is deleted.
 oldestResultKeptDays = 7
 
+# Either this should be changed to a high performance docker volume directory
+# or /tmp should be mounted as a high performance docker volume directory
+# to avoid using limited docker memory to store the contents.
+transferDir = /tmp
+
 # maximum number of connection retries to SQL database (per connection attempt)
 maxsqlconnectionattempts = 10
 
 # maximum user query result size in MB
 maxtablesize_mb = 5100
 
+# maximum number of MB of concurrent csv transfer files allowed to be kept in
+# memory; after this point they will be temporarily written to disk.
+# 0 is used for testing. 10000 is usually reasonable.
+maxTransferMemMB = 0
+
+# minimum number of MB for each csv transfer file to be kept in memory
+# before possibly going to disk.
+# 0 for testing, up to 10 should be reasonable.
+transferMinMBInMem = 0
+
+
 
 # database connection for QMeta database
 [qmeta]

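The three new keys above (transferDir, maxTransferMemMB, transferMinMBInMem) control where transferred csv data lives while a result is being merged. The sketch below is illustrative only and is not the qserv implementation: the helper keepInMemory() and the czar-wide byte counter are hypothetical names, used here just to show how a per-transfer minimum and a global memory cap could decide when a transfer spills to a file under transferDir.

// Minimal sketch, not the actual qserv implementation: illustrates how the
// per-transfer minimum (transferMinMBInMem) and the czar-wide cap
// (maxTransferMemMB) from the config above could decide whether the next
// block of csv data stays in memory or spills to a file under transferDir.
#include <atomic>
#include <cstddef>

namespace sketch {

// Hypothetical czar-wide counter of bytes currently buffered in memory.
std::atomic<std::size_t> totalMemBytes{0};

// Hypothetical helper: 'inMemBytes' is what this transfer already holds in memory.
bool keepInMemory(std::size_t inMemBytes, std::size_t blockSize,
                  std::size_t maxTransferMemMB, std::size_t transferMinMBInMem) {
    std::size_t const minPerTransfer = transferMinMBInMem * 1024 * 1024;
    std::size_t const czarWideCap = maxTransferMemMB * 1024 * 1024;
    // Every transfer is allowed at least its configured minimum in memory.
    if (inMemBytes + blockSize <= minPerTransfer) return true;
    // Beyond that, stay in memory only while the czar-wide cap is not exceeded.
    return totalMemBytes.load() + blockSize <= czarWideCap;
}

}  // namespace sketch
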
src/cconfig/CzarConfig.h

Lines changed: 17 additions & 8 deletions
@@ -130,8 +130,16 @@ class CzarConfig {
     /// Getters for result aggregation options.
     int getMaxTableSizeMB() const { return _maxTableSizeMB->getVal(); }
     int getMaxSqlConnectionAttempts() const { return _maxSqlConnectionAttempts->getVal(); }
+    unsigned int getMaxTransferMemMB() const { return _resultMaxTransferMemMB->getVal(); }
+    /// Return the transfer directory. This is customizable to allow for a
+    /// high performance volume.
+    std::string getTransferDir() const { return _resultTransferDir->getVal(); }
 
-    /// The size of the TCP connection pool witin the client API that is used
+    /// Return the minimum amount of memory per UberJob to keep in memory. This much transfer
+    /// data will be stored in memory regardless of other conditions.
+    unsigned int getTransferMinMBInMem() const { return _resultTransferMinMBInMem->getVal(); }
+
+    /// The size of the TCP connection pool within the client API that is used
     /// by the merger to pool result files from workers via the HTTP protocol.
     int getResultMaxHttpConnections() const { return _resultMaxHttpConnections->getVal(); }
 
@@ -169,13 +177,6 @@
     /// the method then the monitoring will be disabled.
     unsigned int czarStatsUpdateIvalSec() const { return _czarStatsUpdateIvalSec->getVal(); }
 
-    /// @return The maximum retain period for keeping in memory the relevant metrics
-    /// captured by the Czar monitoring system. If 0 is returned by the method then
-    /// query history archiving will be disabled.
-    /// @note Setting the limit too high may be potentially result in runing onto
-    /// the OOM situation.
-    unsigned int czarStatsRetainPeriodSec() const { return _czarStatsRetainPeriodSec->getVal(); }
-
     /// A worker is considered fully ALIVE if the last update from the worker has been
     /// heard in less than _activeWorkerTimeoutAliveSecs seconds.
     int getActiveWorkerTimeoutAliveSecs() const { return _activeWorkerTimeoutAliveSecs->getVal(); }
@@ -306,6 +307,14 @@
     CVTIntPtr _oldestAsyncResultKeptSeconds = util::ConfigValTInt::create(
             _configValMap, "resultdb", "oldestAsyncResultKeptSeconds", notReq, 3600);
 
+    // This must be larger than _maxTableSizeMB when using the "memory" TransferMethod
+    CVTUIntPtr _resultMaxTransferMemMB =
+            util::ConfigValTUInt::create(_configValMap, "resultdb", "maxTransferMemMB", notReq, 10000);
+    CVTStrPtr _resultTransferDir =
+            util::ConfigValTStr::create(_configValMap, "resultdb", "transferDir", notReq, "/tmp");
+    CVTUIntPtr _resultTransferMinMBInMem =
+            util::ConfigValTUInt::create(_configValMap, "resultdb", "transferMinMBInMem", notReq, 10);
+
     /// Get all the elements in the css section.
     CVTStrPtr _cssTechnology =
             util::ConfigValTStr::create(_configValMap, "css", "technology", notReq, "mysql");

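The comment on _resultMaxTransferMemMB notes that it must exceed _maxTableSizeMB when transfers are kept in memory. Below is a hedged sketch of a startup sanity check built on the new getters; validateTransferConfig() is hypothetical (not part of this commit), and it assumes CzarConfig is reachable in namespace lsst::qserv::cconfig and has already been loaded by the caller.

// Hypothetical startup check (not part of this commit) showing how the new
// getters above could be used to catch a mis-configured czar early.
#include <filesystem>
#include <stdexcept>
#include <string>

#include "cconfig/CzarConfig.h"

void validateTransferConfig(lsst::qserv::cconfig::CzarConfig const& cfg) {
    // Per the comment on _resultMaxTransferMemMB, the in-memory transfer cap
    // should exceed the maximum result table size (0 is the testing value).
    if (cfg.getMaxTransferMemMB() != 0 &&
        cfg.getMaxTransferMemMB() <= static_cast<unsigned int>(cfg.getMaxTableSizeMB())) {
        throw std::invalid_argument("maxTransferMemMB should be larger than maxtablesize_mb");
    }
    // Overflow files are written under transferDir, so it must exist.
    if (!std::filesystem::is_directory(cfg.getTransferDir())) {
        throw std::invalid_argument("transferDir is not a directory: " + cfg.getTransferDir());
    }
}
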
src/ccontrol/MergingHandler.cc

Lines changed: 15 additions & 69 deletions
@@ -39,17 +39,17 @@
 #include "lsst/log/Log.h"
 
 // Qserv headers
+#include "cconfig/CzarConfig.h"
 #include "ccontrol/msgCode.h"
 #include "global/clock_defs.h"
 #include "global/debugUtil.h"
 #include "http/Client.h"
 #include "http/ClientConnPool.h"
 #include "http/Method.h"
-#include "mysql/CsvBuffer.h"
+#include "mysql/CsvMemDisk.h"
 #include "qdisp/CzarStats.h"
 #include "qdisp/Executive.h"
 #include "qdisp/JobQuery.h"
-#include "qdisp/QueryRequest.h"
 #include "qdisp/UberJob.h"
 #include "rproc/InfileMerger.h"
 #include "util/Bug.h"
@@ -84,7 +84,6 @@ lsst::qserv::TimeCountTracker<double>::CALLBACKFUNC const reportFileRecvRate =
     }
 };
 
-
 string readHttpFileAndMerge(lsst::qserv::qdisp::UberJob::Ptr const& uberJob, string const& httpUrl,
                             size_t fileSize, function<void(char const*, uint32_t)> const& messageIsReady,
                             shared_ptr<http::ClientConnPool> const& httpConnPool) {
@@ -180,51 +179,6 @@ MergingHandler::MergingHandler(std::shared_ptr<rproc::InfileMerger> const& merge
 
 MergingHandler::~MergingHandler() { LOGS(_log, LOG_LVL_TRACE, __func__); }
 
-
-bool MergingHandler::flush(proto::ResponseSummary const& resp) {
-    _wName = resp.wname();
-
-    // This is needed to ensure the job query would be staying alive for the duration
-    // of the operation to prevent inconsistency within the application.
-    auto const jobQuery = getJobQuery().lock();
-    if (jobQuery == nullptr) {
-        LOGS(_log, LOG_LVL_ERROR, __func__ << " failed, jobQuery was NULL");
-        return false;
-    }
-    auto const jobQuery = std::dynamic_pointer_cast<qdisp::JobQuery>(jobBase);
-
-    LOGS(_log, LOG_LVL_TRACE,
-         "MergingHandler::" << __func__ << " jobid=" << resp.jobid() << " transmitsize="
-                            << resp.transmitsize() << " rowcount=" << resp.rowcount() << " rowSize="
-                            << " attemptcount=" << resp.attemptcount() << " errorcode=" << resp.errorcode()
-                            << " errormsg=" << resp.errormsg());
-
-    if (resp.errorcode() != 0 || !resp.errormsg().empty()) {
-        _error = util::Error(resp.errorcode(), resp.errormsg(), util::ErrorCode::MYSQLEXEC);
-        _setError(ccontrol::MSG_RESULT_ERROR, _error.getMsg());
-        LOGS(_log, LOG_LVL_ERROR,
-             "MergingHandler::" << __func__ << " error from worker:" << resp.wname() << " error: " << _error);
-        // This way we can track if the worker has reported this error. The current implementation
-        // requires the large result size to be reported as an error via the InfileMerger regardless
-        // of an origin of the error (Czar or the worker). Note that large results can be produced
-        // by the Czar itself, e.g., when the aggregate result of multiple worker queries is too large
-        // or by the worker when the result set of a single query is too large.
-        // The error will be reported to the Czar as a part of the response summary.
-        if (resp.errorcode() == util::ErrorCode::WORKER_RESULT_TOO_LARGE) {
-            _infileMerger->setResultSizeLimitExceeded();
-        }
-        return false;
-    }
-
-    bool const success = _merge(resp, jobQuery);
-    if (success) {
-        _infileMerger->mergeCompleteFor(resp.jobid());
-        qdisp::CzarStats::get()->addTotalRowsRecv(resp.rowcount());
-        qdisp::CzarStats::get()->addTotalBytesRecv(resp.transmitsize());
-    }
-    return success;
-}
-
 void MergingHandler::errorFlush(std::string const& msg, int code) {
     _setError(code, msg, util::ErrorCode::RESULT_IMPORT);
     // Might want more info from result service.
@@ -243,13 +197,8 @@ qdisp::MergeEndStatus MergingHandler::_mergeHttp(qdisp::UberJob::Ptr const& uber
     }
 
     if (fileSize == 0) return qdisp::MergeEndStatus(true);
-
-    // Read from the http stream and push records into the CSV stream in a separate thread.
-    // Note the fixed capacity of the stream which allows up to 2 records to be buffered
-    // in the stream. This is enough to hide the latency of the HTTP connection and
-    // the time needed to read the file.
-    auto csvStream = mysql::CsvStream::create(2);
-    _csvStream = csvStream;
+    auto csvMemDisk = mysql::CsvMemDisk::create(fileSize, uberJob->getQueryId(), uberJob->getUjId());
+    _csvMemDisk = csvMemDisk;
 
     // This must be after setting _csvStream to avoid cancelFileMerge()
     // race issues, and it needs to be before the thread starts.
@@ -259,46 +208,46 @@ qdisp::MergeEndStatus MergingHandler::_mergeHttp(qdisp::UberJob::Ptr const& uber
     }
 
     string fileReadErrorMsg;
-    thread csvThread([uberJob, csvStream, fileUrl, fileSize, &fileReadErrorMsg]() {
+    auto transferFunc = [&]() {
         size_t bytesRead = 0;
         fileReadErrorMsg = ::readHttpFileAndMerge(
                 uberJob, fileUrl, fileSize,
-                [uberJob, csvStream, fileSize, &bytesRead](char const* buf, uint32_t size) {
+                [&](char const* buf, uint32_t size) {
                    bool last = false;
                    if (buf == nullptr || size == 0) {
                        last = true;
                    } else {
-                       csvStream->push(buf, size);
+                       csvMemDisk->push(buf, size);
                        bytesRead += size;
                        last = bytesRead >= fileSize;
                    }
                    if (last) {
-                       csvStream->push(nullptr, 0);
+                       csvMemDisk->push(nullptr, 0);
                    }
                 },
                 MergingHandler::_getHttpConnPool());
        // Push the stream terminator to indicate the end of the stream.
        // It may be needed to unblock the table merger which may be still attempting to read
        // from the CSV stream.
        if (!fileReadErrorMsg.empty()) {
-           csvStream->push(nullptr, 0);
+           csvMemDisk->push(nullptr, 0);
        }
-    });
+    };
+    csvMemDisk->transferDataFromWorker(transferFunc);
 
     // Attempt the actual merge.
-    bool fileMergeSuccess = _infileMerger->mergeHttp(uberJob, fileSize, csvStream);
+    bool fileMergeSuccess = _infileMerger->mergeHttp(uberJob, fileSize, csvMemDisk);
     if (!fileMergeSuccess) {
         LOGS(_log, LOG_LVL_WARN, __func__ << " merge failed");
         util::Error const& err = _infileMerger->getError();
         _setError(ccontrol::MSG_RESULT_ERROR, err.getMsg(), util::ErrorCode::RESULT_IMPORT);
     }
-    if (csvStream->getContaminated()) {
+    if (csvMemDisk->getContaminated()) {
         LOGS(_log, LOG_LVL_ERROR, __func__ << " merge stream contaminated");
         fileMergeSuccess = false;
         _setError(ccontrol::MSG_RESULT_ERROR, "merge stream contaminated", util::ErrorCode::RESULT_IMPORT);
     }
 
-    csvThread.join();
     if (!fileReadErrorMsg.empty()) {
         LOGS(_log, LOG_LVL_WARN, __func__ << " result file read failed");
         _setError(ccontrol::MSG_HTTP_RESULT, fileReadErrorMsg, util::ErrorCode::RESULT_IMPORT);
@@ -309,14 +258,14 @@ qdisp::MergeEndStatus MergingHandler::_mergeHttp(qdisp::UberJob::Ptr const& uber
     if (!mergeEStatus.success) {
         // This error check needs to come after the csvThread.join() to ensure writing
         // is finished. If any bytes were written, the result table is ruined.
-        mergeEStatus.contaminated = csvStream->getBytesWritten() > 0;
+        mergeEStatus.contaminated = csvMemDisk->getBytesFetched() > 0;
     }
 
     return mergeEStatus;
 }
 
 void MergingHandler::cancelFileMerge() {
-    auto csvStrm = _csvStream.lock();
+    auto csvStrm = _csvMemDisk.lock();
     if (csvStrm != nullptr) {
         csvStrm->cancel();
     }
@@ -342,9 +291,6 @@ qdisp::MergeEndStatus MergingHandler::flushHttp(string const& fileUrl, uint64_t
             "MergingHandler::" << __func__ << " uberJob=" << uberJob->getIdStr() << " fileUrl=" << fileUrl);
 
     qdisp::MergeEndStatus mergeStatus = _mergeHttp(uberJob, fileUrl, fileSize);
-    if (mergeStatus.success) {
-        _infileMerger->mergeCompleteFor(uberJob->getUjId());
-    }
     return mergeStatus;
 }

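The dedicated csvThread from the old code is gone: the transfer work is wrapped in transferFunc and handed to csvMemDisk->transferDataFromWorker(), so CsvMemDisk controls how the transfer runs and whether pushed data stays in memory or overflows to disk. The class below is a simplified stand-in, not the real mysql::CsvMemDisk; it only shows the shape of the push()/transferDataFromWorker()/getBytesFetched() interface used above, and its memory-limit and temporary-file behavior are assumptions.

// Simplified stand-in (not the real mysql::CsvMemDisk) showing the interface
// shape used in _mergeHttp(): push() buffers data in memory until an assumed
// per-transfer limit is reached, then spills to a temporary file, and
// transferDataFromWorker() runs the caller-supplied transfer function.
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <functional>
#include <string>
#include <utility>
#include <vector>

class CsvMemDiskSketch {
public:
    CsvMemDiskSketch(std::size_t memLimitBytes, std::string tmpPath)
            : _memLimit(memLimitBytes), _tmpPath(std::move(tmpPath)) {}

    // Append a block of csv data; a null buffer marks the end of the stream.
    void push(char const* buf, std::uint32_t size) {
        if (buf == nullptr || size == 0) return;  // stream terminator
        if (_mem.size() + size <= _memLimit) {
            _mem.insert(_mem.end(), buf, buf + size);  // stays in memory
        } else {
            if (!_file.is_open()) _file.open(_tmpPath, std::ios::binary);
            _file.write(buf, size);  // overflow goes to disk
        }
        _bytesFetched += size;
    }

    // Run the transfer on the calling thread (the real class may defer or
    // throttle transfers based on czar-wide memory use).
    void transferDataFromWorker(std::function<void()> const& transferFunc) { transferFunc(); }

    std::size_t getBytesFetched() const { return _bytesFetched; }

private:
    std::size_t _memLimit;
    std::string _tmpPath;
    std::vector<char> _mem;
    std::ofstream _file;
    std::size_t _bytesFetched = 0;
};
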
src/ccontrol/MergingHandler.h

Lines changed: 10 additions & 9 deletions
@@ -37,7 +37,7 @@ class ClientConnPool;
 } // namespace lsst::qserv::http
 
 namespace lsst::qserv::mysql {
-class CsvStream;
+class CsvMemDisk;
 } // namespace lsst::qserv::mysql
 
 namespace lsst::qserv::qdisp {
@@ -91,20 +91,21 @@ class MergingHandler : public qdisp::ResponseHandler {
     /// Set error code and string.
     void _setError(int code, std::string const& msg, int errorState);
 
-    /// Check if the query is no longer active.
-    /// This is used to prevent the query from being processed after it has been cancelled
-    /// or finished for any reason.
-    /// @param jobQuery the query to check
-    /// @return true if the query is no longer active
-    bool _queryIsNoLongerActive(std::shared_ptr<qdisp::JobQuery> const& jobQuery) const;
+    // All instances of the HTTP client class are members of the same pool. This allows
+    // connection reuse and a significant reduction of the kernel memory pressure.
+    // Note that the pool gets instantiated at the very first call to method _getHttpConnPool()
+    // because the instantiation depends on the availability of the Czar configuration.
+    static std::shared_ptr<http::ClientConnPool> const& _getHttpConnPool();
+    static std::shared_ptr<http::ClientConnPool> _httpConnPool;
+    static std::mutex _httpConnPoolMutex;
 
     std::shared_ptr<rproc::InfileMerger> _infileMerger;  ///< Merging delegate
     std::atomic<bool> _errorSet{false};                  ///< Set to true when an error is set.
     bool _flushed{false};                                ///< flushed to InfileMerger?
     std::string _wName{"~"};                             ///< worker name
 
-    std::weak_ptr<qdisp::Executive> _executive;  ///< Weak pointer to the executive for errors.
-    std::weak_ptr<mysql::CsvStream> _csvStream;  ///< Weak pointer to cancel infile merge.
+    std::weak_ptr<qdisp::Executive> _executive;    ///< Weak pointer to the executive for errors.
+    std::weak_ptr<mysql::CsvMemDisk> _csvMemDisk;  ///< Weak pointer to cancel infile merge.
 };
 
 } // namespace lsst::qserv::ccontrol

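The new static members document a lazily created, process-wide HTTP connection pool: it is built on the first call to _getHttpConnPool() because its size comes from the Czar configuration, which may not be loaded during static initialization. The snippet below is a generic sketch of that pattern with illustrative names (PoolSketch, makePoolFromConfig, getSharedPool); the actual definition lives in MergingHandler.cc and uses the class statics declared above rather than function-local statics.

// Generic sketch of a lazily-initialized shared pool guarded by a mutex.
// All names here are illustrative, not the actual qserv types.
#include <memory>
#include <mutex>

struct PoolSketch {
    explicit PoolSketch(int maxConnections) : maxConnections(maxConnections) {}
    int maxConnections;
};

std::shared_ptr<PoolSketch> makePoolFromConfig() {
    // Placeholder: the real code would size the pool from the czar configuration,
    // e.g. something like getResultMaxHttpConnections().
    return std::make_shared<PoolSketch>(8);
}

std::shared_ptr<PoolSketch> const& getSharedPool() {
    static std::mutex mtx;
    static std::shared_ptr<PoolSketch> pool;
    std::lock_guard<std::mutex> const lock(mtx);
    if (pool == nullptr) {
        pool = makePoolFromConfig();  // deferred until the configuration is available
    }
    return pool;
}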