neoml-lib · favorart · Jun 24, 2023
diff --git a/NeoML/Python/src/PyDnnDistributed.cpp b/NeoML/Python/src/PyDnnDistributed.cpp
@@ -1,4 +1,5 @@
-/* Copyright © 2017-2023 ABBYY
+/* Copyright © 2017-2024 ABBYY
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

diff --git a/NeoML/Python/src/PyDnnDistributed.h b/NeoML/Python/src/PyDnnDistributed.h
@@ -1,4 +1,5 @@
-/* Copyright © 2017-2023 ABBYY
+/* Copyright © 2017-2024 ABBYY
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -20,7 +21,7 @@ limitations under the License.
 
 class CPyDistributedDataset : public IDistributedDataset {
 public:
-	CPyDistributedDataset( const py::object& data ) : getData( data ) {};
+	CPyDistributedDataset( const py::object& data ) : getData( data ) {}
 	int SetInputBatch( CDnn& dnn, int thread ) override;
 private:
 	py::object getData;
@@ -29,13 +30,14 @@ class CPyDistributedDataset : public IDistributedDataset {
 class CPyDistributedTraining : public CDistributedTraining {
 public:
 	CPyDistributedTraining( CDnn& dnn, int count, TDistributedInitializer initializer, int seed )
-		: CDistributedTraining( dnn, count, initializer, seed ) {};
+		: CDistributedTraining( dnn, count, initializer, seed ) {}
 	CPyDistributedTraining( CArchive& archive, int count, TDistributedInitializer initializer, int seed )
-		: CDistributedTraining( archive, count, initializer, seed ) {};
+		: CDistributedTraining( archive, count, initializer, seed ) {}
 	CPyDistributedTraining( CDnn& dnn, const CArray<int>& cudaDevs, TDistributedInitializer initializer, int seed )
-		: CDistributedTraining( dnn, cudaDevs, initializer, seed ) {};
+		: CDistributedTraining( dnn, cudaDevs, initializer, seed ) {}
 	CPyDistributedTraining( CArchive& archive, const CArray<int>& cudaDevs, TDistributedInitializer initializer, int seed )
-		: CDistributedTraining( archive, cudaDevs, initializer, seed ) {};
+		: CDistributedTraining( archive, cudaDevs, initializer, seed ) {}
+
 	void Run( const py::object& data );
 	void RunAndBackward( const py::object& data );
 	void Learn( const py::object& data );
@@ -46,4 +48,4 @@ class CPyDistributedTraining : public CDistributedTraining {
 	void Save( const std::string& path );
 };
 
-void InitializeDistributedTraining(py::module& m);
+void InitializeDistributedTraining( py::module& m );
diff --git a/NeoML/include/NeoML/Dnn/DnnDistributed.h b/NeoML/include/NeoML/Dnn/DnnDistributed.h
@@ -63,7 +63,7 @@ class NEOML_API CDistributedTraining {
 	virtual ~CDistributedTraining();
 
 	// Gets the number of models in distributed training
-	int GetModelCount() const { return cnns.Size(); }
+	int GetModelCount() const { return threadPool->Size(); }
 	// Sets the solver for all of the models
 	void SetSolver( CArchive& archive );
 	// Sets the learning rate for all of the models
@@ -101,27 +101,26 @@ class NEOML_API CDistributedTraining {
 
 private:
 	struct CThreadParams;
+	// Types of passes that can be run over the neural networks
+	enum class TRunType { Invalid, RunOnce, RunBackwardOnce, Train };
 
-	// Either multi-threads on a CPU or multi-devices GPU
-	const bool isCpu;
 	// If multi-threads on a CPU, it is an operator of worker threads
-	IThreadPool* const threadPool;
+	CPtrOwner<IThreadPool> threadPool;
+	// Parameters passed to the function executed by every thread
+	CPtrOwner<CThreadParams> threadParams;
 	// Separate mathEngine for each thread or device both for CPU and GPU training
 	// Cannot use CPointerArray, as CreateDistributedCpuMathEngines requires a raw array to initialize engines
 	CArray<IMathEngine*> mathEngines;
 	// Separate random generator for each dnn in a thread
 	CPointerArray<CRandom> rands;
 	// Separate dnn for each thread
 	CPointerArray<CDnn> cnns;
-	// Separate `batchSize` for each dnn (may be empty) in a thread
-	CArray<int> batchSize;
-	// `Train()` cannot be called if it `isFirstRun`
-	// `batchSize` may not be equal 0, if it `isFirstRun` for `RunOnce`, `RunAndBackwardOnce` or `RunAndLearnOnce`.
-	bool isFirstRun = true;
-	// Containers for errors if it happened
-	CArray<CString> errorMessages;
-
-	void initialize( CArchive& archive, int count, TDistributedInitializer initializer, int seed );
+
+	void initialize( CArchive& archive, int count,
+		TDistributedInitializer initializer, int seed, size_t memoryLimit, const int* cudaDevs = nullptr );
+	void serializeDnn( CDnn& dnn, int count,
+		TDistributedInitializer initializer, int seed, size_t memoryLimit, const int* cudaDevs = nullptr );
+	void run( IDistributedDataset*, TRunType );
 
 	friend class CLoraSerializer;
 };