Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion NeoML/Python/src/PyDnnDistributed.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
/* Copyright © 2017-2023 ABBYY
/* Copyright © 2017-2024 ABBYY

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Expand Down
16 changes: 9 additions & 7 deletions NeoML/Python/src/PyDnnDistributed.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
/* Copyright © 2017-2023 ABBYY
/* Copyright © 2017-2024 ABBYY

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Expand All @@ -20,7 +21,7 @@ limitations under the License.

class CPyDistributedDataset : public IDistributedDataset {
public:
CPyDistributedDataset( const py::object& data ) : getData( data ) {};
CPyDistributedDataset( const py::object& data ) : getData( data ) {}
int SetInputBatch( CDnn& dnn, int thread ) override;
private:
py::object getData;
Expand All @@ -29,13 +30,14 @@ class CPyDistributedDataset : public IDistributedDataset {
class CPyDistributedTraining : public CDistributedTraining {
public:
CPyDistributedTraining( CDnn& dnn, int count, TDistributedInitializer initializer, int seed )
: CDistributedTraining( dnn, count, initializer, seed ) {};
: CDistributedTraining( dnn, count, initializer, seed ) {}
CPyDistributedTraining( CArchive& archive, int count, TDistributedInitializer initializer, int seed )
: CDistributedTraining( archive, count, initializer, seed ) {};
: CDistributedTraining( archive, count, initializer, seed ) {}
CPyDistributedTraining( CDnn& dnn, const CArray<int>& cudaDevs, TDistributedInitializer initializer, int seed )
: CDistributedTraining( dnn, cudaDevs, initializer, seed ) {};
: CDistributedTraining( dnn, cudaDevs, initializer, seed ) {}
CPyDistributedTraining( CArchive& archive, const CArray<int>& cudaDevs, TDistributedInitializer initializer, int seed )
: CDistributedTraining( archive, cudaDevs, initializer, seed ) {};
: CDistributedTraining( archive, cudaDevs, initializer, seed ) {}

void Run( const py::object& data );
void RunAndBackward( const py::object& data );
void Learn( const py::object& data );
Expand All @@ -46,4 +48,4 @@ class CPyDistributedTraining : public CDistributedTraining {
void Save( const std::string& path );
};

void InitializeDistributedTraining(py::module& m);
void InitializeDistributedTraining( py::module& m );
25 changes: 12 additions & 13 deletions NeoML/include/NeoML/Dnn/DnnDistributed.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ class NEOML_API CDistributedTraining {
virtual ~CDistributedTraining();

// Gets the number of models in distributed training
int GetModelCount() const { return cnns.Size(); }
int GetModelCount() const { return threadPool->Size(); }
// Sets the solver for all of the models
void SetSolver( CArchive& archive );
// Sets the learning rate for all of the models
Expand Down Expand Up @@ -101,27 +101,26 @@ class NEOML_API CDistributedTraining {

private:
struct CThreadParams;
// Run neural networks passes types
enum class TRunType { Invalid, RunOnce, RunBackwardOnce, Train };

// Either multi-threaded on a CPU or multi-device on GPUs
const bool isCpu;
// If multi-threads on a CPU, it is an operator of worker threads
IThreadPool* const threadPool;
CPtrOwner<IThreadPool> threadPool;
// Params to transfer to all threads function
CPtrOwner<CThreadParams> threadParams;
// Separate mathEngine for each thread or device both for CPU and GPU training
// Cannot use CPointerArray, as CreateDistributedCpuMathEngines requires a raw array to initialize engines
CArray<IMathEngine*> mathEngines;
// Separate random generator for each dnn in a thread
CPointerArray<CRandom> rands;
// Separate dnn for each thread
CPointerArray<CDnn> cnns;
// Separate `batchSize` for each dnn (may be empty) in a thread
CArray<int> batchSize;
// `Train()` cannot be called while `isFirstRun` is true
// `batchSize` may not be equal to 0 if `isFirstRun` is true for `RunOnce`, `RunAndBackwardOnce` or `RunAndLearnOnce`.
bool isFirstRun = true;
// Container for error messages, if any errors occurred
CArray<CString> errorMessages;

void initialize( CArchive& archive, int count, TDistributedInitializer initializer, int seed );

void initialize( CArchive& archive, int count,
TDistributedInitializer initializer, int seed, size_t memoryLimit, const int* cudaDevs = nullptr );
void serializeDnn( CDnn& dnn, int count,
TDistributedInitializer initializer, int seed, size_t memoryLimit, const int* cudaDevs = nullptr );
void run( IDistributedDataset*, TRunType );

friend class CLoraSerializer;
};
Expand Down
Loading