@@ -191,6 +191,7 @@ void ActionWithVector::runAllTasks() {
   unsigned nt=OpenMP::getNumThreads();
   if( nt*stride*10>nactive_tasks ) nt=nactive_tasks/stride/10;
   if( nt==0 ) nt=1;
+  if( myvals.size()!=nt ) myvals.resize(nt);
 
   // Get the total number of streamed quantities that we need
   // Get size for buffer
@@ -210,28 +211,31 @@ void ActionWithVector::runAllTasks() {
 
   // Recover the number of derivatives we require
   if( !doNotCalculateDerivatives() && !gridsInStream ) {
+    unsigned nargs = getNumberOfArguments(); int nmasks=getNumberOfMasks();
+    if( nargs>=nmasks && nmasks>0 ) nargs = nargs - nmasks;
     if( getNumberOfAtoms()>0 ) nderivatives += 3*getNumberOfAtoms() + 9;
     for(unsigned i=0; i<getNumberOfArguments(); ++i) nderivatives += getPntrToArgument(i)->getNumberOfValues();
   }
 
   #pragma omp parallel num_threads(nt)
   {
     std::vector<double> omp_buffer;
+    const unsigned t=OpenMP::getThreadNum();
     if( nt>1 ) omp_buffer.resize( bufsize, 0.0 );
-    MultiValue myvals( getNumberOfComponents(), nderivatives, 0 );
-    myvals.clearAll();
+    if( myvals[t].getNumberOfValues()!=getNumberOfComponents() || myvals[t].getNumberOfDerivatives()!=nderivatives ) myvals[t].resize( getNumberOfComponents(), nderivatives );
+    myvals[t].clearAll();
 
     #pragma omp for nowait
     for(unsigned i=rank; i<nactive_tasks; i+=stride) {
       // Calculate the stuff in the loop for this action
-      runTask( partialTaskList[i], myvals );
+      runTask( partialTaskList[i], myvals[t] );
 
       // Now transfer the data to the actions that accumulate values from the calculated quantities
-      if( nt>1 ) gatherAccumulators( partialTaskList[i], myvals, omp_buffer );
-      else gatherAccumulators( partialTaskList[i], myvals, buffer );
+      if( nt>1 ) gatherAccumulators( partialTaskList[i], myvals[t], omp_buffer );
+      else gatherAccumulators( partialTaskList[i], myvals[t], buffer );
 
       // Clear the value
-      myvals.clearAll();
+      myvals[t].clearAll();
     }
     #pragma omp critical
     if( nt>1 ) for(unsigned i=0; i<bufsize; ++i) buffer[i]+=omp_buffer[i];
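
The runAllTasks() hunks above stop constructing a fresh MultiValue inside every parallel region: myvals now looks like a per-thread vector of MultiValue objects (presumably a class member), sized to the number of OpenMP threads before the region, indexed by each thread with OpenMP::getThreadNum(), and resized only when the number of components or derivatives actually changes, so the allocation is reused from one call to the next. Below is a minimal, self-contained sketch of that reuse pattern under stated assumptions: Workspace, Driver, runAllTasks and the task bodies are hypothetical stand-ins, not PLUMED code, and plain OpenMP calls replace the PLUMED OpenMP wrappers.

#include <omp.h>
#include <algorithm>
#include <cstdio>
#include <vector>

// Stand-in for MultiValue: a scratch workspace that is costly to allocate.
struct Workspace {
  std::vector<double> deriv;
  void resize( unsigned nderiv ) { deriv.assign( nderiv, 0.0 ); }
  void clearAll() { std::fill( deriv.begin(), deriv.end(), 0.0 ); }
};

struct Driver {
  std::vector<Workspace> myvals;   // one workspace per thread, kept as a member between calls

  void runAllTasks( unsigned ntasks, unsigned nderiv, std::vector<double>& buffer ) {
    unsigned nt = omp_get_max_threads();
    if( myvals.size()!=nt ) myvals.resize( nt );      // allocated on the first call, reused afterwards

    #pragma omp parallel num_threads(nt)
    {
      const unsigned t = omp_get_thread_num();
      std::vector<double> omp_buffer;
      if( nt>1 ) omp_buffer.resize( buffer.size(), 0.0 );
      // Reallocate a thread's workspace only if the required dimensions changed
      if( myvals[t].deriv.size()!=nderiv ) myvals[t].resize( nderiv );
      myvals[t].clearAll();

      #pragma omp for nowait
      for(unsigned i=0; i<ntasks; ++i) {
        myvals[t].deriv[i%nderiv] = 1.0;              // "runTask" stand-in
        double v = myvals[t].deriv[i%nderiv];         // "gatherAccumulators" stand-in
        if( nt>1 ) omp_buffer[i%buffer.size()] += v;
        else buffer[i%buffer.size()] += v;
        myvals[t].clearAll();                         // wipe the contents, keep the allocation
      }
      #pragma omp critical
      if( nt>1 ) for(unsigned i=0; i<buffer.size(); ++i) buffer[i] += omp_buffer[i];
    }
  }
};

int main() {
  Driver d;
  std::vector<double> buffer( 8, 0.0 );
  d.runAllTasks( 100, 8, buffer );   // first call sizes the per-thread workspaces
  d.runAllTasks( 100, 8, buffer );   // second call reuses them without reallocating
  std::printf( "buffer[0]=%g\n", buffer[0] );
  return 0;
}
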
@@ -349,18 +353,14 @@ bool ActionWithVector::checkForForces() {
   unsigned nt=OpenMP::getNumThreads();
   if( nt*stride*10>nf_tasks ) nt=nf_tasks/stride/10;
   if( nt==0 ) nt=1;
+  if( myvals.size()!=nt ) myvals.resize(nt);
+  if( omp_forces.size()!=nt ) omp_forces.resize(nt);
 
-  // Now determine how big the multivalue needs to be
-  unsigned nmatrices=0; ActionWithMatrix* am=dynamic_cast<ActionWithMatrix*>(this);
-  if(am) {
-    for(unsigned i=0; i<getNumberOfComponents(); ++i) {
-      if( getConstPntrToComponent(i)->getRank()==2 && !getConstPntrToComponent(i)->hasDerivatives() ) { nmatrices=getNumberOfComponents(); }
-    }
-  }
   // Recover the number of derivatives we require (this should be equal to the number of forces)
-  unsigned nderiv=0;
+  unsigned nderiv=0, nargs = getNumberOfArguments(); int nmasks = getNumberOfMasks();
+  if( nargs>=nmasks && nmasks>0 ) nargs = nargs - nmasks;
   if( getNumberOfAtoms()>0 ) nderiv += 3*getNumberOfAtoms() + 9;
-  for(unsigned i=0; i<getNumberOfArguments(); ++i) {
+  for(unsigned i=0; i<nargs; ++i) {
     nderiv += getPntrToArgument(i)->getNumberOfStoredValues();
   }
   if( forcesForApply.size()!=nderiv ) forcesForApply.resize( nderiv );
@@ -369,23 +369,26 @@ bool ActionWithVector::checkForForces() {
 
   #pragma omp parallel num_threads(nt)
   {
-    std::vector<double> omp_forces;
-    if( nt>1 ) omp_forces.resize( forcesForApply.size(), 0.0 );
-    MultiValue myvals( getNumberOfComponents(), nderiv, nmatrices );
-    myvals.clearAll();
+    const unsigned t=OpenMP::getThreadNum();
+    if( nt>1 ) {
+      if( omp_forces[t].size()!=forcesForApply.size() ) omp_forces[t].resize( forcesForApply.size(), 0.0 );
+      else omp_forces[t].assign( forcesForApply.size(), 0.0 );
+    }
+    if( myvals[t].getNumberOfValues()!=getNumberOfComponents() || myvals[t].getNumberOfDerivatives()!=nderiv ) myvals[t].resize( getNumberOfComponents(), nderiv );
+    myvals[t].clearAll();
 
     #pragma omp for nowait
    for(unsigned i=rank; i<nf_tasks; i+=stride) {
-      runTask( force_tasks[i], myvals );
+      runTask( force_tasks[i], myvals[t] );
 
       // Now get the forces
-      if( nt>1 ) gatherForces( force_tasks[i], myvals, omp_forces );
-      else gatherForces( force_tasks[i], myvals, forcesForApply );
+      if( nt>1 ) gatherForces( force_tasks[i], myvals[t], omp_forces[t] );
+      else gatherForces( force_tasks[i], myvals[t], forcesForApply );
 
-      myvals.clearAll();
+      myvals[t].clearAll();
     }
     #pragma omp critical
-    if(nt>1) for(unsigned i=0; i<forcesForApply.size(); ++i) forcesForApply[i]+=omp_forces[i];
+    if(nt>1) for(unsigned i=0; i<forcesForApply.size(); ++i) forcesForApply[i]+=omp_forces[t][i];
   }
   // MPI Gather on forces
   if( !serial ) comm.Sum( forcesForApply );
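
The checkForForces() hunks make the same move for the force buffers: the per-region std::vector<double> omp_forces becomes what is presumably a member vector of per-thread buffers, and the nmatrices bookkeeping that the old MultiValue constructor needed is dropped. Because a recycled buffer still holds the forces from the previous call, the diff zeroes it explicitly: resize(n, 0.0) the first time the size is wrong, assign(n, 0.0) otherwise, before each thread's contribution is folded into forcesForApply in the critical section. A short sketch of that buffer-recycling logic follows, with hypothetical names and plain OpenMP in place of the PLUMED wrappers.

#include <omp.h>
#include <cstdio>
#include <vector>

// Per-thread force buffers kept in a member vector.  On first use a buffer is
// sized with resize(n, 0.0); on later calls the allocation is kept and
// assign(n, 0.0) merely re-zeroes the stale values from the previous call.
struct ForceDriver {
  std::vector< std::vector<double> > omp_forces;

  void checkForForces( unsigned nf_tasks, std::vector<double>& forcesForApply ) {
    unsigned nt = omp_get_max_threads();
    if( omp_forces.size()!=nt ) omp_forces.resize( nt );

    #pragma omp parallel num_threads(nt)
    {
      const unsigned t = omp_get_thread_num();
      if( nt>1 ) {
        if( omp_forces[t].size()!=forcesForApply.size() ) omp_forces[t].resize( forcesForApply.size(), 0.0 );
        else omp_forces[t].assign( forcesForApply.size(), 0.0 );
      }

      #pragma omp for nowait
      for(unsigned i=0; i<nf_tasks; ++i) {
        double f = 1.0;                                 // "gatherForces" stand-in
        if( nt>1 ) omp_forces[t][i%forcesForApply.size()] += f;
        else forcesForApply[i%forcesForApply.size()] += f;
      }
      // Fold each thread's private buffer into the shared force array
      #pragma omp critical
      if( nt>1 ) for(unsigned i=0; i<forcesForApply.size(); ++i) forcesForApply[i] += omp_forces[t][i];
    }
  }
};

int main() {
  ForceDriver d;
  std::vector<double> forces( 4, 0.0 );
  d.checkForForces( 40, forces );
  d.checkForForces( 40, forces );   // buffers are reused; assign() resets them to zero
  std::printf( "forces[0]=%g\n", forces[0] );
  return 0;
}
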