Skip to content

Commit

Permalink
Improve the CPU performance by increasing the iterator directly (#710)
Browse files Browse the repository at this point in the history
* reorder the loops for better memory reuse

* typo fix

* bug fix for index

* format change

* using iterator

* using *(++) by matt

* Revert "using *(++) by matt"

This reverts commit 2e9ed70.

* merge two loops in Jacobian

* use *(++) again

* typo fix

* more bug fix

* break two loops for jacobian

* break forcing loops into two

* use const loop limit

* use *(++) for inline lu decomp mozart

* optimize inline linear solver

* update inline LU doolittle decomp
  • Loading branch information
sjsprecious authored Jan 24, 2025
1 parent c2192b7 commit b9264b2
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 32 deletions.
62 changes: 44 additions & 18 deletions include/micm/process/process_set.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -288,19 +288,36 @@ namespace micm
const std::size_t offset_state = i_group * state_variables.GroupSize();
const std::size_t offset_forcing = i_group * forcing.GroupSize();
std::vector<double> rate(L, 0);
for (std::size_t i_rxn = 0; i_rxn < number_of_reactants_.size(); ++i_rxn)
const std::size_t number_of_reactions = number_of_reactants_.size();
for (std::size_t i_rxn = 0; i_rxn < number_of_reactions; ++i_rxn)
{
const auto v_rate_subrange_begin = v_rate_constants_begin + offset_rc + (i_rxn * L);
rate.assign(v_rate_subrange_begin, v_rate_subrange_begin + L);
for (std::size_t i_react = 0; i_react < number_of_reactants_[i_rxn]; ++i_react)
const std::size_t number_of_reactants = number_of_reactants_[i_rxn];
for (std::size_t i_react = 0; i_react < number_of_reactants; ++i_react)
{
std::size_t idx_state_variables = offset_state + react_id[i_react] * L;
auto rate_it = rate.begin();
auto v_state_variables_it = v_state_variables.begin() + idx_state_variables;
for (std::size_t i_cell = 0; i_cell < L; ++i_cell)
rate[i_cell] *= v_state_variables[offset_state + react_id[i_react] * L + i_cell];
for (std::size_t i_react = 0; i_react < number_of_reactants_[i_rxn]; ++i_react)
*(rate_it++) *= *(v_state_variables_it++);
}
for (std::size_t i_react = 0; i_react < number_of_reactants; ++i_react)
{
auto v_forcing_it = v_forcing.begin() + offset_forcing + react_id[i_react] * L;
auto rate_it = rate.begin();
for (std::size_t i_cell = 0; i_cell < L; ++i_cell)
v_forcing[offset_forcing + react_id[i_react] * L + i_cell] -= rate[i_cell];
for (std::size_t i_prod = 0; i_prod < number_of_products_[i_rxn]; ++i_prod)
*(v_forcing_it++) -= *(rate_it++);
}
const std::size_t number_of_products = number_of_products_[i_rxn];
for (std::size_t i_prod = 0; i_prod < number_of_products; ++i_prod)
{
auto v_forcing_it = v_forcing.begin() + offset_forcing + prod_id[i_prod] * L;
auto rate_it = rate.begin();
auto yield_value = yield[i_prod];
for (std::size_t i_cell = 0; i_cell < L; ++i_cell)
v_forcing[offset_forcing + prod_id[i_prod] * L + i_cell] += yield[i_prod] * rate[i_cell];
*(v_forcing_it++) += yield_value * *(rate_it++);
}
react_id += number_of_reactants_[i_rxn];
prod_id += number_of_products_[i_rxn];
yield += number_of_products_[i_rxn];
Expand Down Expand Up @@ -388,33 +405,42 @@ namespace micm
const std::size_t offset_state = i_group * state_variables.GroupSize();
const std::size_t offset_jacobian = i_group * jacobian.GroupSize();
auto flat_id = jacobian_flat_ids_.begin();

for (std::size_t i_rxn = 0; i_rxn < number_of_reactants_.size(); ++i_rxn)
const std::size_t number_of_reactions = number_of_reactants_.size();
for (std::size_t i_rxn = 0; i_rxn < number_of_reactions; ++i_rxn)
{
for (std::size_t i_ind = 0; i_ind < number_of_reactants_[i_rxn]; ++i_ind)
{
auto v_rate_subrange_begin = v_rate_constants_begin + offset_rc + (i_rxn * L);
auto v_rate_subrange_begin = v_rate_constants_begin + offset_rc + (i_rxn * L);
const std::size_t number_of_reactants = number_of_reactants_[i_rxn];
const std::size_t number_of_products = number_of_products_[i_rxn];
for (std::size_t i_ind = 0; i_ind < number_of_reactants; ++i_ind)
{
d_rate_d_ind.assign(v_rate_subrange_begin, v_rate_subrange_begin + L);
for (std::size_t i_react = 0; i_react < number_of_reactants_[i_rxn]; ++i_react)
for (std::size_t i_react = 0; i_react < number_of_reactants; ++i_react)
{
if (i_react == i_ind)
{
continue;
}
const std::size_t idx_state_variables = offset_state + (react_id[i_react] * L);
auto v_state_variables_it = v_state_variables.begin() + idx_state_variables;
auto d_rate_d_ind_it = d_rate_d_ind.begin();
for (std::size_t i_cell = 0; i_cell < L; ++i_cell)
d_rate_d_ind[i_cell] *= v_state_variables[idx_state_variables + i_cell];
*(d_rate_d_ind_it++) *= *(v_state_variables_it++);
}
for (std::size_t i_dep = 0; i_dep < number_of_reactants_[i_rxn]; ++i_dep)
for (std::size_t i_dep = 0; i_dep < number_of_reactants; ++i_dep)
{
auto v_jacobian_it = v_jacobian.begin() + offset_jacobian + *flat_id;
auto d_rate_d_ind_it = d_rate_d_ind.begin();
for (std::size_t i_cell = 0; i_cell < L; ++i_cell)
v_jacobian[offset_jacobian + *flat_id + i_cell] += d_rate_d_ind[i_cell];
*(v_jacobian_it++) += *(d_rate_d_ind_it++);
++flat_id;
}
for (std::size_t i_dep = 0; i_dep < number_of_products_[i_rxn]; ++i_dep)
for (std::size_t i_dep = 0; i_dep < number_of_products; ++i_dep)
{
auto v_jacobian_it = v_jacobian.begin() + offset_jacobian + *flat_id;
auto d_rate_d_ind_it = d_rate_d_ind.begin();
auto yield_value = yield[i_dep];
for (std::size_t i_cell = 0; i_cell < L; ++i_cell)
v_jacobian[offset_jacobian + *flat_id + i_cell] -= yield[i_dep] * d_rate_d_ind[i_cell];
*(v_jacobian_it++) -= yield_value * *(d_rate_d_ind_it++);
++flat_id;
}
}
Expand Down
14 changes: 11 additions & 3 deletions include/micm/solver/linear_solver_in_place.inl
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,11 @@ namespace micm
{
const std::size_t Lij_yj_first = (*Lij_yj).first;
const std::size_t Lij_yj_second_times_n_cells = (*Lij_yj).second * n_cells;
auto LU_group_it = LU_group + Lij_yj_first;
auto x_group_it = x_group + Lij_yj_second_times_n_cells;
auto y_elem_it = y_elem;
for (std::size_t i_cell = 0; i_cell < n_cells; ++i_cell)
y_elem[i_cell] -= LU_group[Lij_yj_first + i_cell] * x_group[Lij_yj_second_times_n_cells + i_cell];
*(y_elem_it++) -= *(LU_group_it++) * *(x_group_it++);
++Lij_yj;
}
y_elem += n_cells;
Expand All @@ -160,13 +163,18 @@ namespace micm
{
const std::size_t Uij_xj_first = (*Uij_xj).first;
const std::size_t Uij_xj_second_times_n_cells = (*Uij_xj).second * n_cells;
auto LU_group_it = LU_group + Uij_xj_first;
auto x_group_it = x_group + Uij_xj_second_times_n_cells;
auto x_elem_it = x_elem;
for (std::size_t i_cell = 0; i_cell < n_cells; ++i_cell)
x_elem[i_cell] -= LU_group[Uij_xj_first + i_cell] * x_group[Uij_xj_second_times_n_cells + i_cell];
*(x_elem_it++) -= *(LU_group_it++) * *(x_group_it++);
++Uij_xj;
}
const std::size_t nUij_Uii_second = nUij_Uii.second;
auto LU_group_it = LU_group + nUij_Uii_second;
auto x_elem_it = x_elem;
for (std::size_t i_cell = 0; i_cell < n_cells; ++i_cell)
x_elem[i_cell] /= LU_group[nUij_Uii_second + i_cell];
*(x_elem_it++) /= *(LU_group_it++);

// don't iterate before the beginning of the vector
const std::size_t x_elem_distance = std::distance(x.AsVector().begin(), x_elem);
Expand Down
26 changes: 19 additions & 7 deletions include/micm/solver/lu_decomposition_doolittle_in_place.inl
Original file line number Diff line number Diff line change
Expand Up @@ -191,26 +191,38 @@ namespace micm
auto akj_aji = akj_aji_.begin();
for (const auto& nik_nki_aii : nik_nki_aii_)
{
for (std::size_t ik = 0; ik < std::get<0>(nik_nki_aii); ++ik)
const std::size_t ik_limit = std::get<0>(nik_nki_aii);
for (std::size_t ik = 0; ik < ik_limit; ++ik)
{
for (std::size_t jk = 0; jk < aik_njk->second; ++jk)
const std::size_t jk_limit = aik_njk->second;
for (std::size_t jk = 0; jk < jk_limit; ++jk)
{
auto ALU_vector_aik_njk_it = ALU_vector + aik_njk->first;
auto ALU_vector_aij_ajk_first_it = ALU_vector + aij_ajk->first;
auto ALU_vector_aij_ajk_second_it = ALU_vector + aij_ajk->second;
for (std::size_t i = 0; i < n_cells; ++i)
ALU_vector[aik_njk->first + i] -= ALU_vector[aij_ajk->first + i] * ALU_vector[aij_ajk->second + i];
*(ALU_vector_aik_njk_it++) -= *(ALU_vector_aij_ajk_first_it++) * *(ALU_vector_aij_ajk_second_it++);
++aij_ajk;
}
++aik_njk;
}
for (std::size_t ki = 0; ki < std::get<1>(nik_nki_aii); ++ki)
const std::size_t ki_limit = std::get<1>(nik_nki_aii);
for (std::size_t ki = 0; ki < ki_limit; ++ki)
{
for (std::size_t ji = 0; ji < aki_nji->second; ++ji)
const std::size_t ji_limit = aki_nji->second;
for (std::size_t ji = 0; ji < ji_limit; ++ji)
{
auto ALU_vector_aki_nji_it = ALU_vector + aki_nji->first;
auto ALU_vector_akj_aji_first_it = ALU_vector + akj_aji->first;
auto ALU_vector_akj_aji_second_it = ALU_vector + akj_aji->second;
for (std::size_t i = 0; i < n_cells; ++i)
ALU_vector[aki_nji->first + i] -= ALU_vector[akj_aji->first + i] * ALU_vector[akj_aji->second + i];
*(ALU_vector_aki_nji_it++) -= *(ALU_vector_akj_aji_first_it++) * *(ALU_vector_akj_aji_second_it++);
++akj_aji;
}
auto ALU_vector_aki_nji_it = ALU_vector + aki_nji->first;
auto ALU_vector_nik_nki_aii_it = ALU_vector + std::get<2>(nik_nki_aii);
for (std::size_t i = 0; i < n_cells; ++i)
ALU_vector[aki_nji->first + i] /= ALU_vector[std::get<2>(nik_nki_aii) + i];
*(ALU_vector_aki_nji_it++) /= *(ALU_vector_nik_nki_aii_it++);
++aki_nji;
}
}
Expand Down
15 changes: 11 additions & 4 deletions include/micm/solver/lu_decomposition_mozart_in_place.inl
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ namespace micm
const std::size_t ALU_BlockSize = ALU.NumberOfBlocks();
constexpr std::size_t ALU_GroupVectorSize = SparseMatrixPolicy::GroupVectorSize();
const std::size_t ALU_GroupSizeOfFlatBlockSize = ALU.GroupSize();
double Aii_inverse[ALU_GroupVectorSize];
std::vector<double>Aii_inverse(ALU_GroupVectorSize);

// Loop over groups of blocks
for (std::size_t i_group = 0; i_group < ALU.NumberOfGroups(ALU_BlockSize); ++i_group)
Expand All @@ -160,21 +160,28 @@ namespace micm
auto ajk_aji = ajk_aji_.begin();
for (const auto& aii_nji_nki : aii_nji_nki_)
{
auto Aii_inverse_it = Aii_inverse.begin();
auto ALU_vector_it = ALU_vector + std::get<0>(aii_nji_nki);
for (std::size_t i = 0; i < n_cells; ++i)
Aii_inverse[i] = 1.0 / ALU_vector[std::get<0>(aii_nji_nki) + i];
*(Aii_inverse_it++) = 1.0 / *(ALU_vector_it++);
for (std::size_t ij = 0; ij < std::get<1>(aii_nji_nki); ++ij)
{
auto ALU_vector_it = ALU_vector + *aji;
auto Aii_inverse_it = Aii_inverse.begin();
for (std::size_t i = 0; i < n_cells; ++i)
ALU_vector[*aji + i] *= Aii_inverse[i];
*(ALU_vector_it++) *= *(Aii_inverse_it++);
++aji;
}
for (std::size_t ik = 0; ik < std::get<2>(aii_nji_nki); ++ik)
{
const std::size_t aik = std::get<0>(*aik_njk);
for (std::size_t ijk = 0; ijk < std::get<1>(*aik_njk); ++ijk)
{
auto ALU_vector_first_it = ALU_vector + ajk_aji->first;
auto ALU_vector_second_it = ALU_vector + ajk_aji->second;
auto ALU_vector_aik_it = ALU_vector + aik;
for (std::size_t i = 0; i < n_cells; ++i)
ALU_vector[ajk_aji->first + i] -= ALU_vector[ajk_aji->second + i] * ALU_vector[aik + i];
*(ALU_vector_first_it++) -= *(ALU_vector_second_it++) * *(ALU_vector_aik_it++);
++ajk_aji;
}
++aik_njk;
Expand Down

0 comments on commit b9264b2

Please sign in to comment.