Skip to content

Feature: Implement stress_ewa_op for sincos parallel #6302

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: LTS
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 91 additions & 0 deletions source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1136,6 +1136,97 @@ template struct cal_force_npw_op<float, base_device::DEVICE_GPU>;
template struct cal_multi_dot_op<double, base_device::DEVICE_GPU>;
template struct cal_multi_dot_op<float, base_device::DEVICE_GPU>;

// CUDA kernel for stress Ewald sincos calculation
// Fixed: Parallelize over G-vectors instead of atoms to avoid race conditions
template <typename FPTYPE>
__global__ void cal_stress_ewa_sincos_kernel(
const int nat,
const int npw,
const int ig_gge0,
const FPTYPE* gcar,
const FPTYPE* tau,
const FPTYPE* zv_facts,
FPTYPE* rhostar_real,
FPTYPE* rhostar_imag)
{
const FPTYPE TWO_PI = 2.0 * M_PI;

// Parallelize over G-vectors (ig) instead of atoms (iat)
const int ig = blockIdx.x * blockDim.x + threadIdx.x;

if (ig >= npw) return;

// Skip G=0 term
if (ig == ig_gge0) return;

// Local accumulation variables for this G-vector
FPTYPE local_rhostar_real = 0.0;
FPTYPE local_rhostar_imag = 0.0;

// Load G-vector components to registers
const FPTYPE gcar_x = gcar[ig * 3 + 0];
const FPTYPE gcar_y = gcar[ig * 3 + 1];
const FPTYPE gcar_z = gcar[ig * 3 + 2];

// Loop over all atoms for this G-vector (matches CPU implementation)
for (int iat = 0; iat < nat; iat++) {
// Load atom information to registers
const FPTYPE tau_x = tau[iat * 3 + 0];
const FPTYPE tau_y = tau[iat * 3 + 1];
const FPTYPE tau_z = tau[iat * 3 + 2];
const FPTYPE zv = zv_facts[iat];

// Calculate phase: 2π * (G · τ) - same as CPU implementation
const FPTYPE phase = TWO_PI * (gcar_x * tau_x +
gcar_y * tau_y +
gcar_z * tau_z);

// Use CUDA intrinsic for sincos
FPTYPE sinp, cosp;
sincos(phase, &sinp, &cosp);

// Accumulate structure factor locally (no race conditions)
local_rhostar_real += zv * cosp;
local_rhostar_imag += zv * sinp;
}

// Store results - each thread writes to unique memory location
rhostar_real[ig] = local_rhostar_real;
rhostar_imag[ig] = local_rhostar_imag;
}

// GPU operators
template <typename FPTYPE>
void cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
const base_device::DEVICE_GPU* ctx,
const int& nat,
const int& npw,
const int& ig_gge0,
const FPTYPE* gcar,
const FPTYPE* tau,
const FPTYPE* zv_facts,
FPTYPE* rhostar_real,
FPTYPE* rhostar_imag)
{

// Calculate grid configuration for G-vector parallelization
const int threads_per_block = THREADS_PER_BLOCK;
const int num_blocks = (npw + threads_per_block - 1) / threads_per_block;

dim3 grid(num_blocks);
dim3 block(threads_per_block);

cal_stress_ewa_sincos_kernel<FPTYPE><<<grid, block>>>(
nat, npw, ig_gge0, gcar, tau, zv_facts,
rhostar_real, rhostar_imag
);

cudaCheckOnDebug();
}

template struct cal_stress_ewa_sincos_op<float, base_device::DEVICE_GPU>;
template struct cal_stress_ewa_sincos_op<double, base_device::DEVICE_GPU>;

// template struct prepare_vkb_deri_ptr_op<double, base_device::DEVICE_GPU>;
// template struct prepare_vkb_deri_ptr_op<float, base_device::DEVICE_GPU>;
} // namespace hamilt
92 changes: 92 additions & 0 deletions source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1117,4 +1117,96 @@ template struct cal_force_npw_op<float, base_device::DEVICE_GPU>;

template struct cal_multi_dot_op<double, base_device::DEVICE_GPU>;
template struct cal_multi_dot_op<float, base_device::DEVICE_GPU>;

// HIP kernel for stress Ewald sincos calculation
// Fixed: Parallelize over G-vectors instead of atoms to avoid race conditions
template <typename FPTYPE>
__global__ void cal_stress_ewa_sincos_kernel(
const int nat,
const int npw,
const int ig_gge0,
const FPTYPE* gcar,
const FPTYPE* tau,
const FPTYPE* zv_facts,
FPTYPE* rhostar_real,
FPTYPE* rhostar_imag)
{
const FPTYPE TWO_PI = 2.0 * M_PI;

// Parallelize over G-vectors (ig) instead of atoms (iat)
const int ig = blockIdx.x * blockDim.x + threadIdx.x;

if (ig >= npw) return;

// Skip G=0 term
if (ig == ig_gge0) return;

// Local accumulation variables for this G-vector
FPTYPE local_rhostar_real = 0.0;
FPTYPE local_rhostar_imag = 0.0;

// Load G-vector components to registers
const FPTYPE gcar_x = gcar[ig * 3 + 0];
const FPTYPE gcar_y = gcar[ig * 3 + 1];
const FPTYPE gcar_z = gcar[ig * 3 + 2];

// Loop over all atoms for this G-vector (matches CPU implementation)
for (int iat = 0; iat < nat; iat++) {
// Load atom information to registers
const FPTYPE tau_x = tau[iat * 3 + 0];
const FPTYPE tau_y = tau[iat * 3 + 1];
const FPTYPE tau_z = tau[iat * 3 + 2];
const FPTYPE zv = zv_facts[iat];

// Calculate phase: 2π * (G · τ) - same as CPU implementation
const FPTYPE phase = TWO_PI * (gcar_x * tau_x +
gcar_y * tau_y +
gcar_z * tau_z);

// Use HIP intrinsic for sincos
FPTYPE sinp, cosp;
sincos(phase, &sinp, &cosp);

// Accumulate structure factor locally (no race conditions)
local_rhostar_real += zv * cosp;
local_rhostar_imag += zv * sinp;
}

// Store results - each thread writes to unique memory location
rhostar_real[ig] = local_rhostar_real;
rhostar_imag[ig] = local_rhostar_imag;
}

// GPU operators
template <typename FPTYPE>
void cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
const base_device::DEVICE_GPU* ctx,
const int& nat,
const int& npw,
const int& ig_gge0,
const FPTYPE* gcar,
const FPTYPE* tau,
const FPTYPE* zv_facts,
FPTYPE* rhostar_real,
FPTYPE* rhostar_imag)
{
// Calculate grid configuration for G-vector parallelization
const int threads_per_block = THREADS_PER_BLOCK;
const int num_blocks = (npw + threads_per_block - 1) / threads_per_block;

// Use 1D grid since we're parallelizing over G-vectors only
dim3 grid(num_blocks);
dim3 block(threads_per_block);

hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_ewa_sincos_kernel<FPTYPE>), grid, block, 0, 0,
nat, npw, ig_gge0, gcar, tau, zv_facts,
rhostar_real, rhostar_imag
);

hipCheckOnDebug();
}

template struct cal_stress_ewa_sincos_op<float, base_device::DEVICE_GPU>;
template struct cal_stress_ewa_sincos_op<double, base_device::DEVICE_GPU>;

} // namespace hamilt
58 changes: 58 additions & 0 deletions source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -759,6 +759,64 @@ template struct cal_force_npw_op<double, base_device::DEVICE_CPU>;
template struct cal_multi_dot_op<float, base_device::DEVICE_CPU>;
template struct cal_multi_dot_op<double, base_device::DEVICE_CPU>;

// CPU implementation of Ewald stress sincos operator
template <typename FPTYPE>
struct cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_CPU>
{
void operator()(const base_device::DEVICE_CPU* ctx,
const int& nat,
const int& npw,
const int& ig_gge0,
const FPTYPE* gcar,
const FPTYPE* tau,
const FPTYPE* zv_facts,
FPTYPE* rhostar_real,
FPTYPE* rhostar_imag)
{
const FPTYPE TWO_PI = 2.0 * M_PI;

// Note: Arrays are already initialized to zero in the calling function
// No need to initialize again here to avoid redundant operations

#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int ig = 0; ig < npw; ig++) {
if (ig == ig_gge0) continue; // Skip G=0

FPTYPE local_rhostar_real = 0.0;
FPTYPE local_rhostar_imag = 0.0;

// Double loop: iat -> ig (as requested)
for (int iat = 0; iat < nat; iat++) {
const FPTYPE tau_x = tau[iat * 3 + 0];
const FPTYPE tau_y = tau[iat * 3 + 1];
const FPTYPE tau_z = tau[iat * 3 + 2];
const FPTYPE zv = zv_facts[iat];

// Calculate phase: 2π * (G · τ) - similar to cal_force_ewa phase
const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x +
gcar[ig * 3 + 1] * tau_y +
gcar[ig * 3 + 2] * tau_z);

// Calculate sincos
FPTYPE sinp, cosp;
ModuleBase::libm::sincos(phase, &sinp, &cosp);

// Accumulate structure factor
local_rhostar_real += zv * cosp;
local_rhostar_imag += zv * sinp;
}

// Store results
rhostar_real[ig] = local_rhostar_real;
rhostar_imag[ig] = local_rhostar_imag;
}
}
};

template struct cal_stress_ewa_sincos_op<float, base_device::DEVICE_CPU>;
template struct cal_stress_ewa_sincos_op<double, base_device::DEVICE_CPU>;

// template struct prepare_vkb_deri_ptr_op<float, base_device::DEVICE_CPU>;
// template struct prepare_vkb_deri_ptr_op<double, base_device::DEVICE_CPU>;
Expand Down
43 changes: 43 additions & 0 deletions source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,35 @@ struct cal_multi_dot_op{
const std::complex<FPTYPE>* psi);
};

template <typename FPTYPE, typename Device>
struct cal_stress_ewa_sincos_op
{
/// @brief Calculate Ewald stress sincos computation
/// Only computes rhostar, other calculations remain in original function
///
/// Input Parameters
/// @param ctx - which device this function runs on
/// @param nat - total number of atoms
/// @param npw - number of plane waves
/// @param ig_gge0 - index of G=0 vector (-1 if not present)
/// @param gcar - G-vector Cartesian coordinates [npw * 3]
/// @param tau - atomic positions [nat * 3]
/// @param zv_facts - precomputed zv factors for each atom [nat]
///
/// Output Parameters
/// @param rhostar_real - real part of structure factor [npw]
/// @param rhostar_imag - imaginary part of structure factor [npw]
void operator()(const Device* ctx,
const int& nat,
const int& npw,
const int& ig_gge0,
const FPTYPE* gcar,
const FPTYPE* tau,
const FPTYPE* zv_facts,
FPTYPE* rhostar_real,
FPTYPE* rhostar_imag);
};


#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
template <typename FPTYPE>
Expand Down Expand Up @@ -434,6 +463,20 @@ struct cal_multi_dot_op<FPTYPE, base_device::DEVICE_GPU>{
const std::complex<FPTYPE>* psi);
};

template <typename FPTYPE>
struct cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_GPU>
{
void operator()(const base_device::DEVICE_GPU* ctx,
const int& nat,
const int& npw,
const int& ig_gge0,
const FPTYPE* gcar,
const FPTYPE* tau,
const FPTYPE* zv_facts,
FPTYPE* rhostar_real,
FPTYPE* rhostar_imag);
};

/**
* The operator is used to compute the auxiliary amount of stress /force
* in parallel on the GPU. They identify type with the type provided and
Expand Down
Loading
Loading