deepmodeling · jieli-matrix · Jun 17, 2025 · Jun 17, 2025 · Jun 25, 2025
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
@@ -1136,6 +1136,97 @@ template struct cal_force_npw_op<float, base_device::DEVICE_GPU>;
 template struct cal_multi_dot_op<double, base_device::DEVICE_GPU>;
 template struct cal_multi_dot_op<float, base_device::DEVICE_GPU>;
 
+// CUDA kernel for stress Ewald sincos calculation
+// Fixed: Parallelize over G-vectors instead of atoms to avoid race conditions
+template <typename FPTYPE>
+__global__ void cal_stress_ewa_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* zv_facts,
+    FPTYPE* rhostar_real,
+    FPTYPE* rhostar_imag)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+
+    // Parallelize over G-vectors (ig) instead of atoms (iat)
+    const int ig = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (ig >= npw) return;
+
+    // Skip G=0 term
+    if (ig == ig_gge0) return;
+
+    // Local accumulation variables for this G-vector
+    FPTYPE local_rhostar_real = 0.0;
+    FPTYPE local_rhostar_imag = 0.0;
+
+    // Load G-vector components to registers
+    const FPTYPE gcar_x = gcar[ig * 3 + 0];
+    const FPTYPE gcar_y = gcar[ig * 3 + 1];
+    const FPTYPE gcar_z = gcar[ig * 3 + 2];
+
+    // Loop over all atoms for this G-vector (matches CPU implementation)
+    for (int iat = 0; iat < nat; iat++) {
+        // Load atom information to registers
+        const FPTYPE tau_x = tau[iat * 3 + 0];
+        const FPTYPE tau_y = tau[iat * 3 + 1];
+        const FPTYPE tau_z = tau[iat * 3 + 2];
+        const FPTYPE zv = zv_facts[iat];
+
+        // Calculate phase: 2π * (G · τ) - same as CPU implementation
+        const FPTYPE phase = TWO_PI * (gcar_x * tau_x +
+                                       gcar_y * tau_y +
+                                       gcar_z * tau_z);
+
+        // Use CUDA intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+
+        // Accumulate structure factor locally (no race conditions)
+        local_rhostar_real += zv * cosp;
+        local_rhostar_imag += zv * sinp;
+    }
+
+    // Store results - each thread writes to unique memory location
+    rhostar_real[ig] = local_rhostar_real;
+    rhostar_imag[ig] = local_rhostar_imag;
+}
+
+// GPU operators
+template <typename FPTYPE>
+void cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* zv_facts,
+    FPTYPE* rhostar_real,
+    FPTYPE* rhostar_imag)
+{
+
+    // Calculate grid configuration for G-vector parallelization
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int num_blocks = (npw + threads_per_block - 1) / threads_per_block;
+
+    dim3 grid(num_blocks);
+    dim3 block(threads_per_block);
+
+    cal_stress_ewa_sincos_kernel<FPTYPE><<<grid, block>>>(
+        nat, npw, ig_gge0, gcar, tau, zv_facts,
+        rhostar_real, rhostar_imag
+    );
+
+    cudaCheckOnDebug();
+}
+
+template struct cal_stress_ewa_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_stress_ewa_sincos_op<double, base_device::DEVICE_GPU>;
+
 // template struct prepare_vkb_deri_ptr_op<double, base_device::DEVICE_GPU>;
 // template struct prepare_vkb_deri_ptr_op<float, base_device::DEVICE_GPU>;
 }  // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
@@ -1117,4 +1117,96 @@ template struct cal_force_npw_op<float, base_device::DEVICE_GPU>;
 
 template struct cal_multi_dot_op<double, base_device::DEVICE_GPU>;
 template struct cal_multi_dot_op<float, base_device::DEVICE_GPU>;
+
+// HIP kernel for stress Ewald sincos calculation
+// Fixed: Parallelize over G-vectors instead of atoms to avoid race conditions
+template <typename FPTYPE>
+__global__ void cal_stress_ewa_sincos_kernel(
+    const int nat,
+    const int npw,
+    const int ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* zv_facts,
+    FPTYPE* rhostar_real,
+    FPTYPE* rhostar_imag)
+{
+    const FPTYPE TWO_PI = 2.0 * M_PI;
+
+    // Parallelize over G-vectors (ig) instead of atoms (iat)
+    const int ig = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (ig >= npw) return;
+
+    // Skip G=0 term
+    if (ig == ig_gge0) return;
+
+    // Local accumulation variables for this G-vector
+    FPTYPE local_rhostar_real = 0.0;
+    FPTYPE local_rhostar_imag = 0.0;
+
+    // Load G-vector components to registers
+    const FPTYPE gcar_x = gcar[ig * 3 + 0];
+    const FPTYPE gcar_y = gcar[ig * 3 + 1];
+    const FPTYPE gcar_z = gcar[ig * 3 + 2];
+
+    // Loop over all atoms for this G-vector (matches CPU implementation)
+    for (int iat = 0; iat < nat; iat++) {
+        // Load atom information to registers
+        const FPTYPE tau_x = tau[iat * 3 + 0];
+        const FPTYPE tau_y = tau[iat * 3 + 1];
+        const FPTYPE tau_z = tau[iat * 3 + 2];
+        const FPTYPE zv = zv_facts[iat];
+
+        // Calculate phase: 2π * (G · τ) - same as CPU implementation
+        const FPTYPE phase = TWO_PI * (gcar_x * tau_x +
+                                       gcar_y * tau_y +
+                                       gcar_z * tau_z);
+
+        // Use HIP intrinsic for sincos
+        FPTYPE sinp, cosp;
+        sincos(phase, &sinp, &cosp);
+
+        // Accumulate structure factor locally (no race conditions)
+        local_rhostar_real += zv * cosp;
+        local_rhostar_imag += zv * sinp;
+    }
+
+    // Store results - each thread writes to unique memory location
+    rhostar_real[ig] = local_rhostar_real;
+    rhostar_imag[ig] = local_rhostar_imag;
+}
+
+// GPU operators
+template <typename FPTYPE>
+void cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_GPU>::operator()(
+    const base_device::DEVICE_GPU* ctx,
+    const int& nat,
+    const int& npw,
+    const int& ig_gge0,
+    const FPTYPE* gcar,
+    const FPTYPE* tau,
+    const FPTYPE* zv_facts,
+    FPTYPE* rhostar_real,
+    FPTYPE* rhostar_imag)
+{
+    // Calculate grid configuration for G-vector parallelization
+    const int threads_per_block = THREADS_PER_BLOCK;
+    const int num_blocks = (npw + threads_per_block - 1) / threads_per_block;
+
+    // Use 1D grid since we're parallelizing over G-vectors only
+    dim3 grid(num_blocks);
+    dim3 block(threads_per_block);
+
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_ewa_sincos_kernel<FPTYPE>), grid, block, 0, 0,
+        nat, npw, ig_gge0, gcar, tau, zv_facts,
+        rhostar_real, rhostar_imag
+    );
+
+    hipCheckOnDebug();
+}
+
+template struct cal_stress_ewa_sincos_op<float, base_device::DEVICE_GPU>;
+template struct cal_stress_ewa_sincos_op<double, base_device::DEVICE_GPU>;
+
 }  // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
@@ -759,6 +759,64 @@ template struct cal_force_npw_op<double, base_device::DEVICE_CPU>;
 template struct cal_multi_dot_op<float, base_device::DEVICE_CPU>;
 template struct cal_multi_dot_op<double, base_device::DEVICE_CPU>;
 
+// CPU implementation of Ewald stress sincos operator
+template <typename FPTYPE>
+struct cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_CPU>
+{
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* zv_facts,
+                    FPTYPE* rhostar_real,
+                    FPTYPE* rhostar_imag)
+    {
+        const FPTYPE TWO_PI = 2.0 * M_PI;
+
+        // Note: Arrays are already initialized to zero in the calling function
+        // No need to initialize again here to avoid redundant operations
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+        for (int ig = 0; ig < npw; ig++) {
+            if (ig == ig_gge0) continue; // Skip G=0
+
+            FPTYPE local_rhostar_real = 0.0;
+            FPTYPE local_rhostar_imag = 0.0;
+
+            // Double loop: iat -> ig (as requested)
+            for (int iat = 0; iat < nat; iat++) {
+                const FPTYPE tau_x = tau[iat * 3 + 0];
+                const FPTYPE tau_y = tau[iat * 3 + 1];
+                const FPTYPE tau_z = tau[iat * 3 + 2];
+                const FPTYPE zv = zv_facts[iat];
+
+                // Calculate phase: 2π * (G · τ) - similar to cal_force_ewa phase
+                const FPTYPE phase = TWO_PI * (gcar[ig * 3 + 0] * tau_x +
+                                               gcar[ig * 3 + 1] * tau_y +
+                                               gcar[ig * 3 + 2] * tau_z);
+
+                // Calculate sincos
+                FPTYPE sinp, cosp;
+                ModuleBase::libm::sincos(phase, &sinp, &cosp);
+
+                // Accumulate structure factor
+                local_rhostar_real += zv * cosp;
+                local_rhostar_imag += zv * sinp;
+            }
+
+            // Store results
+            rhostar_real[ig] = local_rhostar_real;
+            rhostar_imag[ig] = local_rhostar_imag;
+        }
+    }
+};
+
+template struct cal_stress_ewa_sincos_op<float, base_device::DEVICE_CPU>;
+template struct cal_stress_ewa_sincos_op<double, base_device::DEVICE_CPU>;
 
 // template struct prepare_vkb_deri_ptr_op<float, base_device::DEVICE_CPU>;
 // template struct prepare_vkb_deri_ptr_op<double, base_device::DEVICE_CPU>;

diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
@@ -260,6 +260,35 @@ struct cal_multi_dot_op{
                     const std::complex<FPTYPE>* psi);
 };
 
+template <typename FPTYPE, typename Device>
+struct cal_stress_ewa_sincos_op
+{
+    /// @brief Calculate Ewald stress sincos computation
+    /// Only computes rhostar, other calculations remain in original function
+    ///
+    /// Input Parameters
+    /// @param ctx - which device this function runs on
+    /// @param nat - total number of atoms
+    /// @param npw - number of plane waves
+    /// @param ig_gge0 - index of G=0 vector (-1 if not present)
+    /// @param gcar - G-vector Cartesian coordinates [npw * 3]
+    /// @param tau - atomic positions [nat * 3]
+    /// @param zv_facts - precomputed zv factors for each atom [nat]
+    ///
+    /// Output Parameters
+    /// @param rhostar_real - real part of structure factor [npw]
+    /// @param rhostar_imag - imaginary part of structure factor [npw]
+    void operator()(const Device* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* zv_facts,
+                    FPTYPE* rhostar_real,
+                    FPTYPE* rhostar_imag);
+};
+
 
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 template <typename FPTYPE>
@@ -434,6 +463,20 @@ struct cal_multi_dot_op<FPTYPE, base_device::DEVICE_GPU>{
                     const std::complex<FPTYPE>* psi);
 };
 
+template <typename FPTYPE>
+struct cal_stress_ewa_sincos_op<FPTYPE, base_device::DEVICE_GPU>
+{
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nat,
+                    const int& npw,
+                    const int& ig_gge0,
+                    const FPTYPE* gcar,
+                    const FPTYPE* tau,
+                    const FPTYPE* zv_facts,
+                    FPTYPE* rhostar_real,
+                    FPTYPE* rhostar_imag);
+};
+
 /**
  * The operator is used to compute the auxiliary amount of stress /force 
  * in parallel on the GPU. They identify type with the type provided and