52 changes: 52 additions & 0 deletions doc/development/pytorch-profiler.md
@@ -0,0 +1,52 @@
# PyTorch C++ Profiler Integration Test

This document describes the PyTorch profiler integration in the C++ backend and how to exercise it.

## Usage

1. Set environment variables:
```bash
export DP_ENABLE_PYTORCH_PROFILER=1
export DP_PYTORCH_PROFILER_OUTPUT_DIR=./profiler_results
```

2. Run your DeepMD-kit C++ application

3. Check for profiler output in the specified directory:
```bash
# For single-rank or non-MPI usage
ls -la ./profiler_results/pytorch_profiler_trace.json

# For MPI usage, each rank gets its own file
ls -la ./profiler_results/pytorch_profiler_trace_rank*.json
```

Because each rank's trace already carries a rank suffix, separate directories per rank are usually unnecessary. If you do want one directory per rank, each rank must be launched with its own environment; exporting the variable twice in the same shell simply overwrites the first value. One way is a wrapper script passed to `mpirun`:
```bash
#!/bin/bash
# wrapper.sh: give each rank its own output directory (OpenMPI; other MPI
# implementations expose the rank through a different environment variable)
export DP_PYTORCH_PROFILER_OUTPUT_DIR=./profiler_results_rank${OMPI_COMM_WORLD_RANK}
exec "$@"
```
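
For reference, here is a minimal C++ driver that exercises the profiler end to end. This is an illustrative sketch, not part of the PR: the model filename is a placeholder, `setenv` is POSIX-only, and any DeepMD-kit C++ application follows the same pattern, since profiling starts in the model constructor and the trace is written when the model object is destroyed:

```cpp
// minimal_profiled_driver.cc -- illustrative sketch, not part of this PR
#include <cstdlib>  // setenv (POSIX)
#include <vector>

#include "deepmd/DeepPot.h"

int main() {
  // init() reads these, so set them before the model is constructed
  setenv("DP_ENABLE_PYTORCH_PROFILER", "1", 1);
  setenv("DP_PYTORCH_PROFILER_OUTPUT_DIR", "./profiler_results", 1);

  {
    deepmd::DeepPot dp("model.pth");  // placeholder model file

    // Toy configuration: 3 atoms in a 20 A cubic cell, types O/H/H
    std::vector<double> coord = {0.0,   0.0,  0.0, 0.96, 0.0,
                                 0.0,   -0.24, 0.93, 0.0};
    std::vector<double> cell = {20., 0., 0., 0., 20., 0., 0., 0., 20.};
    std::vector<int> atype = {0, 1, 1};
    double ener;
    std::vector<double> force, virial;
    dp.compute(ener, force, virial, coord, atype, cell);
  }  // dp destroyed here: the profiler trace is flushed to ./profiler_results

  return 0;
}
```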

## Environment Variables

- `DP_ENABLE_PYTORCH_PROFILER`: Set to `1` or `true` to enable profiling
- `DP_PYTORCH_PROFILER_OUTPUT_DIR`: Directory for profiler output (default: `./profiler_output`)

## Implementation Details

The profiler uses PyTorch's Kineto profiler API (`torch::autograd::profiler`) and automatically:
- Creates the output directory if it doesn't exist
- Profiles all forward-pass operations in `DeepPotPT` and `DeepSpinPT`, with the forward calls marked as named regions (see the annotation sketch below)
- Saves the collected trace to a JSON file when the object is destroyed
- Includes the MPI rank in the filename when MPI is available and initialized
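
The named regions come from ATen's `RECORD_FUNCTION` macro, which is a cheap no-op whenever no profiler is listening. A minimal sketch of annotating your own region the same way (the function and region name `my_app::evaluate_model` are arbitrary examples, not part of the PR):

```cpp
#include <ATen/record_function.h>

#include <vector>

void evaluate_model() {
  // Adds a "my_app::evaluate_model" region to any active profiler trace.
  // The region scopes from here to the end of the enclosing function.
  RECORD_FUNCTION("my_app::evaluate_model", std::vector<c10::IValue>());
  // ... run the TorchScript module here ...
}
```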

## Output Files

- **Single-rank or non-MPI usage**: `pytorch_profiler_trace.json`
- **MPI usage**: `pytorch_profiler_trace_rank{rank}.json` (e.g., `pytorch_profiler_trace_rank0.json`, `pytorch_profiler_trace_rank1.json`)

This ensures that each MPI rank saves its profiling data to a separate file, preventing conflicts in multi-rank simulations.

This is intended for development and debugging purposes. The output files are Chrome trace JSON and can be inspected in `chrome://tracing` or Perfetto.
16 changes: 16 additions & 0 deletions doc/env.md
@@ -90,3 +90,19 @@ These environment variables also apply to third-party programs using the C++ interface
List of customized OP plugin libraries to load, such as `/path/to/plugin1.so:/path/to/plugin2.so` on Linux and `/path/to/plugin1.dll;/path/to/plugin2.dll` on Windows.

:::

:::{envvar} DP_ENABLE_PYTORCH_PROFILER

**Choices**: `0`, `1`, `true`; **Default**: `0`

{{ pytorch_icon }} Enable the PyTorch profiler for the C++ backend. This is intended for development and debugging purposes.

:::

:::{envvar} DP_PYTORCH_PROFILER_OUTPUT_DIR

**Default**: `./profiler_output`

{{ pytorch_icon }} Output directory for PyTorch profiler traces when `DP_ENABLE_PYTORCH_PROFILER` is enabled.

:::
5 changes: 5 additions & 0 deletions source/api_cc/CMakeLists.txt
@@ -49,6 +49,11 @@ set_target_properties(
INSTALL_RPATH_USE_LINK_PATH TRUE
BUILD_RPATH "$ORIGIN/../op/tf;$ORIGIN/../op/pt;$ORIGIN/../op/pd")
target_compile_definitions(${libname} PRIVATE TF_PRIVATE)
find_package(MPI)
if(MPI_FOUND)
  target_link_libraries(${libname} PRIVATE MPI::MPI_CXX)
  target_compile_definitions(${libname} PRIVATE USE_MPI)
endif()
if(CMAKE_TESTING_ENABLED)
target_link_libraries(${libname} PRIVATE coverage_config)
endif()
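
With `USE_MPI` defined by this block, the MPI calls inside `get_mpi_rank()` (declared in `common.h` below) can be compiled in conditionally and guarded by `MPI_Initialized`, so the same library works in non-MPI builds and before `MPI_Init`. A sketch of what such an implementation could look like (assumed; the real definition would live in `source/api_cc/src/common.cc`, which is outside this diff):

```cpp
#ifdef USE_MPI
#include <mpi.h>
#endif

int get_mpi_rank() {
#ifdef USE_MPI
  int initialized = 0;
  MPI_Initialized(&initialized);  // one of the few calls legal before MPI_Init
  if (initialized) {
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    return rank;
  }
#endif
  return -1;  // no MPI compiled in, or MPI_Init has not been called yet
}
```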
6 changes: 6 additions & 0 deletions source/api_cc/include/DeepPotPT.h
@@ -340,6 +340,12 @@ class DeepPotPT : public DeepPotBackend {
at::Tensor firstneigh_tensor;
c10::optional<torch::Tensor> mapping_tensor;
torch::Dict<std::string, torch::Tensor> comm_dict;
// PyTorch profiler state (configured from the environment in init())
bool profiler_enabled;
std::string profiler_output_dir;
#ifdef BUILD_PYTORCH
// Kineto result handle (requires <torch/csrc/autograd/profiler_kineto.h>)
std::shared_ptr<torch::autograd::profiler::ProfilerResult> profiler_result;
#endif
/**
* @brief Translate PyTorch exceptions to the DeePMD-kit exception.
* @param[in] f The function to run.
6 changes: 6 additions & 0 deletions source/api_cc/include/DeepSpinPT.h
@@ -262,6 +262,12 @@ class DeepSpinPT : public DeepSpinBackend {
at::Tensor firstneigh_tensor;
c10::optional<torch::Tensor> mapping_tensor;
torch::Dict<std::string, torch::Tensor> comm_dict;
// PyTorch profiler state (configured from the environment in init())
bool profiler_enabled;
std::string profiler_output_dir;
#ifdef BUILD_PYTORCH
// Kineto result handle (requires <torch/csrc/autograd/profiler_kineto.h>)
std::shared_ptr<torch::autograd::profiler::ProfilerResult> profiler_result;
#endif
/**
* @brief Translate PyTorch exceptions to the DeePMD-kit exception.
* @param[in] f The function to run.
30 changes: 30 additions & 0 deletions source/api_cc/include/common.h
@@ -163,6 +163,36 @@ void select_map_inv(typename std::vector<VT>::iterator out,
**/
void get_env_nthreads(int& num_intra_nthreads, int& num_inter_nthreads);

/**
* @brief Get PyTorch profiler configuration from environment variables.
* @param[out] enable_profiler Whether to enable the profiler. Read from
* DP_ENABLE_PYTORCH_PROFILER.
* @param[out] output_dir Output directory for profiler traces. Read from
* DP_PYTORCH_PROFILER_OUTPUT_DIR.
**/
void get_env_pytorch_profiler(bool& enable_profiler, std::string& output_dir);

/**
* @brief Get the MPI rank of the current process.
* @return The MPI rank if MPI support is compiled in (USE_MPI) and MPI has
* been initialized; -1 otherwise. Ranks can also be separated by pointing
* each rank at a different output directory.
**/
int get_mpi_rank();

/**
* @brief Create directories recursively in a cross-platform way.
* @param path The path to create.
* @return true if successful or directory already exists, false otherwise.
**/
bool create_directories(const std::string& path);

/**
* @brief Join two path components using platform-appropriate separator.
* @param path1 The first path component.
* @param path2 The second path component.
* @return The joined path.
**/
std::string join_path(const std::string& path1, const std::string& path2);

/**
* @brief Dynamically load OP library. This should be called before loading
* graphs.
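
The implementations of these helpers live outside this diff. A sketch of how they could be written with C++17 `<filesystem>` and `std::getenv` (assumed implementations, shown only to make the declared contracts concrete):

```cpp
#include <cstdlib>
#include <filesystem>
#include <string>
#include <system_error>

void get_env_pytorch_profiler(bool& enable_profiler, std::string& output_dir) {
  // DP_ENABLE_PYTORCH_PROFILER accepts "1" or "true"
  const char* enable = std::getenv("DP_ENABLE_PYTORCH_PROFILER");
  enable_profiler = enable != nullptr && (std::string(enable) == "1" ||
                                          std::string(enable) == "true");
  // DP_PYTORCH_PROFILER_OUTPUT_DIR falls back to the documented default
  const char* dir = std::getenv("DP_PYTORCH_PROFILER_OUTPUT_DIR");
  output_dir = (dir != nullptr) ? dir : "./profiler_output";
}

bool create_directories(const std::string& path) {
  // std::filesystem::create_directories returns false for an already existing
  // directory, so report success based on the end state instead
  std::error_code ec;
  std::filesystem::create_directories(path, ec);
  return !ec && std::filesystem::is_directory(path);
}

std::string join_path(const std::string& path1, const std::string& path2) {
  // operator/ inserts the platform-preferred separator
  return (std::filesystem::path(path1) / path2).string();
}
```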
66 changes: 63 additions & 3 deletions source/api_cc/src/DeepPotPT.cc
@@ -46,11 +46,11 @@ torch::Tensor createNlistTensor(const std::vector<std::vector<int>>& data) {
int nnei = nloc > 0 ? total_size / nloc : 0;
return flat_tensor.view({1, nloc, nnei});
}
DeepPotPT::DeepPotPT() : inited(false) {}
DeepPotPT::DeepPotPT() : inited(false), profiler_enabled(false) {}
DeepPotPT::DeepPotPT(const std::string& model,
const int& gpu_rank,
const std::string& file_content)
: inited(false) {
: inited(false), profiler_enabled(false) {
try {
translate_error([&] { init(model, gpu_rank, file_content); });
} catch (...) {
@@ -110,6 +110,26 @@ void DeepPotPT::init(const std::string& model,
}
}

// Initialize the PyTorch profiler from the environment
get_env_pytorch_profiler(profiler_enabled, profiler_output_dir);
if (profiler_enabled) {
#ifdef BUILD_PYTORCH
  // Create the output directory if it doesn't exist
  if (!create_directories(profiler_output_dir)) {
    std::cerr << "Warning: Failed to create profiler output directory: "
              << profiler_output_dir << std::endl;
  }
  std::cout << "PyTorch profiler enabled. Output directory: "
            << profiler_output_dir << std::endl;
  // Start the Kineto profiler (entry points from
  // torch/csrc/autograd/profiler_kineto.h; internal API, may move between
  // libtorch releases): record input shapes and memory, no stack traces.
  torch::profiler::impl::ProfilerConfig config(
      torch::profiler::impl::ProfilerState::KINETO,
      /*report_input_shapes=*/true,
      /*profile_memory=*/true,
      /*with_stack=*/false);
  std::set<torch::profiler::impl::ActivityType> activities{
      torch::profiler::impl::ActivityType::CPU,
      torch::profiler::impl::ActivityType::CUDA};
  torch::autograd::profiler::prepareProfiler(config, activities);
  torch::autograd::profiler::enableProfiler(config, activities);
#else
  std::cerr << "Warning: PyTorch profiler requested but BUILD_PYTORCH not defined"
            << std::endl;
#endif
}

auto rcut_ = module.run_method("get_rcut").toDouble();
rcut = static_cast<double>(rcut_);
ntypes = module.run_method("get_ntypes").toInt();
@@ -119,7 +139,31 @@
aparam_nall = module.run_method("is_aparam_nall").toBool();
inited = true;
}
DeepPotPT::~DeepPotPT() {}
DeepPotPT::~DeepPotPT() {
#ifdef BUILD_PYTORCH
  if (profiler_enabled) {
    try {
      // Build the output file name, including the MPI rank when available
      int rank = get_mpi_rank();
      std::string output_file;
      if (rank >= 0) {
        // MPI is available and initialized: include the rank in the filename
        output_file = join_path(
            profiler_output_dir,
            "pytorch_profiler_trace_rank" + std::to_string(rank) + ".json");
      } else {
        // MPI not available or not initialized: use the plain filename
        output_file =
            join_path(profiler_output_dir, "pytorch_profiler_trace.json");
      }
      // Stop profiling and write the Chrome trace JSON
      profiler_result = torch::autograd::profiler::disableProfiler();
      if (profiler_result) {
        profiler_result->save(output_file);
        std::cout << "PyTorch profiler results saved to: " << output_file
                  << std::endl;
      }
    } catch (const std::exception& e) {
      std::cerr << "Warning: Failed to save profiler results: " << e.what()
                << std::endl;
    }
  }
#endif
}

template <typename VALUETYPE, typename ENERGYVTYPE>
void DeepPotPT::compute(ENERGYVTYPE& ener,
@@ -234,6 +278,14 @@ void DeepPotPT::compute(ENERGYVTYPE& ener,
options)
.to(device);
}

#ifdef BUILD_PYTORCH
// Profiling (if enabled) already runs globally; mark this forward pass as a
// named region so it is easy to locate in the trace. RECORD_FUNCTION scopes
// from here to the end of the enclosing function and is a no-op otherwise.
RECORD_FUNCTION("DeepPotPT::compute_nlist", std::vector<c10::IValue>());
#endif

c10::Dict<c10::IValue, c10::IValue> outputs =
(do_message_passing)
? module
@@ -383,6 +435,14 @@ void DeepPotPT::compute(ENERGYVTYPE& ener,
inputs.push_back(aparam_tensor);
bool do_atom_virial_tensor = atomic;
inputs.push_back(do_atom_virial_tensor);

#ifdef BUILD_PYTORCH
// Mark this forward pass as a named region in the trace (no-op when the
// profiler is inactive); scopes to the end of the enclosing function.
RECORD_FUNCTION("DeepPotPT::compute", std::vector<c10::IValue>());
#endif

c10::Dict<c10::IValue, c10::IValue> outputs =
module.forward(inputs).toGenericDict();
c10::IValue energy_ = outputs.at("energy");
58 changes: 55 additions & 3 deletions source/api_cc/src/DeepSpinPT.cc
@@ -46,11 +46,11 @@ torch::Tensor createNlistTensor2(const std::vector<std::vector<int>>& data) {
int nnei = nloc > 0 ? total_size / nloc : 0;
return flat_tensor.view({1, nloc, nnei});
}
DeepSpinPT::DeepSpinPT() : inited(false) {}
DeepSpinPT::DeepSpinPT() : inited(false), profiler_enabled(false) {}
DeepSpinPT::DeepSpinPT(const std::string& model,
const int& gpu_rank,
const std::string& file_content)
: inited(false) {
: inited(false), profiler_enabled(false) {
try {
translate_error([&] { init(model, gpu_rank, file_content); });
} catch (...) {
@@ -110,6 +110,26 @@ void DeepSpinPT::init(const std::string& model,
}
}

// Initialize the PyTorch profiler from the environment
get_env_pytorch_profiler(profiler_enabled, profiler_output_dir);
if (profiler_enabled) {
#ifdef BUILD_PYTORCH
  // Create the output directory if it doesn't exist
  if (!create_directories(profiler_output_dir)) {
    std::cerr << "Warning: Failed to create profiler output directory: "
              << profiler_output_dir << std::endl;
  }
  std::cout << "PyTorch profiler enabled. Output directory: "
            << profiler_output_dir << std::endl;
  // Start the Kineto profiler (entry points from
  // torch/csrc/autograd/profiler_kineto.h; internal API, may move between
  // libtorch releases): record input shapes and memory, no stack traces.
  torch::profiler::impl::ProfilerConfig config(
      torch::profiler::impl::ProfilerState::KINETO,
      /*report_input_shapes=*/true,
      /*profile_memory=*/true,
      /*with_stack=*/false);
  std::set<torch::profiler::impl::ActivityType> activities{
      torch::profiler::impl::ActivityType::CPU,
      torch::profiler::impl::ActivityType::CUDA};
  torch::autograd::profiler::prepareProfiler(config, activities);
  torch::autograd::profiler::enableProfiler(config, activities);
#else
  std::cerr << "Warning: PyTorch profiler requested but BUILD_PYTORCH not defined"
            << std::endl;
#endif
}

auto rcut_ = module.run_method("get_rcut").toDouble();
rcut = static_cast<double>(rcut_);
ntypes = module.run_method("get_ntypes").toInt();
@@ -119,7 +139,31 @@
aparam_nall = module.run_method("is_aparam_nall").toBool();
inited = true;
}
DeepSpinPT::~DeepSpinPT() {}
DeepSpinPT::~DeepSpinPT() {
#ifdef BUILD_PYTORCH
  if (profiler_enabled) {
    try {
      // Build the output file name, including the MPI rank when available
      int rank = get_mpi_rank();
      std::string output_file;
      if (rank >= 0) {
        // MPI is available and initialized: include the rank in the filename
        output_file = join_path(
            profiler_output_dir,
            "pytorch_profiler_trace_rank" + std::to_string(rank) + ".json");
      } else {
        // MPI not available or not initialized: use the plain filename
        output_file =
            join_path(profiler_output_dir, "pytorch_profiler_trace.json");
      }
      // Stop profiling and write the Chrome trace JSON
      profiler_result = torch::autograd::profiler::disableProfiler();
      if (profiler_result) {
        profiler_result->save(output_file);
        std::cout << "PyTorch profiler results saved to: " << output_file
                  << std::endl;
      }
    } catch (const std::exception& e) {
      std::cerr << "Warning: Failed to save profiler results: " << e.what()
                << std::endl;
    }
  }
#endif
}

template <typename VALUETYPE, typename ENERGYVTYPE>
void DeepSpinPT::compute(ENERGYVTYPE& ener,
@@ -410,6 +454,14 @@
inputs.push_back(aparam_tensor);
bool do_atom_virial_tensor = atomic;
inputs.push_back(do_atom_virial_tensor);

#ifdef BUILD_PYTORCH
// Mark this forward pass as a named region in the trace (no-op when the
// profiler is inactive); scopes to the end of the enclosing function.
RECORD_FUNCTION("DeepSpinPT::compute", std::vector<c10::IValue>());
#endif

c10::Dict<c10::IValue, c10::IValue> outputs =
module.forward(inputs).toGenericDict();
c10::IValue energy_ = outputs.at("energy");