
Commit 62ee2a6

style: replace all post-increments with pre-increments
1 parent ddb400b commit 62ee2a6
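
For context, this is a pure style change: for the `int`/`int64_t` loop counters touched here, `i++` and `++i` compile to identical code, while `++i` is the conventional C++ default because post-increment on class types (iterators, for example) must materialize a copy of the old value. A minimal illustration, not code from this repository:

// Illustration only: why ++i is the usual default in C++ loops.
// For built-in integers both forms generate the same code; for class types,
// operator++(int) must return a copy of the previous value.
struct Counter {
    int v = 0;
    Counter &operator++() { ++v; return *this; }                        // pre-increment: no copy
    Counter operator++(int) { Counter old = *this; ++v; return old; }   // post-increment: extra copy
};

int main() {
    for (Counter c; c.v < 3; ++c) {}  // preferred: no temporary object
    for (Counter c; c.v < 3; c++) {}  // same behavior, but each step constructs a discarded copy
    return 0;
}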

9 files changed, +35 -35 lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ if(USE_CUDA)
     set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda --expt-relaxed-constexpr")
     file(GLOB_RECURSE CUDA_KERNELS ${PROJECT_SOURCE_DIR}/infini_train/src/*.cu)
     add_library(infini_train_cuda_kernels STATIC ${CUDA_KERNELS})
-    set_target_properties(infini_train_cuda_kernels PROPERTIES CUWDA_ARCHITECTURES "75;80;90")
+    set_target_properties(infini_train_cuda_kernels PROPERTIES CUDA_ARCHITECTURES "75;80;90")
     target_link_libraries(infini_train_cuda_kernels glog CUDA::cudart CUDA::cublas CUDA::cuda_driver)

     add_library(infini_train STATIC ${SRC})

example/common/tokenizer.cc

Lines changed: 2 additions & 2 deletions
@@ -59,7 +59,7 @@ int SampleMult(float *probabilities, int n, float coin) {
     // sample index from probabilities (they must sum to 1!)
     // coin is a random number in [0, 1), usually from RandomF32()
     float cdf = 0.0f;
-    for (int i = 0; i < n; i++) {
+    for (int i = 0; i < n; ++i) {
        cdf += probabilities[i];
        if (coin < cdf) {
            return i;
@@ -133,7 +133,7 @@ void Tokenizer::GenerateText(infini_train::nn::Module &model, uint32_t batch_siz
     LOG(INFO) << "start generate text:";

     const auto *cpu_device = DeviceManager::Instance()->GetDefaultDevice();
-    for (int t = prompt_len; t < text_length; t++) {
+    for (int t = prompt_len; t < text_length; ++t) {
         x = std::make_shared<infini_train::Tensor>(x->To(device)); // CPU->calc device
         // TODO(jym): use no_grad forward later
         auto logits = model.Forward({x})[0];
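
As a side note, the SampleMult loop in the first hunk is the standard inverse-CDF (roulette-wheel) sampler. Below is a self-contained sketch of the same idea, with a local stand-in `Sample` function and `std::mt19937` supplying the coin instead of the repo's RandomF32(), purely for illustration:

#include <random>

// Stand-in mirroring SampleMult's contract: probs sum to 1, coin is uniform in [0, 1).
int Sample(const float *probs, int n, float coin) {
    float cdf = 0.0f;
    for (int i = 0; i < n; ++i) {
        cdf += probs[i];
        if (coin < cdf) {
            return i;
        }
    }
    return n - 1; // guard against floating-point round-off
}

int main() {
    const float probs[4] = {0.1f, 0.2f, 0.3f, 0.4f};
    std::mt19937 rng(42);
    std::uniform_real_distribution<float> coin(0.0f, 1.0f);
    return Sample(probs, 4, coin(rng)); // index i is returned with probability probs[i]
}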

example/gpt2/net.cc

Lines changed: 13 additions & 13 deletions
@@ -189,7 +189,7 @@ GPT2::GPT2(const GPT2Config &config) : config_(config) {
     transformer[kWPELayerName] = std::make_shared<nn::Embedding>(config_.block_size, config_.n_embd);
     {
         std::vector<std::shared_ptr<nn::Module>> h;
-        for (int64_t i = 0; i < config_.n_layer; i++) { h.push_back(std::make_shared<Block>(config_)); }
+        for (int64_t i = 0; i < config_.n_layer; ++i) { h.push_back(std::make_shared<Block>(config_)); }
         transformer[kHLayerName] = std::make_shared<nn::Sequential>(std::move(h));
     }
     transformer[kLnFLayerName] = std::make_shared<nn::LayerNorm>(std::vector<int64_t>{config_.n_embd});
@@ -415,21 +415,21 @@ std::shared_ptr<GPT2> GPT2::FromLLMC(const std::string &filepath) {
                                              nn::Embedding::kParamWeightName)];
     ReadMatrixAllFloat(ifs, static_cast<float *>(transformer_wpe_weight->DataPtr()), block_size, n_embd);
     // transformer.h.{i}.ln_1.weight
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor
             = state_dict[std::format("{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                      std::to_string(idx), Block::kLn1LayerName, nn::LayerNorm::kParamWeightName)];
         ReadVectorAllFloat(ifs, static_cast<float *>(tensor->DataPtr()), n_embd);
     }
     // transformer.h.{i}.ln_1.bias
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor
             = state_dict[std::format("{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                      std::to_string(idx), Block::kLn1LayerName, nn::LayerNorm::kParamBiasName)];
         ReadVectorAllFloat(ifs, static_cast<float *>(tensor->DataPtr()), n_embd);
     }
     // transformer.h.{i}.attn.c_attn.weight (ColumnParallelLinear, but actually applies on "rows")
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format(
             "{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName, std::to_string(idx),
             Block::kAttnLayerName, CausalSelfAttention::kCAttnLayerName, tp::ColumnParallelLinear::kParamWeightName)];
@@ -461,7 +461,7 @@ std::shared_ptr<GPT2> GPT2::FromLLMC(const std::string &filepath) {
                                 /*row_start=*/2 * n_embd + rank * local_C, /*row_cnt=*/local_C);
     }
     // transformer.h.{i}.attn.c_attn.bias (ColumnParallelLinear)
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format(
             "{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName, std::to_string(idx),
             Block::kAttnLayerName, CausalSelfAttention::kCAttnLayerName, tp::ColumnParallelLinear::kParamBiasName)];
@@ -492,56 +492,56 @@ std::shared_ptr<GPT2> GPT2::FromLLMC(const std::string &filepath) {
                              /*start=*/2 * n_embd + rank * local_C, /*cnt=*/local_C);
     }
     // transformer.h.{i}.attn.c_proj.weight (RowParallelLinear, but actually applies on "columns")
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format(
             "{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName, std::to_string(idx),
             Block::kAttnLayerName, CausalSelfAttention::kCProjLayerName, tp::RowParallelLinear::kParamWeightName)];
         ReadMatrixColShardFloat(ifs, static_cast<float *>(tensor->DataPtr()), n_embd, n_embd, rank * in_pp, in_pp);
     }
     // transformer.h.{i}.attn.c_proj.bias (RowParallelLinear, no shard on bias)
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format(
             "{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName, std::to_string(idx),
             Block::kAttnLayerName, CausalSelfAttention::kCProjLayerName, tp::RowParallelLinear::kParamBiasName)];
         ReadVectorAllFloat(ifs, static_cast<float *>(tensor->DataPtr()), n_embd);
     }
     // transformer.h.{i}.ln_2.weight
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor
             = state_dict[std::format("{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                      std::to_string(idx), Block::kLn2LayerName, nn::LayerNorm::kParamWeightName)];
         ReadVectorAllFloat(ifs, static_cast<float *>(tensor->DataPtr()), n_embd);
     }
     // transformer.h.{i}.ln_2.bias
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor
             = state_dict[std::format("{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                      std::to_string(idx), Block::kLn2LayerName, nn::LayerNorm::kParamBiasName)];
         ReadVectorAllFloat(ifs, static_cast<float *>(tensor->DataPtr()), n_embd);
     }
     // transformer.h.{i}.mlp.c_fc.weight (ColumnParallelLinear, but actually applies on "rows")
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format("{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                               std::to_string(idx), Block::kMlpLayerName, MLP::kCFcLayerName,
                                               tp::ColumnParallelLinear::kParamWeightName)];
         ReadMatrixRowShardFloat(ifs, static_cast<float *>(tensor->DataPtr()), fc_out, n_embd, fc_start, fc_pp);
     }
     // transformer.h.{i}.mlp.c_fc.bias (ColumnParallelLinear)
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format("{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                               std::to_string(idx), Block::kMlpLayerName, MLP::kCFcLayerName,
                                               tp::ColumnParallelLinear::kParamBiasName)];
         ReadVectorShardFloat(ifs, static_cast<float *>(tensor->DataPtr()), fc_out, fc_start, fc_pp);
     }
     // transformer.h.{i}.mlp.c_proj.weight (RowParallelLinear, but actually applies on "columns")
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format("{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                               std::to_string(idx), Block::kMlpLayerName, MLP::kCProjLayerName,
                                               tp::RowParallelLinear::kParamWeightName)];
         ReadMatrixColShardFloat(ifs, static_cast<float *>(tensor->DataPtr()), n_embd, fc_out, rank * in4_pp, in4_pp);
     }
     // transformer.h.{i}.mlp.c_proj.bias (RowParallelLinear, no shard on bias)
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format("{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                               std::to_string(idx), Block::kMlpLayerName, MLP::kCProjLayerName,
                                               tp::RowParallelLinear::kParamBiasName)];
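
The "applies on rows/columns" comments above appear to reflect how the llm.c checkpoint stores each linear weight row-major as [out_features, in_features]: a Megatron-style ColumnParallelLinear shards the output dimension, which means reading a contiguous block of rows of the stored matrix, while RowParallelLinear shards the input dimension, which means reading a strided block of columns. A hypothetical sketch of the two access patterns (these are not the repo's ReadMatrixRowShardFloat / ReadMatrixColShardFloat helpers):

#include <cstddef>
#include <vector>

// Illustration only: extracting a shard from a row-major weight laid out as
// [rows = out_features] x [cols = in_features], held contiguously in `full`.

// Output-dim shard (ColumnParallelLinear): `row_cnt` consecutive rows -- one contiguous copy.
std::vector<float> RowShard(const std::vector<float> &full, std::size_t cols,
                            std::size_t row_start, std::size_t row_cnt) {
    return std::vector<float>(full.begin() + row_start * cols,
                              full.begin() + (row_start + row_cnt) * cols);
}

// Input-dim shard (RowParallelLinear): `col_cnt` columns of every row -- one strided copy per row.
std::vector<float> ColShard(const std::vector<float> &full, std::size_t rows, std::size_t cols,
                            std::size_t col_start, std::size_t col_cnt) {
    std::vector<float> shard;
    shard.reserve(rows * col_cnt);
    for (std::size_t r = 0; r < rows; ++r) {
        shard.insert(shard.end(), full.begin() + r * cols + col_start,
                     full.begin() + r * cols + col_start + col_cnt);
    }
    return shard;
}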

example/llama3/net.cc

Lines changed: 1 addition & 1 deletion
@@ -329,7 +329,7 @@ LLaMA3::LLaMA3(const LLaMA3Config &config) : config_(config) {
         config.vocab_size, config.n_embd, nn::parallel::global::GetSequenceParallelEnabled());
     {
         std::vector<std::shared_ptr<nn::Module>> h;
-        for (int64_t i = 0; i < config.n_layer; i++) { h.push_back(std::make_shared<Block>(config)); }
+        for (int64_t i = 0; i < config.n_layer; ++i) { h.push_back(std::make_shared<Block>(config)); }
         transformer[kHLayerName] = std::make_shared<nn::ModuleList>(std::move(h));
     }
     transformer[kLnFLayerName] = std::make_shared<RMSNorm>(config.n_embd, config.norm_eps);

infini_train/src/kernels/cpu/embedding.cc

Lines changed: 4 additions & 4 deletions
@@ -20,9 +20,9 @@ std::shared_ptr<Tensor> EmbeddingForward(const std::shared_ptr<Tensor> &input, c
     output_dims.push_back(embedding_dim);
     auto output = std::make_shared<Tensor>(output_dims, DataType::kFLOAT32);

-    for (int i = 0; i < input->NumElements(); i++) {
+    for (int i = 0; i < input->NumElements(); ++i) {
         int idx = static_cast<int>(static_cast<const int64_t *>(input->DataPtr())[i]);
-        for (int j = 0; j < embedding_dim; j++) {
+        for (int j = 0; j < embedding_dim; ++j) {
             static_cast<float *>(output->DataPtr())[i * embedding_dim + j]
                 = static_cast<float *>(weight->DataPtr())[idx * embedding_dim + j];
         }
@@ -43,9 +43,9 @@ std::shared_ptr<Tensor> EmbeddingBackward(const std::shared_ptr<Tensor> &input,
     auto grad_weight = std::make_shared<Tensor>(weight_dims, DataType::kFLOAT32);
     grad_weight->Fill<float>(0.0f);

-    for (int i = 0; i < input->NumElements(); i++) {
+    for (int i = 0; i < input->NumElements(); ++i) {
         int idx = static_cast<int>(static_cast<const int64_t *>(input->DataPtr())[i]);
-        for (int j = 0; j < embedding_dim; j++) {
+        for (int j = 0; j < embedding_dim; ++j) {
             static_cast<float *>(grad_weight->DataPtr())[idx * embedding_dim + j] // <-- fix here
                 += static_cast<const float *>(grad_output->DataPtr())[i * embedding_dim + j];
         }

infini_train/src/kernels/cpu/layernorm.cc

Lines changed: 9 additions & 9 deletions
@@ -31,16 +31,16 @@ LayerNormForward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Ten
     mean->Fill<float>(0.0f);
     rstd->Fill<float>(0.0f);

-    for (int b = 0; b < batch_size; b++) {
-        for (int t = 0; t < max_seqlen; t++) {
+    for (int b = 0; b < batch_size; ++b) {
+        for (int t = 0; t < max_seqlen; ++t) {
             float m = 0.0f;
-            for (int i = 0; i < embed_dim; i++) {
+            for (int i = 0; i < embed_dim; ++i) {
                 m += static_cast<float *>(input->DataPtr())[b * max_seqlen * embed_dim + t * embed_dim + i];
             }
             m = m / embed_dim;

             float v = 0.0f;
-            for (int i = 0; i < embed_dim; i++) {
+            for (int i = 0; i < embed_dim; ++i) {
                 float xshift
                     = static_cast<float *>(input->DataPtr())[b * max_seqlen * embed_dim + t * embed_dim + i] - m;
                 v += xshift * xshift;
@@ -49,7 +49,7 @@ LayerNormForward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Ten

             float s = 1.0f / sqrtf(v + eps);

-            for (int i = 0; i < embed_dim; i++) {
+            for (int i = 0; i < embed_dim; ++i) {
                 float n = (s
                            * (static_cast<float *>(input->DataPtr())[b * max_seqlen * embed_dim + t * embed_dim + i]
                               - m)); // normalize
@@ -88,15 +88,15 @@ LayerNormBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Te
     grad_weight->Fill<float>(0.0f);
     grad_bias->Fill<float>(0.0f);

-    for (int b = 0; b < batch_size; b++) {
-        for (int t = 0; t < max_seqlen; t++) {
+    for (int b = 0; b < batch_size; ++b) {
+        for (int t = 0; t < max_seqlen; ++t) {
             float mean_bt = static_cast<float *>(mean->DataPtr())[b * max_seqlen + t];
             float rstd_bt = static_cast<float *>(rstd->DataPtr())[b * max_seqlen + t];

             // first: two reduce operations
             float dnorm_mean = 0.0f;
             float dnorm_norm_mean = 0.0f;
-            for (int i = 0; i < embed_dim; i++) {
+            for (int i = 0; i < embed_dim; ++i) {
                 float norm_bti
                     = (static_cast<float *>(input->DataPtr())[b * max_seqlen * embed_dim + t * embed_dim + i] - mean_bt)
                       * rstd_bt;
@@ -110,7 +110,7 @@ LayerNormBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Te
             dnorm_norm_mean = dnorm_norm_mean / embed_dim;

             // now iterate again and accumulate all the gradients
-            for (int i = 0; i < embed_dim; i++) {
+            for (int i = 0; i < embed_dim; ++i) {
                 float norm_bti
                     = (static_cast<float *>(input->DataPtr())[b * max_seqlen * embed_dim + t * embed_dim + i] - mean_bt)
                       * rstd_bt;
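
For reference, the cached m, v (presumably averaged over embed_dim in the lines elided between the two forward hunks), and s (the stored rstd) correspond to the usual LayerNorm statistics over the embedding dimension D, with the learned scale γ (weight) and shift β (bias) applied after the normalization shown here:

\mu_{b,t} = \frac{1}{D}\sum_{i=1}^{D} x_{b,t,i},\qquad
\sigma^2_{b,t} = \frac{1}{D}\sum_{i=1}^{D}\bigl(x_{b,t,i}-\mu_{b,t}\bigr)^2,\qquad
s_{b,t} = \frac{1}{\sqrt{\sigma^2_{b,t}+\varepsilon}},\qquad
y_{b,t,i} = \gamma_i\, s_{b,t}\bigl(x_{b,t,i}-\mu_{b,t}\bigr) + \beta_i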

infini_train/src/kernels/cpu/slice.cc

Lines changed: 2 additions & 2 deletions
@@ -16,7 +16,7 @@ std::shared_ptr<Tensor> SliceForward(const std::shared_ptr<Tensor> &input, const
     CHECK_EQ(starts.size(), dims.size());

     std::vector<int64_t> new_dims;
-    for (int i = 0; i < starts.size(); i++) {
+    for (int i = 0; i < starts.size(); ++i) {
         CHECK_LE(starts[i], ends[i]);
         CHECK_LE(0, steps[i]);
         new_dims.push_back((ends[i] - starts[i] + steps[i] - 1) / steps[i]);
@@ -76,7 +76,7 @@ std::shared_ptr<Tensor> SliceBackward(const std::shared_ptr<Tensor> &grad_output
     CHECK_EQ(starts.size(), dims.size());

     std::vector<int64_t> new_dims;
-    for (int i = 0; i < starts.size(); i++) {
+    for (int i = 0; i < starts.size(); ++i) {
         CHECK_LE(starts[i], ends[i]);
         CHECK_LE(0, steps[i]);
         new_dims.push_back((ends[i] - starts[i] + steps[i] - 1) / steps[i]);

infini_train/src/kernels/cuda/slice.cu

Lines changed: 2 additions & 2 deletions
@@ -36,7 +36,7 @@ std::shared_ptr<Tensor> SliceForward(const std::shared_ptr<Tensor> &input, const
     const int64_t num_dims = dims.size();

     std::vector<int64_t> new_dims;
-    for (int i = 0; i < starts.size(); i++) {
+    for (int i = 0; i < starts.size(); ++i) {
         CHECK_LE(starts[i], ends[i]);
         CHECK_LE(0, steps[i]);
         new_dims.push_back((ends[i] - starts[i] + steps[i] - 1) / steps[i]);
@@ -127,7 +127,7 @@ std::shared_ptr<Tensor> SliceBackward(const std::shared_ptr<Tensor> &grad_output
     const int64_t num_dims = dims.size();

     std::vector<int64_t> new_dims;
-    for (int i = 0; i < starts.size(); i++) {
+    for (int i = 0; i < starts.size(); ++i) {
         CHECK_LE(starts[i], ends[i]);
         CHECK_LE(0, steps[i]);
         new_dims.push_back((ends[i] - starts[i] + steps[i] - 1) / steps[i]);

infini_train/src/nn/parallel/process_group.cc

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ ProcessGroup::ProcessGroup(const std::vector<int> &device_indices) : comm_size_(
     comms_.resize(comm_size_);
     NCCL_CHECK(ncclCommInitAll(comms_.data(), comm_size_, device_indices.data()));

-    for (int i = 0; i < comm_size_; i++) {
+    for (int i = 0; i < comm_size_; ++i) {
         auto device = DeviceManager::Instance()->GetDevice(DeviceType::kCUDA, device_indices[i]);
         devices_.push_back(device);
         device_comm_map_[device] = comms_[i];
