
Commit 62ee2a6

style: replace all post-increments with pre-increments
1 parent ddb400b commit 62ee2a6
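
For context, this is a pure style change: for the `int`/`int64_t` loop counters touched here, `i++` and `++i` compile to identical code, while `++i` is the conventional C++ default because post-increment on class types (iterators, for example) must materialize a copy of the old value. A minimal illustration, not code from this repository:

// Illustration only: why ++i is the usual default in C++ loops.
// For built-in integers both forms generate the same code; for class types,
// operator++(int) must return a copy of the previous value.
struct Counter {
    int v = 0;
    Counter &operator++() { ++v; return *this; }                        // pre-increment: no copy
    Counter operator++(int) { Counter old = *this; ++v; return old; }   // post-increment: extra copy
};

int main() {
    for (Counter c; c.v < 3; ++c) {}  // preferred: no temporary object
    for (Counter c; c.v < 3; c++) {}  // same behavior, but each step constructs a discarded copy
    return 0;
}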

9 files changed, +35 -35 lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ if(USE_CUDA)
     set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda --expt-relaxed-constexpr")
     file(GLOB_RECURSE CUDA_KERNELS ${PROJECT_SOURCE_DIR}/infini_train/src/*.cu)
     add_library(infini_train_cuda_kernels STATIC ${CUDA_KERNELS})
-    set_target_properties(infini_train_cuda_kernels PROPERTIES CUWDA_ARCHITECTURES "75;80;90")
+    set_target_properties(infini_train_cuda_kernels PROPERTIES CUDA_ARCHITECTURES "75;80;90")
     target_link_libraries(infini_train_cuda_kernels glog CUDA::cudart CUDA::cublas CUDA::cuda_driver)

     add_library(infini_train STATIC ${SRC})

example/common/tokenizer.cc

Lines changed: 2 additions & 2 deletions
@@ -59,7 +59,7 @@ int SampleMult(float *probabilities, int n, float coin) {
     // sample index from probabilities (they must sum to 1!)
     // coin is a random number in [0, 1), usually from RandomF32()
     float cdf = 0.0f;
-    for (int i = 0; i < n; i++) {
+    for (int i = 0; i < n; ++i) {
        cdf += probabilities[i];
        if (coin < cdf) {
            return i;
@@ -133,7 +133,7 @@ void Tokenizer::GenerateText(infini_train::nn::Module &model, uint32_t batch_siz
     LOG(INFO) << "start generate text:";

     const auto *cpu_device = DeviceManager::Instance()->GetDefaultDevice();
-    for (int t = prompt_len; t < text_length; t++) {
+    for (int t = prompt_len; t < text_length; ++t) {
         x = std::make_shared<infini_train::Tensor>(x->To(device)); // CPU->calc device
         // TODO(jym): use no_grad forward later
         auto logits = model.Forward({x})[0];
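
As a side note, the SampleMult loop in the first hunk is the standard inverse-CDF (roulette-wheel) sampler. Below is a self-contained sketch of the same idea, with a local stand-in `Sample` function and `std::mt19937` supplying the coin instead of the repo's RandomF32(), purely for illustration:

#include <random>

// Stand-in mirroring SampleMult's contract: probs sum to 1, coin is uniform in [0, 1).
int Sample(const float *probs, int n, float coin) {
    float cdf = 0.0f;
    for (int i = 0; i < n; ++i) {
        cdf += probs[i];
        if (coin < cdf) {
            return i;
        }
    }
    return n - 1; // guard against floating-point round-off
}

int main() {
    const float probs[4] = {0.1f, 0.2f, 0.3f, 0.4f};
    std::mt19937 rng(42);
    std::uniform_real_distribution<float> coin(0.0f, 1.0f);
    return Sample(probs, 4, coin(rng)); // index i is returned with probability probs[i]
}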

example/gpt2/net.cc

Lines changed: 13 additions & 13 deletions
@@ -189,7 +189,7 @@ GPT2::GPT2(const GPT2Config &config) : config_(config) {
     transformer[kWPELayerName] = std::make_shared<nn::Embedding>(config_.block_size, config_.n_embd);
     {
         std::vector<std::shared_ptr<nn::Module>> h;
-        for (int64_t i = 0; i < config_.n_layer; i++) { h.push_back(std::make_shared<Block>(config_)); }
+        for (int64_t i = 0; i < config_.n_layer; ++i) { h.push_back(std::make_shared<Block>(config_)); }
         transformer[kHLayerName] = std::make_shared<nn::Sequential>(std::move(h));
     }
     transformer[kLnFLayerName] = std::make_shared<nn::LayerNorm>(std::vector<int64_t>{config_.n_embd});
@@ -415,21 +415,21 @@ std::shared_ptr<GPT2> GPT2::FromLLMC(const std::string &filepath) {
                                              nn::Embedding::kParamWeightName)];
     ReadMatrixAllFloat(ifs, static_cast<float *>(transformer_wpe_weight->DataPtr()), block_size, n_embd);
     // transformer.h.{i}.ln_1.weight
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor
             = state_dict[std::format("{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                      std::to_string(idx), Block::kLn1LayerName, nn::LayerNorm::kParamWeightName)];
         ReadVectorAllFloat(ifs, static_cast<float *>(tensor->DataPtr()), n_embd);
     }
     // transformer.h.{i}.ln_1.bias
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor
             = state_dict[std::format("{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                      std::to_string(idx), Block::kLn1LayerName, nn::LayerNorm::kParamBiasName)];
         ReadVectorAllFloat(ifs, static_cast<float *>(tensor->DataPtr()), n_embd);
     }
     // transformer.h.{i}.attn.c_attn.weight (ColumnParallelLinear, but actually applies on "rows")
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format(
             "{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName, std::to_string(idx),
             Block::kAttnLayerName, CausalSelfAttention::kCAttnLayerName, tp::ColumnParallelLinear::kParamWeightName)];
@@ -461,7 +461,7 @@ std::shared_ptr<GPT2> GPT2::FromLLMC(const std::string &filepath) {
                                 /*row_start=*/2 * n_embd + rank * local_C, /*row_cnt=*/local_C);
     }
     // transformer.h.{i}.attn.c_attn.bias (ColumnParallelLinear)
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format(
             "{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName, std::to_string(idx),
             Block::kAttnLayerName, CausalSelfAttention::kCAttnLayerName, tp::ColumnParallelLinear::kParamBiasName)];
@@ -492,56 +492,56 @@ std::shared_ptr<GPT2> GPT2::FromLLMC(const std::string &filepath) {
                              /*start=*/2 * n_embd + rank * local_C, /*cnt=*/local_C);
     }
     // transformer.h.{i}.attn.c_proj.weight (RowParallelLinear, but actually applies on "columns")
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format(
             "{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName, std::to_string(idx),
             Block::kAttnLayerName, CausalSelfAttention::kCProjLayerName, tp::RowParallelLinear::kParamWeightName)];
         ReadMatrixColShardFloat(ifs, static_cast<float *>(tensor->DataPtr()), n_embd, n_embd, rank * in_pp, in_pp);
     }
     // transformer.h.{i}.attn.c_proj.bias (RowParallelLinear, no shard on bias)
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format(
             "{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName, std::to_string(idx),
             Block::kAttnLayerName, CausalSelfAttention::kCProjLayerName, tp::RowParallelLinear::kParamBiasName)];
         ReadVectorAllFloat(ifs, static_cast<float *>(tensor->DataPtr()), n_embd);
     }
     // transformer.h.{i}.ln_2.weight
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor
             = state_dict[std::format("{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                      std::to_string(idx), Block::kLn2LayerName, nn::LayerNorm::kParamWeightName)];
         ReadVectorAllFloat(ifs, static_cast<float *>(tensor->DataPtr()), n_embd);
     }
     // transformer.h.{i}.ln_2.bias
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor
             = state_dict[std::format("{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                      std::to_string(idx), Block::kLn2LayerName, nn::LayerNorm::kParamBiasName)];
         ReadVectorAllFloat(ifs, static_cast<float *>(tensor->DataPtr()), n_embd);
     }
     // transformer.h.{i}.mlp.c_fc.weight (ColumnParallelLinear, but actually applies on "rows")
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format("{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                               std::to_string(idx), Block::kMlpLayerName, MLP::kCFcLayerName,
                                               tp::ColumnParallelLinear::kParamWeightName)];
         ReadMatrixRowShardFloat(ifs, static_cast<float *>(tensor->DataPtr()), fc_out, n_embd, fc_start, fc_pp);
     }
     // transformer.h.{i}.mlp.c_fc.bias (ColumnParallelLinear)
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format("{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                               std::to_string(idx), Block::kMlpLayerName, MLP::kCFcLayerName,
                                               tp::ColumnParallelLinear::kParamBiasName)];
         ReadVectorShardFloat(ifs, static_cast<float *>(tensor->DataPtr()), fc_out, fc_start, fc_pp);
     }
     // transformer.h.{i}.mlp.c_proj.weight (RowParallelLinear, but actually applies on "columns")
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format("{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                               std::to_string(idx), Block::kMlpLayerName, MLP::kCProjLayerName,
                                               tp::RowParallelLinear::kParamWeightName)];
         ReadMatrixColShardFloat(ifs, static_cast<float *>(tensor->DataPtr()), n_embd, fc_out, rank * in4_pp, in4_pp);
     }
     // transformer.h.{i}.mlp.c_proj.bias (RowParallelLinear, no shard on bias)
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format("{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                               std::to_string(idx), Block::kMlpLayerName, MLP::kCProjLayerName,
                                               tp::RowParallelLinear::kParamBiasName)];
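
The "applies on rows/columns" comments above appear to reflect how the llm.c checkpoint stores each linear weight row-major as [out_features, in_features]: a Megatron-style ColumnParallelLinear shards the output dimension, which means reading a contiguous block of rows of the stored matrix, while RowParallelLinear shards the input dimension, which means reading a strided block of columns. A hypothetical sketch of the two access patterns (these are not the repo's ReadMatrixRowShardFloat / ReadMatrixColShardFloat helpers):

#include <cstddef>
#include <vector>

// Illustration only: extracting a shard from a row-major weight laid out as
// [rows = out_features] x [cols = in_features], held contiguously in `full`.

// Output-dim shard (ColumnParallelLinear): `row_cnt` consecutive rows -- one contiguous copy.
std::vector<float> RowShard(const std::vector<float> &full, std::size_t cols,
                            std::size_t row_start, std::size_t row_cnt) {
    return std::vector<float>(full.begin() + row_start * cols,
                              full.begin() + (row_start + row_cnt) * cols);
}

// Input-dim shard (RowParallelLinear): `col_cnt` columns of every row -- one strided copy per row.
std::vector<float> ColShard(const std::vector<float> &full, std::size_t rows, std::size_t cols,
                            std::size_t col_start, std::size_t col_cnt) {
    std::vector<float> shard;
    shard.reserve(rows * col_cnt);
    for (std::size_t r = 0; r < rows; ++r) {
        shard.insert(shard.end(), full.begin() + r * cols + col_start,
                     full.begin() + r * cols + col_start + col_cnt);
    }
    return shard;
}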

example/llama3/net.cc

Lines changed: 1 addition & 1 deletion
@@ -329,7 +329,7 @@ LLaMA3::LLaMA3(const LLaMA3Config &config) : config_(config) {
         config.vocab_size, config.n_embd, nn::parallel::global::GetSequenceParallelEnabled());
     {
         std::vector<std::shared_ptr<nn::Module>> h;
-        for (int64_t i = 0; i < config.n_layer; i++) { h.push_back(std::make_shared<Block>(config)); }
+        for (int64_t i = 0; i < config.n_layer; ++i) { h.push_back(std::make_shared<Block>(config)); }
         transformer[kHLayerName] = std::make_shared<nn::ModuleList>(std::move(h));
     }
     transformer[kLnFLayerName] = std::make_shared<RMSNorm>(config.n_embd, config.norm_eps);

infini_train/src/kernels/cpu/embedding.cc

Lines changed: 4 additions & 4 deletions
@@ -20,9 +20,9 @@ std::shared_ptr<Tensor> EmbeddingForward(const std::shared_ptr<Tensor> &input, c
     output_dims.push_back(embedding_dim);
     auto output = std::make_shared<Tensor>(output_dims, DataType::kFLOAT32);

-    for (int i = 0; i < input->NumElements(); i++) {
+    for (int i = 0; i < input->NumElements(); ++i) {
         int idx = static_cast<int>(static_cast<const int64_t *>(input->DataPtr())[i]);
-        for (int j = 0; j < embedding_dim; j++) {
+        for (int j = 0; j < embedding_dim; ++j) {
             static_cast<float *>(output->DataPtr())[i * embedding_dim + j]
                 = static_cast<float *>(weight->DataPtr())[idx * embedding_dim + j];
         }
@@ -43,9 +43,9 @@ std::shared_ptr<Tensor> EmbeddingBackward(const std::shared_ptr<Tensor> &input,
     auto grad_weight = std::make_shared<Tensor>(weight_dims, DataType::kFLOAT32);
     grad_weight->Fill<float>(0.0f);

-    for (int i = 0; i < input->NumElements(); i++) {
+    for (int i = 0; i < input->NumElements(); ++i) {
         int idx = static_cast<int>(static_cast<const int64_t *>(input->DataPtr())[i]);
-        for (int j = 0; j < embedding_dim; j++) {
+        for (int j = 0; j < embedding_dim; ++j) {
             static_cast<float *>(grad_weight->DataPtr())[idx * embedding_dim + j] // <-- fix here
                 += static_cast<const float *>(grad_output->DataPtr())[i * embedding_dim + j];
         }

infini_train/src/kernels/cpu/layernorm.cc

Lines changed: 9 additions & 9 deletions
@@ -31,16 +31,16 @@ LayerNormForward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Ten
     mean->Fill<float>(0.0f);
     rstd->Fill<float>(0.0f);

-    for (int b = 0; b < batch_size; b++) {
-        for (int t = 0; t < max_seqlen; t++) {
+    for (int b = 0; b < batch_size; ++b) {
+        for (int t = 0; t < max_seqlen; ++t) {
             float m = 0.0f;
-            for (int i = 0; i < embed_dim; i++) {
+            for (int i = 0; i < embed_dim; ++i) {
                 m += static_cast<float *>(input->DataPtr())[b * max_seqlen * embed_dim + t * embed_dim + i];
             }
             m = m / embed_dim;

             float v = 0.0f;
-            for (int i = 0; i < embed_dim; i++) {
+            for (int i = 0; i < embed_dim; ++i) {
                 float xshift
                     = static_cast<float *>(input->DataPtr())[b * max_seqlen * embed_dim + t * embed_dim + i] - m;
                 v += xshift * xshift;
@@ -49,7 +49,7 @@ LayerNormForward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Ten

             float s = 1.0f / sqrtf(v + eps);

-            for (int i = 0; i < embed_dim; i++) {
+            for (int i = 0; i < embed_dim; ++i) {
                 float n = (s
                            * (static_cast<float *>(input->DataPtr())[b * max_seqlen * embed_dim + t * embed_dim + i]
                               - m)); // normalize
@@ -88,15 +88,15 @@ LayerNormBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Te
     grad_weight->Fill<float>(0.0f);
     grad_bias->Fill<float>(0.0f);

-    for (int b = 0; b < batch_size; b++) {
-        for (int t = 0; t < max_seqlen; t++) {
+    for (int b = 0; b < batch_size; ++b) {
+        for (int t = 0; t < max_seqlen; ++t) {
             float mean_bt = static_cast<float *>(mean->DataPtr())[b * max_seqlen + t];
             float rstd_bt = static_cast<float *>(rstd->DataPtr())[b * max_seqlen + t];

             // first: two reduce operations
             float dnorm_mean = 0.0f;
             float dnorm_norm_mean = 0.0f;
-            for (int i = 0; i < embed_dim; i++) {
+            for (int i = 0; i < embed_dim; ++i) {
                 float norm_bti
                     = (static_cast<float *>(input->DataPtr())[b * max_seqlen * embed_dim + t * embed_dim + i] - mean_bt)
                       * rstd_bt;
@@ -110,7 +110,7 @@ LayerNormBackward(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Te
             dnorm_norm_mean = dnorm_norm_mean / embed_dim;

             // now iterate again and accumulate all the gradients
-            for (int i = 0; i < embed_dim; i++) {
+            for (int i = 0; i < embed_dim; ++i) {
                 float norm_bti
                     = (static_cast<float *>(input->DataPtr())[b * max_seqlen * embed_dim + t * embed_dim + i] - mean_bt)
                       * rstd_bt;
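
For reference, the cached m, v (presumably averaged over embed_dim in the lines elided between the two forward hunks), and s (the stored rstd) correspond to the usual LayerNorm statistics over the embedding dimension D, with the learned scale γ (weight) and shift β (bias) applied after the normalization shown here:

\mu_{b,t} = \frac{1}{D}\sum_{i=1}^{D} x_{b,t,i},\qquad
\sigma^2_{b,t} = \frac{1}{D}\sum_{i=1}^{D}\bigl(x_{b,t,i}-\mu_{b,t}\bigr)^2,\qquad
s_{b,t} = \frac{1}{\sqrt{\sigma^2_{b,t}+\varepsilon}},\qquad
y_{b,t,i} = \gamma_i\, s_{b,t}\bigl(x_{b,t,i}-\mu_{b,t}\bigr) + \beta_i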

infini_train/src/kernels/cpu/slice.cc

Lines changed: 2 additions & 2 deletions
@@ -16,7 +16,7 @@ std::shared_ptr<Tensor> SliceForward(const std::shared_ptr<Tensor> &input, const
     CHECK_EQ(starts.size(), dims.size());

     std::vector<int64_t> new_dims;
-    for (int i = 0; i < starts.size(); i++) {
+    for (int i = 0; i < starts.size(); ++i) {
         CHECK_LE(starts[i], ends[i]);
         CHECK_LE(0, steps[i]);
         new_dims.push_back((ends[i] - starts[i] + steps[i] - 1) / steps[i]);
@@ -76,7 +76,7 @@ std::shared_ptr<Tensor> SliceBackward(const std::shared_ptr<Tensor> &grad_output
     CHECK_EQ(starts.size(), dims.size());

     std::vector<int64_t> new_dims;
-    for (int i = 0; i < starts.size(); i++) {
+    for (int i = 0; i < starts.size(); ++i) {
         CHECK_LE(starts[i], ends[i]);
         CHECK_LE(0, steps[i]);
         new_dims.push_back((ends[i] - starts[i] + steps[i] - 1) / steps[i]);

infini_train/src/kernels/cuda/slice.cu

Lines changed: 2 additions & 2 deletions
@@ -36,7 +36,7 @@ std::shared_ptr<Tensor> SliceForward(const std::shared_ptr<Tensor> &input, const
     const int64_t num_dims = dims.size();

     std::vector<int64_t> new_dims;
-    for (int i = 0; i < starts.size(); i++) {
+    for (int i = 0; i < starts.size(); ++i) {
         CHECK_LE(starts[i], ends[i]);
         CHECK_LE(0, steps[i]);
         new_dims.push_back((ends[i] - starts[i] + steps[i] - 1) / steps[i]);
@@ -127,7 +127,7 @@ std::shared_ptr<Tensor> SliceBackward(const std::shared_ptr<Tensor> &grad_output
     const int64_t num_dims = dims.size();

     std::vector<int64_t> new_dims;
-    for (int i = 0; i < starts.size(); i++) {
+    for (int i = 0; i < starts.size(); ++i) {
         CHECK_LE(starts[i], ends[i]);
         CHECK_LE(0, steps[i]);
         new_dims.push_back((ends[i] - starts[i] + steps[i] - 1) / steps[i]);

infini_train/src/nn/parallel/process_group.cc

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ ProcessGroup::ProcessGroup(const std::vector<int> &device_indices) : comm_size_(
     comms_.resize(comm_size_);
     NCCL_CHECK(ncclCommInitAll(comms_.data(), comm_size_, device_indices.data()));

-    for (int i = 0; i < comm_size_; i++) {
+    for (int i = 0; i < comm_size_; ++i) {
         auto device = DeviceManager::Instance()->GetDevice(DeviceType::kCUDA, device_indices[i]);
         devices_.push_back(device);
         device_comm_map_[device] = comms_[i];
