@@ -189,7 +189,7 @@ GPT2::GPT2(const GPT2Config &config) : config_(config) {
     transformer[kWPELayerName] = std::make_shared<nn::Embedding>(config_.block_size, config_.n_embd);
     {
         std::vector<std::shared_ptr<nn::Module>> h;
-        for (int64_t i = 0; i < config_.n_layer; i++) { h.push_back(std::make_shared<Block>(config_)); }
+        for (int64_t i = 0; i < config_.n_layer; ++i) { h.push_back(std::make_shared<Block>(config_)); }
         transformer[kHLayerName] = std::make_shared<nn::Sequential>(std::move(h));
     }
     transformer[kLnFLayerName] = std::make_shared<nn::LayerNorm>(std::vector<int64_t>{config_.n_embd});
@@ -415,21 +415,21 @@ std::shared_ptr<GPT2> GPT2::FromLLMC(const std::string &filepath) {
                                                           nn::Embedding::kParamWeightName)];
     ReadMatrixAllFloat(ifs, static_cast<float *>(transformer_wpe_weight->DataPtr()), block_size, n_embd);
     // transformer.h.{i}.ln_1.weight
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor
             = state_dict[std::format("{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                      std::to_string(idx), Block::kLn1LayerName, nn::LayerNorm::kParamWeightName)];
         ReadVectorAllFloat(ifs, static_cast<float *>(tensor->DataPtr()), n_embd);
     }
     // transformer.h.{i}.ln_1.bias
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor
             = state_dict[std::format("{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                      std::to_string(idx), Block::kLn1LayerName, nn::LayerNorm::kParamBiasName)];
         ReadVectorAllFloat(ifs, static_cast<float *>(tensor->DataPtr()), n_embd);
     }
     // transformer.h.{i}.attn.c_attn.weight (ColumnParallelLinear, but actually applies on "rows")
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format(
             "{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName, std::to_string(idx),
             Block::kAttnLayerName, CausalSelfAttention::kCAttnLayerName, tp::ColumnParallelLinear::kParamWeightName)];
@@ -461,7 +461,7 @@ std::shared_ptr<GPT2> GPT2::FromLLMC(const std::string &filepath) {
                                 /*row_start=*/2 * n_embd + rank * local_C, /*row_cnt=*/local_C);
     }
     // transformer.h.{i}.attn.c_attn.bias (ColumnParallelLinear)
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format(
             "{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName, std::to_string(idx),
             Block::kAttnLayerName, CausalSelfAttention::kCAttnLayerName, tp::ColumnParallelLinear::kParamBiasName)];
@@ -492,56 +492,56 @@ std::shared_ptr<GPT2> GPT2::FromLLMC(const std::string &filepath) {
                              /*start=*/2 * n_embd + rank * local_C, /*cnt=*/local_C);
     }
     // transformer.h.{i}.attn.c_proj.weight (RowParallelLinear, but actually applies on "columns")
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format(
             "{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName, std::to_string(idx),
             Block::kAttnLayerName, CausalSelfAttention::kCProjLayerName, tp::RowParallelLinear::kParamWeightName)];
         ReadMatrixColShardFloat(ifs, static_cast<float *>(tensor->DataPtr()), n_embd, n_embd, rank * in_pp, in_pp);
     }
     // transformer.h.{i}.attn.c_proj.bias (RowParallelLinear, no shard on bias)
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format(
             "{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName, std::to_string(idx),
             Block::kAttnLayerName, CausalSelfAttention::kCProjLayerName, tp::RowParallelLinear::kParamBiasName)];
         ReadVectorAllFloat(ifs, static_cast<float *>(tensor->DataPtr()), n_embd);
     }
     // transformer.h.{i}.ln_2.weight
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor
             = state_dict[std::format("{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                      std::to_string(idx), Block::kLn2LayerName, nn::LayerNorm::kParamWeightName)];
         ReadVectorAllFloat(ifs, static_cast<float *>(tensor->DataPtr()), n_embd);
     }
     // transformer.h.{i}.ln_2.bias
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor
             = state_dict[std::format("{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                      std::to_string(idx), Block::kLn2LayerName, nn::LayerNorm::kParamBiasName)];
         ReadVectorAllFloat(ifs, static_cast<float *>(tensor->DataPtr()), n_embd);
     }
     // transformer.h.{i}.mlp.c_fc.weight (ColumnParallelLinear, but actually applies on "rows")
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format("{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                               std::to_string(idx), Block::kMlpLayerName, MLP::kCFcLayerName,
                                               tp::ColumnParallelLinear::kParamWeightName)];
         ReadMatrixRowShardFloat(ifs, static_cast<float *>(tensor->DataPtr()), fc_out, n_embd, fc_start, fc_pp);
     }
     // transformer.h.{i}.mlp.c_fc.bias (ColumnParallelLinear)
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format("{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                               std::to_string(idx), Block::kMlpLayerName, MLP::kCFcLayerName,
                                               tp::ColumnParallelLinear::kParamBiasName)];
         ReadVectorShardFloat(ifs, static_cast<float *>(tensor->DataPtr()), fc_out, fc_start, fc_pp);
     }
     // transformer.h.{i}.mlp.c_proj.weight (RowParallelLinear, but actually applies on "columns")
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format("{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                               std::to_string(idx), Block::kMlpLayerName, MLP::kCProjLayerName,
                                               tp::RowParallelLinear::kParamWeightName)];
         ReadMatrixColShardFloat(ifs, static_cast<float *>(tensor->DataPtr()), n_embd, fc_out, rank * in4_pp, in4_pp);
     }
     // transformer.h.{i}.mlp.c_proj.bias (RowParallelLinear, no shard on bias)
-    for (int idx = 0; idx < n_layer; idx++) {
+    for (int idx = 0; idx < n_layer; ++idx) {
         auto &tensor = state_dict[std::format("{}.{}.{}.{}.{}.{}", GPT2::kTransformerLayerName, GPT2::kHLayerName,
                                               std::to_string(idx), Block::kMlpLayerName, MLP::kCProjLayerName,
                                               tp::RowParallelLinear::kParamBiasName)];
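Note on the sharded reads in this hunk: the llm.c checkpoint stores each parameter densely as row-major float32, so every tensor-parallel rank reads only its own slice and skips over the rest of the stream (the `2 * n_embd + rank * local_C` offsets appear to select this rank's slice of the V block of the fused QKV `c_attn` parameter). The shard-reading helpers themselves are not part of this diff; the code below is a minimal sketch of how such helpers could work, assuming the stream must be left positioned immediately after the full matrix. The `*Sketch` names are hypothetical, not the repository's actual implementations.

#include <cstdint>
#include <fstream>

// Sketch only: read rows [row_start, row_start + row_cnt) of a rows x cols float32 matrix
// stored row-major in `ifs`, leaving the stream positioned after the full matrix. This is the
// access pattern a ColumnParallelLinear shard (split on the output dimension, i.e. the
// checkpoint's "rows") would need.
inline void ReadMatrixRowShardFloatSketch(std::ifstream &ifs, float *dst, int64_t rows, int64_t cols,
                                          int64_t row_start, int64_t row_cnt) {
    ifs.seekg(static_cast<std::streamoff>(row_start * cols * sizeof(float)), std::ios::cur); // skip preceding rows
    ifs.read(reinterpret_cast<char *>(dst), row_cnt * cols * sizeof(float));                 // read this rank's rows
    ifs.seekg(static_cast<std::streamoff>((rows - row_start - row_cnt) * cols * sizeof(float)),
              std::ios::cur); // skip the remaining rows
}

// Sketch only: read columns [col_start, col_start + col_cnt) of every row, as a
// RowParallelLinear shard (split on the input dimension, i.e. the checkpoint's "columns") would need.
inline void ReadMatrixColShardFloatSketch(std::ifstream &ifs, float *dst, int64_t rows, int64_t cols,
                                          int64_t col_start, int64_t col_cnt) {
    for (int64_t r = 0; r < rows; ++r) {
        ifs.seekg(static_cast<std::streamoff>(col_start * sizeof(float)), std::ios::cur); // skip leading columns
        ifs.read(reinterpret_cast<char *>(dst + r * col_cnt), col_cnt * sizeof(float));   // read this rank's columns
        ifs.seekg(static_cast<std::streamoff>((cols - col_start - col_cnt) * sizeof(float)),
                  std::ios::cur); // skip trailing columns
    }
}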