diff --git a/profile_gpt2cu.py b/profile_gpt2cu.py
index 4113d7819..de2edfda9 100644
--- a/profile_gpt2cu.py
+++ b/profile_gpt2cu.py
@@ -50,8 +50,6 @@
 # model config
 CLS_START = -1
 CLS_NUM = 6
-NORM_ID = 44
-ADAM_ID = 45
 N_LAYERS = 12
 
 summaries = defaultdict(lambda: 0.0)
@@ -132,7 +130,7 @@
         # the classifier part, counts only once
         pass_name = "cls"
         phase = "bwd"
-    elif "adamw" in kernel:
+    elif "adamw" in kernel or "global_norm" in kernel:
        # encoder layer or adam
         pass_name = "opt"
         # before the first optimizer run, we create weight copies.
diff --git a/train_gpt2.cu b/train_gpt2.cu
index 088afd69c..05a0ba2fc 100644
--- a/train_gpt2.cu
+++ b/train_gpt2.cu
@@ -1201,18 +1201,13 @@ __global__ void adamw_kernel3(Tp* params_memory, float* master_params_memory, Tg
 }
 
 template<class T>
-__global__ void norm_kernel(float* out, const T* data, size_t count) {
+__global__ void global_norm_kernel(float* out, const T* data, size_t count) {
     // we want as few atomics as possible, so each block tries to do
     // the maximum amount of work (so no fixed chunk, but instead iterating
     // until we run out of data), and then we reduce inside the block
     // and finally have just one atomic per block.
-    namespace cg = cooperative_groups;
-    cg::thread_block block = cg::this_thread_block();
-    cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);
-
-    __shared__ float block_result[32];
-
-    // out will be updated atomically from all thread blocks
+    // out will be updated atomically from all thread blocks. It is a float, so the
+    // atomic op is unproblematic
     size_t index = threadIdx.x + blockDim.x * blockIdx.x;
     size_t grid_width = blockDim.x * gridDim.x;
     float accumulator = 0.f;
@@ -1220,15 +1215,9 @@ __global__ void norm_kernel(float* out, const T* data, size_t count) {
         accumulator += (float)data[i] * (float)data[i];
     }
     // warp-level reduce
-    float warp_result = cg::reduce(warp, accumulator, cg::plus<float>{});
-    block_result[warp.meta_group_rank()] = warp_result;
-    block.sync();
-    if(warp.meta_group_rank() == 0) {
-        float gather = warp.thread_rank() < warp.meta_group_size() ? block_result[warp.thread_rank()] : 0.f;
-        float block_sum = cg::reduce(warp, gather, cg::plus<float>{});
-        if(warp.thread_rank() == 0) {
-            atomicAdd(out, block_sum);
-        }
+    float block_sum = blockReduce<warpReduceSum>(accumulator);
+    if(threadIdx.x == 0) {
+        atomicAdd(out, block_sum);
     }
 }
 
@@ -1716,9 +1705,9 @@ void global_norm(float* out, const T* values, size_t count) {
     // one block too many is catastrophic, since it only can start once all the other
     // blocks finish. anyway, I think cuda_threads_per_SM should be a multiple of 512
     // on all gpus, so the division really is going to be exact.
-    const int grid_size = cuda_threads_per_SM * cuda_num_SMs / block_size;
+    const int grid_size = deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount / block_size;
     assert(grid_size > 0);      // gives a better error than letting the call below fail
-    norm_kernel<<<grid_size, block_size>>>(out, values, count);
+    global_norm_kernel<<<grid_size, block_size>>>(out, values, count);
     cudaCheck(cudaGetLastError());
 }
 
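
Note (not part of the patch): the rewritten kernel calls a blockReduce<warpReduceSum> helper instead of building its own cooperative-groups reduction. The standalone sketch below re-declares stand-ins for warpReduceSum and blockReduce under those names purely for illustration; the actual helper in train_gpt2.cu may have a different signature. It copies the new global_norm_kernel body and a toy main() that sizes the grid from deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount, the same way the updated global_norm launcher does, and checks the result against a CPU sum of squares.

// sketch only, compile with: nvcc -O2 global_norm_sketch.cu -o global_norm_sketch
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

__device__ float warpReduceSum(float val) {
    // butterfly reduction: after the loop, every lane holds the warp-wide sum
    for (int offset = 16; offset > 0; offset /= 2) {
        val += __shfl_xor_sync(0xffffffffu, val, offset);
    }
    return val;
}

using reduction_func_t = float (*)(float);

template<reduction_func_t warp_reduction>
__device__ float blockReduce(float val) {
    // one partial per warp in shared memory; assumes blockDim.x is a multiple of 32
    __shared__ float shared[32];
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    int num_warps = blockDim.x / 32;

    float warp_val = warp_reduction(val);
    if (lane_id == 0) { shared[warp_id] = warp_val; }
    __syncthreads();
    // each warp re-reduces the per-warp partials, so every thread returns the block sum
    float block_val = (lane_id < num_warps) ? shared[lane_id] : 0.0f;
    return warp_reduction(block_val);
}

template<class T>
__global__ void global_norm_kernel(float* out, const T* data, size_t count) {
    // grid-stride loop: each block keeps accumulating until the data runs out,
    // then contributes exactly one atomicAdd
    size_t index = threadIdx.x + (size_t)blockDim.x * blockIdx.x;
    size_t grid_width = (size_t)blockDim.x * gridDim.x;
    float accumulator = 0.f;
    for (size_t i = index; i < count; i += grid_width) {
        accumulator += (float)data[i] * (float)data[i];
    }
    float block_sum = blockReduce<warpReduceSum>(accumulator);
    if (threadIdx.x == 0) { atomicAdd(out, block_sum); }
}

int main() {
    const size_t count = 1 << 20;
    const int block_size = 512;

    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    // exactly one wave of blocks: fill every SM, never leave a tail block waiting
    int grid_size = deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount / block_size;

    float *d_data, *d_out;
    cudaMalloc(&d_data, count * sizeof(float));
    cudaMalloc(&d_out, sizeof(float));
    cudaMemset(d_out, 0, sizeof(float));

    float* h_data = (float*)malloc(count * sizeof(float));
    double expected = 0.0;
    for (size_t i = 0; i < count; i++) {
        h_data[i] = 0.001f * (float)(i % 100);
        expected += (double)h_data[i] * h_data[i];
    }
    cudaMemcpy(d_data, h_data, count * sizeof(float), cudaMemcpyHostToDevice);

    global_norm_kernel<<<grid_size, block_size>>>(d_out, d_data, count);

    float result = 0.f;
    cudaMemcpy(&result, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("sum of squares: GPU %.4f vs CPU %.4f\n", result, (float)expected);

    cudaFree(d_data); cudaFree(d_out); free(h_data);
    return 0;
}

The one-wave grid sizing and the grid-stride loop work together: every block stays busy until the data is exhausted, which is what keeps the kernel at exactly one atomicAdd per block, as the kernel's own comment explains.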