Skip to content

Commit d8e2a36

Browse files
committed
ok crossentropy is fairly ez, just have to be careful with logs of negative numbers in debug tests. so this should be all the layers now
1 parent 46ee0a3 commit d8e2a36

File tree

1 file changed

+198
-0
lines changed

1 file changed

+198
-0
lines changed

dev/cuda/crossentropy_forward.cu

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
/*
2+
Kernels for crossentropy forward pass.
3+
4+
Compile example:
5+
nvcc -O3 --use_fast_math crossentropy_forward.cu -o crossentropy_forward
6+
7+
version 1 is a straight-forward port from CPU code to kernel, parallel over B,T
8+
./crossentropy_forward 1
9+
*/
10+
11+
#include <stdio.h>
12+
#include <stdlib.h>
13+
#include <cuda_runtime.h>
14+
15+
// ----------------------------------------------------------------------------
16+
// CUDA utils
17+
18+
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
19+
20+
// error checking
21+
void cudaCheck(cudaError_t error, const char *file, int line) {
22+
if (error != cudaSuccess) {
23+
printf("[CUDA ERROR] at file %s:%d:\n%s\n", file, line,
24+
cudaGetErrorString(error));
25+
exit(EXIT_FAILURE);
26+
}
27+
};
28+
#define cudaCheck(err) (cudaCheck(err, __FILE__, __LINE__))
29+
30+
// ----------------------------------------------------------------------------
31+
// CPU code reference
32+
33+
void crossentropy_forward_cpu(float* losses,
34+
float* probs, int* targets,
35+
int B, int T, int V) {
36+
// output: losses is (B,T) of the individual losses at each position
37+
// input: probs are (B,T,V) of the probabilities
38+
// input: targets is (B,T) of integers giving the correct index in logits
39+
for (int b = 0; b < B; b++) {
40+
for (int t = 0; t < T; t++) {
41+
// loss = -log(probs[target])
42+
float* probs_bt = probs + b * T * V + t * V;
43+
int ix = targets[b * T + t];
44+
losses[b * T + t] = -logf(probs_bt[ix]);
45+
}
46+
}
47+
}
48+
49+
// ----------------------------------------------------------------------------
50+
// GPU kernels
51+
52+
__global__ void crossentropy_forward_kernel1(float* losses,
53+
float* probs, int* targets,
54+
int B, int T, int V) {
55+
int i = blockIdx.x * blockDim.x + threadIdx.x;
56+
if (i < B * T) {
57+
int b = i / T;
58+
int t = i % T;
59+
float* probs_bt = probs + b * T * V + t * V;
60+
int ix = targets[b * T + t];
61+
losses[b * T + t] = -logf(probs_bt[ix]);
62+
}
63+
}
64+
65+
// ----------------------------------------------------------------------------
66+
// kernel launcher
67+
68+
void crossentropy_forward1(float* losses,
69+
float* probs, int* targets,
70+
int B, int T, int V,
71+
const int block_size) {
72+
const int N = B * T;
73+
const int grid_size = CEIL_DIV(N, block_size);
74+
crossentropy_forward_kernel1<<<grid_size, block_size>>>(losses, probs, targets, B, T, V);
75+
cudaCheck(cudaGetLastError());
76+
}
77+
78+
// kernel version dispatch
79+
void crossentropy_forward(int kernel_num,
80+
float* losses,
81+
float* probs, int* targets,
82+
int B, int T, int V,
83+
const int block_size) {
84+
switch (kernel_num) {
85+
case 1:
86+
crossentropy_forward1(losses, probs, targets, B, T, V, block_size);
87+
break;
88+
default:
89+
printf("Invalid kernel number\n");
90+
exit(1);
91+
}
92+
}
93+
94+
// ----------------------------------------------------------------------------
95+
// random utils
96+
97+
float* make_random_float(int N) {
98+
float* arr = (float*)malloc(N * sizeof(float));
99+
for (int i = 0; i < N; i++) {
100+
arr[i] = ((float)rand() / RAND_MAX); // [0,1)
101+
}
102+
return arr;
103+
}
104+
105+
int* make_random_int(int N, int V) {
106+
int* arr = (int*)malloc(N * sizeof(int));
107+
for (int i = 0; i < N; i++) {
108+
arr[i] = rand() % V;
109+
}
110+
return arr;
111+
}
112+
113+
// ----------------------------------------------------------------------------
114+
115+
int main(int argc, char **argv) {
116+
srand(0);
117+
118+
int B = 8;
119+
int T = 1024;
120+
int V = 50257;
121+
122+
int deviceIdx = 0;
123+
cudaCheck(cudaSetDevice(deviceIdx));
124+
125+
// create host memory of random numbers
126+
float* out = (float*)malloc(B * T * sizeof(float));
127+
float* probs = make_random_float(B * T * V);
128+
int* targets = make_random_int(B * T, V);
129+
130+
// move to GPU
131+
float* d_out;
132+
float* d_probs;
133+
int* d_targets;
134+
cudaCheck(cudaMalloc(&d_out, B * T * sizeof(float)));
135+
cudaCheck(cudaMalloc(&d_probs, B * T * V * sizeof(float)));
136+
cudaCheck(cudaMalloc(&d_targets, B * T * sizeof(int)));
137+
cudaCheck(cudaMemcpy(d_probs, probs, B * T * V * sizeof(float), cudaMemcpyHostToDevice));
138+
cudaCheck(cudaMemcpy(d_targets, targets, B * T * sizeof(int), cudaMemcpyHostToDevice));
139+
140+
// read kernel_num from command line
141+
int kernel_num = 1;
142+
if (argc > 1) {
143+
kernel_num = atoi(argv[1]);
144+
}
145+
printf("Using kernel %d\n", kernel_num);
146+
147+
// first check the correctness of the kernel
148+
crossentropy_forward_cpu(out, probs, targets, B, T, V);
149+
crossentropy_forward(kernel_num, d_out, d_probs, d_targets, B, T, V, 256);
150+
float* out_gpu = (float*)malloc(B * T * sizeof(float));
151+
cudaCheck(cudaMemcpy(out_gpu, d_out, B * T * sizeof(float), cudaMemcpyDeviceToHost));
152+
for (int i = 0; i < B * T; i++) {
153+
// print the first few comparisons
154+
if (i < 10) {
155+
printf("%f %f\n", out[i], out_gpu[i]);
156+
}
157+
// ensure correctness for all elements
158+
if (fabs(out[i] - out_gpu[i]) > 1e-5) {
159+
printf("Mismatch at %d: %f vs %f\n", i, out[i], out_gpu[i]);
160+
exit(1);
161+
}
162+
}
163+
printf("Results match at block_size=256!\n");
164+
165+
// time the kernel at different block sizes
166+
int block_sizes[] = {32, 64, 128, 256, 512, 1024};
167+
168+
for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) {
169+
int block_size = block_sizes[j];
170+
171+
int repeat_times = 1000;
172+
cudaEvent_t start, stop;
173+
cudaCheck(cudaEventCreate(&start));
174+
cudaCheck(cudaEventCreate(&stop));
175+
cudaCheck(cudaEventRecord(start, 0));
176+
for (int i = 0; i < repeat_times; i++) {
177+
crossentropy_forward(kernel_num, d_out, d_probs, d_targets, B, T, V, block_size);
178+
}
179+
cudaCheck(cudaEventRecord(stop, 0));
180+
cudaCheck(cudaEventSynchronize(start));
181+
cudaCheck(cudaEventSynchronize(stop));
182+
float elapsed_time;
183+
cudaCheck(cudaEventElapsedTime(&elapsed_time, start, stop));
184+
185+
printf("block_size %4d | time %f ms\n", block_size, elapsed_time / repeat_times);
186+
}
187+
188+
// free memory
189+
free(out);
190+
free(probs);
191+
free(targets);
192+
free(out_gpu);
193+
cudaCheck(cudaFree(d_out));
194+
cudaCheck(cudaFree(d_probs));
195+
cudaCheck(cudaFree(d_targets));
196+
197+
return 0;
198+
}

0 commit comments

Comments
 (0)