Skip to content

Commit 7b06481

Browse files
committed
timing formatting
1 parent 9e1ed2b commit 7b06481

7 files changed

+127
-86
lines changed

.gitignore

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
cpu_spmv
1+
_cpu_spmv_driver
2+
_gpu_spmv_driver
23
mtx
3-
gpu_spmv

Makefile

+3-3
Original file line numberDiff line numberDiff line change
@@ -153,21 +153,21 @@ DEPS = $(call rwildcard, $(CUB_DIR),*.cuh) \
153153
#-------------------------------------------------------------------------------
154154

155155
clean :
156-
rm -f gpu_spmv cpu_spmv
156+
rm -f _gpu_spmv_driver _cpu_spmv_driver
157157

158158

159159
#-------------------------------------------------------------------------------
160160
# make gpu_spmv
161161
#-------------------------------------------------------------------------------
162162

163163
gpu_spmv : gpu_spmv.cu $(DEPS)
164-
$(NVCC) $(DEFINES) $(SM_TARGETS) -o gpu_spmv gpu_spmv.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -lcusparse -O3
164+
$(NVCC) $(DEFINES) $(SM_TARGETS) -o _gpu_spmv_driver gpu_spmv.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -lcusparse -O3
165165

166166

167167
#-------------------------------------------------------------------------------
168168
# make cpu_spmv
169169
#-------------------------------------------------------------------------------
170170

171171
cpu_spmv : cpu_spmv.cpp $(DEPS)
172-
$(OMPCC) $(DEFINES) -DCUB_MKL -o cpu_spmv cpu_spmv.cpp $(OMPCC_FLAGS)
172+
$(OMPCC) $(DEFINES) -DCUB_MKL -o _cpu_spmv_driver cpu_spmv.cpp $(OMPCC_FLAGS)
173173

cpu_spmv

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
#!/bin/bash
2+
# Wrapper around ./_cpu_spmv_driver: sets the OpenMP thread-affinity variable, then forwards all arguments.
3+
export KMP_AFFINITY=granularity=core,scatter  # must be exported, otherwise the child driver process never sees it
4+
./_cpu_spmv_driver "$@"  # quoted so each argument is forwarded intact (no re-splitting or glob expansion)

cpu_spmv.cpp

+43-34
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,7 @@
5858
#include <iostream>
5959
#include <limits>
6060

61-
#ifdef CUB_MKL
62-
#include <mkl.h>
63-
#endif
61+
#include <mkl.h>
6462

6563
#include "sparse_matrix.h"
6664
#include "utils.h"
@@ -366,8 +364,11 @@ float TestOmpMergeCsrmv(
366364
ValueT* vector_x,
367365
ValueT* reference_vector_y_out,
368366
ValueT* vector_y_out,
369-
int timing_iterations)
367+
int timing_iterations,
368+
float &setup_ms)
370369
{
370+
setup_ms = 0.0;
371+
371372
if (g_omp_threads == -1)
372373
g_omp_threads = omp_get_num_procs();
373374
int num_threads = g_omp_threads;
@@ -383,24 +384,25 @@ float TestOmpMergeCsrmv(
383384
// Check answer
384385
int compare = CompareResults(reference_vector_y_out, vector_y_out, a.num_rows, true);
385386
printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
386-
387-
// Re-populate caches, etc.
388-
memset(vector_y_out, -1, sizeof(ValueT) * a.num_rows);
389-
OmpMergeCsrmv(g_omp_threads, a, a.row_offsets + 1, a.column_indices, a.values, vector_x, vector_y_out);
390387
}
388+
389+
// Re-populate caches, etc.
390+
OmpMergeCsrmv(g_omp_threads, a, a.row_offsets + 1, a.column_indices, a.values, vector_x, vector_y_out);
391+
OmpMergeCsrmv(g_omp_threads, a, a.row_offsets + 1, a.column_indices, a.values, vector_x, vector_y_out);
392+
OmpMergeCsrmv(g_omp_threads, a, a.row_offsets + 1, a.column_indices, a.values, vector_x, vector_y_out);
391393

392394
// Timing
393-
float elapsed_millis = 0.0;
395+
float elapsed_ms = 0.0;
394396
CpuTimer timer;
395397
timer.Start();
396398
for(int it = 0; it < timing_iterations; ++it)
397399
{
398400
OmpMergeCsrmv(g_omp_threads, a, a.row_offsets + 1, a.column_indices, a.values, vector_x, vector_y_out);
399401
}
400402
timer.Stop();
401-
elapsed_millis += timer.ElapsedMillis();
403+
elapsed_ms += timer.ElapsedMillis();
402404

403-
return elapsed_millis / timing_iterations;
405+
return elapsed_ms / timing_iterations;
404406
}
405407

406408

@@ -452,8 +454,11 @@ float TestMklCsrmv(
452454
ValueT* vector_x,
453455
ValueT* reference_vector_y_out,
454456
ValueT* vector_y_out,
455-
int timing_iterations)
457+
int timing_iterations,
458+
float &setup_ms)
456459
{
460+
setup_ms = 0.0;
461+
457462
// Warmup/correctness
458463
memset(vector_y_out, -1, sizeof(ValueT) * a.num_rows);
459464
MklCsrmv(g_omp_threads, a, a.row_offsets + 1, a.column_indices, a.values, vector_x, vector_y_out);
@@ -463,25 +468,29 @@ float TestMklCsrmv(
463468
int compare = CompareResults(reference_vector_y_out, vector_y_out, a.num_rows, true);
464469
printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
465470

466-
// Re-populate caches, etc.
467-
memset(vector_y_out, -1, sizeof(ValueT) * a.num_rows);
468-
MklCsrmv(g_omp_threads, a, a.row_offsets + 1, a.column_indices, a.values, vector_x, vector_y_out);
471+
// memset(vector_y_out, -1, sizeof(ValueT) * a.num_rows);
469472
}
470473

474+
// Re-populate caches, etc.
475+
MklCsrmv(g_omp_threads, a, a.row_offsets + 1, a.column_indices, a.values, vector_x, vector_y_out);
476+
MklCsrmv(g_omp_threads, a, a.row_offsets + 1, a.column_indices, a.values, vector_x, vector_y_out);
477+
MklCsrmv(g_omp_threads, a, a.row_offsets + 1, a.column_indices, a.values, vector_x, vector_y_out);
478+
471479
// Timing
472-
float elapsed_millis = 0.0;
480+
float elapsed_ms = 0.0;
473481
CpuTimer timer;
474482
timer.Start();
475483
for(int it = 0; it < timing_iterations; ++it)
476484
{
477485
MklCsrmv(g_omp_threads, a, a.row_offsets + 1, a.column_indices, a.values, vector_x, vector_y_out);
478486
}
479487
timer.Stop();
480-
elapsed_millis += timer.ElapsedMillis();
488+
elapsed_ms += timer.ElapsedMillis();
481489

482-
return elapsed_millis / timing_iterations;
490+
return elapsed_ms / timing_iterations;
483491
}
484492

493+
485494
//---------------------------------------------------------------------
486495
// Test generation
487496
//---------------------------------------------------------------------
@@ -491,25 +500,27 @@ float TestMklCsrmv(
491500
*/
492501
template <typename ValueT, typename OffsetT>
493502
void DisplayPerf(
494-
double avg_millis,
503+
double setup_ms,
504+
double avg_ms,
495505
CsrMatrix<ValueT, OffsetT>& csr_matrix)
496506
{
497507
double nz_throughput, effective_bandwidth;
498508
size_t total_bytes = (csr_matrix.num_nonzeros * (sizeof(ValueT) * 2 + sizeof(OffsetT))) +
499509
(csr_matrix.num_rows) * (sizeof(OffsetT) + sizeof(ValueT));
500510

501-
nz_throughput = double(csr_matrix.num_nonzeros) / avg_millis / 1.0e6;
502-
effective_bandwidth = double(total_bytes) / avg_millis / 1.0e6;
511+
nz_throughput = double(csr_matrix.num_nonzeros) / avg_ms / 1.0e6;
512+
effective_bandwidth = double(total_bytes) / avg_ms / 1.0e6;
503513

504514
if (!g_quiet)
505-
printf("fp%d: %.4f avg ms, %.5f gflops, %.3lf effective GB/s\n",
515+
printf("fp%d: %.4f setup ms, %.4f avg ms, %.5f gflops, %.3lf effective GB/s\n",
506516
int(sizeof(ValueT) * 8),
507-
avg_millis,
517+
setup_ms,
518+
avg_ms,
508519
2 * nz_throughput,
509520
effective_bandwidth);
510521
else
511-
printf("%.5f, %.6f, %.3lf, ",
512-
avg_millis,
522+
printf("%.5f, %.5f, %.6f, %.3lf, ",
523+
setup_ms, avg_ms,
513524
2 * nz_throughput,
514525
effective_bandwidth);
515526

@@ -540,14 +551,14 @@ void RunTests(
540551
if (!mtx_filename.empty())
541552
{
542553
// Parse matrix market file
543-
printf("%s, ", mtx_filename.c_str()); fflush(stdout);
544554
coo_matrix.InitMarket(mtx_filename, 1.0, !g_quiet);
545555

546556
if ((coo_matrix.num_rows == 1) || (coo_matrix.num_cols == 1) || (coo_matrix.num_nonzeros == 1))
547557
{
548558
if (!g_quiet) printf("Trivial dataset\n");
549559
exit(0);
550560
}
561+
printf("%s, ", mtx_filename.c_str()); fflush(stdout);
551562
}
552563
else if (grid2d > 0)
553564
{
@@ -599,7 +610,7 @@ void RunTests(
599610
// Determine # of timing iterations (aim to run 16 billion nonzeros through, total)
600611
if (timing_iterations == -1)
601612
{
602-
timing_iterations = std::min(50000ull, std::max(100ull, ((16ull << 30) / csr_matrix.num_nonzeros)));
613+
timing_iterations = std::min(200000ull, std::max(100ull, ((16ull << 30) / csr_matrix.num_nonzeros)));
603614
if (!g_quiet)
604615
printf("\t%d timing iterations\n", timing_iterations);
605616
}
@@ -631,21 +642,19 @@ void RunTests(
631642
// Compute reference answer
632643
SpmvGold(csr_matrix, vector_x, vector_y_in, reference_vector_y_out, alpha, beta);
633644

634-
float avg_millis;
645+
float avg_ms, setup_ms;
635646

636-
#ifdef CUB_MKL
637647
// MKL SpMV
638648
if (!g_quiet) printf("\n\n");
639649
printf("MKL CsrMV, "); fflush(stdout);
640-
avg_millis = TestMklCsrmv(csr_matrix, vector_x, reference_vector_y_out, vector_y_out, timing_iterations);
641-
DisplayPerf(avg_millis, csr_matrix);
642-
#endif
650+
avg_ms = TestMklCsrmv(csr_matrix, vector_x, reference_vector_y_out, vector_y_out, timing_iterations, setup_ms);
651+
DisplayPerf(setup_ms, avg_ms, csr_matrix);
643652

644653
// Merge SpMV
645654
if (!g_quiet) printf("\n\n");
646655
printf("Merge CsrMV, "); fflush(stdout);
647-
avg_millis = TestOmpMergeCsrmv(csr_matrix, vector_x, reference_vector_y_out, vector_y_out, timing_iterations);
648-
DisplayPerf(avg_millis, csr_matrix);
656+
avg_ms = TestOmpMergeCsrmv(csr_matrix, vector_x, reference_vector_y_out, vector_y_out, timing_iterations, setup_ms);
657+
DisplayPerf(setup_ms, avg_ms, csr_matrix);
649658

650659
// Cleanup
651660
if (csr_matrix.IsNumaMalloc())

eval_csrmv.sh

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/bin/bash
2+
# Prints a CSV header, then runs the given SpMV wrapper with --quiet on every .mtx file found under a dataset directory.
3+
if (( $# < 2 )); then  # accept extra words (e.g. --device=...) after the wrapper name, as the usage text advertises
4+
echo "$0 <mtx dataset dir> <cpu_spmv | gpu_spmv [--device=...]>" >&2  # usage goes to stderr so it never mixes into the CSV
5+
exit 1  # a usage error is a failure, not a success
6+
fi
7+
8+
echo "file, num_rows, num_cols, num_nonzeros, row_length_mean, row_length_std_dev, row_length_variation, row_length_skewness, method_name, setup_ms, avg_spmv_ms, gflops, effective_GBs"
9+
10+
MTX_DIR=$1
11+
12+
shift
13+
# NOTE: the command substitution below is still word-split, so dataset paths containing whitespace are not supported.
14+
for i in $(find "$MTX_DIR" -name '*.mtx')  # pattern quoted so the shell cannot expand it against the current directory
15+
do
16+
./$@ --quiet --mtx="$i"  # $@ left unquoted on purpose: a single quoted "gpu_spmv --device=N" argument must still split into words
17+
done

gpu_spmv

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/bash
2+
# Wrapper around ./_gpu_spmv_driver: forwards all command-line arguments unchanged.
3+
./_gpu_spmv_driver "$@"  # quoted so each argument is forwarded intact (no re-splitting or glob expansion)

0 commit comments

Comments
 (0)