58
58
#include < iostream>
59
59
#include < limits>
60
60
61
- #ifdef CUB_MKL
62
- #include < mkl.h>
63
- #endif
61
+ #include < mkl.h>
64
62
65
63
#include " sparse_matrix.h"
66
64
#include " utils.h"
@@ -366,8 +364,11 @@ float TestOmpMergeCsrmv(
366
364
ValueT* vector_x,
367
365
ValueT* reference_vector_y_out,
368
366
ValueT* vector_y_out,
369
- int timing_iterations)
367
+ int timing_iterations,
368
+ float &setup_ms)
370
369
{
370
+ setup_ms = 0.0 ;
371
+
371
372
if (g_omp_threads == -1 )
372
373
g_omp_threads = omp_get_num_procs ();
373
374
int num_threads = g_omp_threads;
@@ -383,24 +384,25 @@ float TestOmpMergeCsrmv(
383
384
// Check answer
384
385
int compare = CompareResults (reference_vector_y_out, vector_y_out, a.num_rows , true );
385
386
printf (" \t %s\n " , compare ? " FAIL" : " PASS" ); fflush (stdout);
386
-
387
- // Re-populate caches, etc.
388
- memset (vector_y_out, -1 , sizeof (ValueT) * a.num_rows );
389
- OmpMergeCsrmv (g_omp_threads, a, a.row_offsets + 1 , a.column_indices , a.values , vector_x, vector_y_out);
390
387
}
388
+
389
+ // Re-populate caches, etc.
390
+ OmpMergeCsrmv (g_omp_threads, a, a.row_offsets + 1 , a.column_indices , a.values , vector_x, vector_y_out);
391
+ OmpMergeCsrmv (g_omp_threads, a, a.row_offsets + 1 , a.column_indices , a.values , vector_x, vector_y_out);
392
+ OmpMergeCsrmv (g_omp_threads, a, a.row_offsets + 1 , a.column_indices , a.values , vector_x, vector_y_out);
391
393
392
394
// Timing
393
- float elapsed_millis = 0.0 ;
395
+ float elapsed_ms = 0.0 ;
394
396
CpuTimer timer;
395
397
timer.Start ();
396
398
for (int it = 0 ; it < timing_iterations; ++it)
397
399
{
398
400
OmpMergeCsrmv (g_omp_threads, a, a.row_offsets + 1 , a.column_indices , a.values , vector_x, vector_y_out);
399
401
}
400
402
timer.Stop ();
401
- elapsed_millis += timer.ElapsedMillis ();
403
+ elapsed_ms += timer.ElapsedMillis ();
402
404
403
- return elapsed_millis / timing_iterations;
405
+ return elapsed_ms / timing_iterations;
404
406
}
405
407
406
408
@@ -452,8 +454,11 @@ float TestMklCsrmv(
452
454
ValueT* vector_x,
453
455
ValueT* reference_vector_y_out,
454
456
ValueT* vector_y_out,
455
- int timing_iterations)
457
+ int timing_iterations,
458
+ float &setup_ms)
456
459
{
460
+ setup_ms = 0.0 ;
461
+
457
462
// Warmup/correctness
458
463
memset (vector_y_out, -1 , sizeof (ValueT) * a.num_rows );
459
464
MklCsrmv (g_omp_threads, a, a.row_offsets + 1 , a.column_indices , a.values , vector_x, vector_y_out);
@@ -463,25 +468,29 @@ float TestMklCsrmv(
463
468
int compare = CompareResults (reference_vector_y_out, vector_y_out, a.num_rows , true );
464
469
printf (" \t %s\n " , compare ? " FAIL" : " PASS" ); fflush (stdout);
465
470
466
- // Re-populate caches, etc.
467
- memset (vector_y_out, -1 , sizeof (ValueT) * a.num_rows );
468
- MklCsrmv (g_omp_threads, a, a.row_offsets + 1 , a.column_indices , a.values , vector_x, vector_y_out);
471
+ // memset(vector_y_out, -1, sizeof(ValueT) * a.num_rows);
469
472
}
470
473
474
+ // Re-populate caches, etc.
475
+ MklCsrmv (g_omp_threads, a, a.row_offsets + 1 , a.column_indices , a.values , vector_x, vector_y_out);
476
+ MklCsrmv (g_omp_threads, a, a.row_offsets + 1 , a.column_indices , a.values , vector_x, vector_y_out);
477
+ MklCsrmv (g_omp_threads, a, a.row_offsets + 1 , a.column_indices , a.values , vector_x, vector_y_out);
478
+
471
479
// Timing
472
- float elapsed_millis = 0.0 ;
480
+ float elapsed_ms = 0.0 ;
473
481
CpuTimer timer;
474
482
timer.Start ();
475
483
for (int it = 0 ; it < timing_iterations; ++it)
476
484
{
477
485
MklCsrmv (g_omp_threads, a, a.row_offsets + 1 , a.column_indices , a.values , vector_x, vector_y_out);
478
486
}
479
487
timer.Stop ();
480
- elapsed_millis += timer.ElapsedMillis ();
488
+ elapsed_ms += timer.ElapsedMillis ();
481
489
482
- return elapsed_millis / timing_iterations;
490
+ return elapsed_ms / timing_iterations;
483
491
}
484
492
493
+
485
494
// ---------------------------------------------------------------------
486
495
// Test generation
487
496
// ---------------------------------------------------------------------
@@ -491,25 +500,27 @@ float TestMklCsrmv(
491
500
*/
492
501
template <typename ValueT, typename OffsetT>
493
502
void DisplayPerf (
494
- double avg_millis,
503
+ double setup_ms,
504
+ double avg_ms,
495
505
CsrMatrix<ValueT, OffsetT>& csr_matrix)
496
506
{
497
507
double nz_throughput, effective_bandwidth;
498
508
size_t total_bytes = (csr_matrix.num_nonzeros * (sizeof (ValueT) * 2 + sizeof (OffsetT))) +
499
509
(csr_matrix.num_rows ) * (sizeof (OffsetT) + sizeof (ValueT));
500
510
501
- nz_throughput = double (csr_matrix.num_nonzeros ) / avg_millis / 1.0e6 ;
502
- effective_bandwidth = double (total_bytes) / avg_millis / 1.0e6 ;
511
+ nz_throughput = double (csr_matrix.num_nonzeros ) / avg_ms / 1.0e6 ;
512
+ effective_bandwidth = double (total_bytes) / avg_ms / 1.0e6 ;
503
513
504
514
if (!g_quiet)
505
- printf (" fp%d: %.4f avg ms, %.5f gflops, %.3lf effective GB/s\n " ,
515
+ printf (" fp%d: %.4f setup ms, %.4f avg ms, %.5f gflops, %.3lf effective GB/s\n " ,
506
516
int (sizeof (ValueT) * 8 ),
507
- avg_millis,
517
+ setup_ms,
518
+ avg_ms,
508
519
2 * nz_throughput,
509
520
effective_bandwidth);
510
521
else
511
- printf (" %.5f, %.6f, %.3lf, " ,
512
- avg_millis ,
522
+ printf (" %.5f, %.5f, %. 6f, %.3lf, " ,
523
+ setup_ms, avg_ms ,
513
524
2 * nz_throughput,
514
525
effective_bandwidth);
515
526
@@ -540,14 +551,14 @@ void RunTests(
540
551
if (!mtx_filename.empty ())
541
552
{
542
553
// Parse matrix market file
543
- printf (" %s, " , mtx_filename.c_str ()); fflush (stdout);
544
554
coo_matrix.InitMarket (mtx_filename, 1.0 , !g_quiet);
545
555
546
556
if ((coo_matrix.num_rows == 1 ) || (coo_matrix.num_cols == 1 ) || (coo_matrix.num_nonzeros == 1 ))
547
557
{
548
558
if (!g_quiet) printf (" Trivial dataset\n " );
549
559
exit (0 );
550
560
}
561
+ printf (" %s, " , mtx_filename.c_str ()); fflush (stdout);
551
562
}
552
563
else if (grid2d > 0 )
553
564
{
@@ -599,7 +610,7 @@ void RunTests(
599
610
// Determine # of timing iterations (aim to run 16 billion nonzeros through, total)
600
611
if (timing_iterations == -1 )
601
612
{
602
- timing_iterations = std::min (50000ull , std::max (100ull , ((16ull << 30 ) / csr_matrix.num_nonzeros )));
613
+ timing_iterations = std::min (200000ull , std::max (100ull , ((16ull << 30 ) / csr_matrix.num_nonzeros )));
603
614
if (!g_quiet)
604
615
printf (" \t %d timing iterations\n " , timing_iterations);
605
616
}
@@ -631,21 +642,19 @@ void RunTests(
631
642
// Compute reference answer
632
643
SpmvGold (csr_matrix, vector_x, vector_y_in, reference_vector_y_out, alpha, beta);
633
644
634
- float avg_millis ;
645
+ float avg_ms, setup_ms ;
635
646
636
- #ifdef CUB_MKL
637
647
// MKL SpMV
638
648
if (!g_quiet) printf (" \n\n " );
639
649
printf (" MKL CsrMV, " ); fflush (stdout);
640
- avg_millis = TestMklCsrmv (csr_matrix, vector_x, reference_vector_y_out, vector_y_out, timing_iterations);
641
- DisplayPerf (avg_millis, csr_matrix);
642
- #endif
650
+ avg_ms = TestMklCsrmv (csr_matrix, vector_x, reference_vector_y_out, vector_y_out, timing_iterations, setup_ms);
651
+ DisplayPerf (setup_ms, avg_ms, csr_matrix);
643
652
644
653
// Merge SpMV
645
654
if (!g_quiet) printf (" \n\n " );
646
655
printf (" Merge CsrMV, " ); fflush (stdout);
647
- avg_millis = TestOmpMergeCsrmv (csr_matrix, vector_x, reference_vector_y_out, vector_y_out, timing_iterations);
648
- DisplayPerf (avg_millis , csr_matrix);
656
+ avg_ms = TestOmpMergeCsrmv (csr_matrix, vector_x, reference_vector_y_out, vector_y_out, timing_iterations, setup_ms );
657
+ DisplayPerf (setup_ms, avg_ms , csr_matrix);
649
658
650
659
// Cleanup
651
660
if (csr_matrix.IsNumaMalloc ())
0 commit comments