Skip to content

Commit eff76d7

Browse files
author
Jeff Hammond
committed
add onemkl gemm test that supports SP/DP and all devices
1 parent 7c1b698 commit eff76d7

File tree

1 file changed

+263
-0
lines changed

1 file changed

+263
-0
lines changed

Cxx11/xgemm-onemkl.cc

+263
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,263 @@
1+
///
2+
/// Copyright (c) 2020, Intel Corporation
3+
///
4+
/// Redistribution and use in source and binary forms, with or without
5+
/// modification, are permitted provided that the following conditions
6+
/// are met:
7+
///
8+
/// * Redistributions of source code must retain the above copyright
9+
/// notice, this list of conditions and the following disclaimer.
10+
/// * Redistributions in binary form must reproduce the above
11+
/// copyright notice, this list of conditions and the following
12+
/// disclaimer in the documentation and/or other materials provided
13+
/// with the distribution.
14+
/// * Neither the name of Intel Corporation nor the names of its
15+
/// contributors may be used to endorse or promote products
16+
/// derived from this software without specific prior written
17+
/// permission.
18+
///
19+
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20+
/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21+
/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22+
/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23+
/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
24+
/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
25+
/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26+
/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27+
/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28+
/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
29+
/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30+
/// POSSIBILITY OF SUCH DAMAGE.
31+
32+
//////////////////////////////////////////////////////////////////////
33+
///
34+
/// NAME: gemm
35+
///
36+
/// PURPOSE: This program tests the efficiency with which a dense matrix
37+
/// dense multiplication is carried out
38+
///
39+
/// USAGE: The program takes as input the matrix order,
40+
/// the number of times the matrix-matrix multiplication
41+
/// is carried out, and, optionally, a tile size for matrix
42+
/// blocking
43+
///
44+
/// <progname> <# iterations> <matrix order>
45+
///
46+
/// The output consists of diagnostics to make sure the
47+
/// algorithm worked, and of timing statistics.
48+
///
49+
/// FUNCTIONS CALLED:
50+
///
51+
/// Other than OpenMP or standard C functions, the following
52+
/// functions are used in this program:
53+
///
54+
/// HISTORY: Written by Rob Van der Wijngaart, February 2009.
55+
/// Converted to C++11 by Jeff Hammond, December, 2017.
56+
///
57+
//////////////////////////////////////////////////////////////////////
58+
59+
#include "prk_sycl.h"
60+
#include "prk_util.h"
61+
62+
#if BETA9 // and older
63+
#include <mkl_blas_sycl.hpp>
64+
#else
65+
#include <oneapi/mkl/blas.hpp>
66+
#endif
67+
68+
using namespace oneapi; // oneapi::mkl -> mkl
69+
70+
template <typename T>
71+
void run(sycl::queue & q, int iterations, int order)
72+
{
73+
double gemm_time{0};
74+
75+
const size_t nelems = (size_t)order * (size_t)order;
76+
const size_t bytes = nelems * sizeof(T);
77+
auto h_a = syclx::malloc_host<T>( nelems, q);
78+
auto h_b = syclx::malloc_host<T>( nelems, q);
79+
auto h_c = syclx::malloc_host<T>( nelems, q);
80+
81+
for (int i=0; i<order; ++i) {
82+
for (int j=0; j<order; ++j) {
83+
h_a[i*order+j] = i;
84+
h_b[i*order+j] = i;
85+
h_c[i*order+j] = 0;
86+
}
87+
}
88+
89+
// copy input from host to device
90+
auto A = syclx::malloc_device<T>( nelems, q);
91+
auto B = syclx::malloc_device<T>( nelems, q);
92+
auto C = syclx::malloc_device<T>( nelems, q);
93+
q.wait();
94+
95+
q.memcpy(A, &(h_a[0]), bytes).wait();
96+
q.memcpy(B, &(h_b[0]), bytes).wait();
97+
q.memcpy(C, &(h_c[0]), bytes).wait();
98+
q.wait();
99+
100+
syclx::free(h_a, q);
101+
syclx::free(h_b, q);
102+
103+
{
104+
for (int iter = 0; iter<=iterations; iter++) {
105+
106+
if (iter==1) gemm_time = prk::wtime();
107+
108+
const T alpha{1};
109+
const T beta{1};
110+
111+
mkl::blas::gemm(q, mkl::transpose::nontrans, // opA
112+
mkl::transpose::nontrans, // opB
113+
order, order, order, // m, n, k
114+
alpha, // alpha
115+
A, order, // A, lda
116+
B, order, // B, ldb
117+
beta, // beta
118+
C, order); // C, ldc
119+
q.wait();
120+
}
121+
gemm_time = prk::wtime() - gemm_time;
122+
}
123+
// copy output back to host
124+
q.memcpy(&(h_c[0]), C, bytes).wait();
125+
126+
syclx::free(C, q);
127+
syclx::free(B, q);
128+
syclx::free(A, q);
129+
130+
//////////////////////////////////////////////////////////////////////
131+
/// Analyze and output results
132+
//////////////////////////////////////////////////////////////////////
133+
134+
const double forder = static_cast<double>(order);
135+
const double reference = 0.25 * prk::pow(forder,3) * prk::pow(forder-1.0,2) * (iterations+1);
136+
double checksum{0};
137+
for (int i=0; i<nelems; ++i) {
138+
checksum += h_c[i];
139+
}
140+
const double residuum = std::abs(checksum - reference) / reference;
141+
const double epsilon{1.0e-8};
142+
if (residuum < epsilon) {
143+
#if VERBOSE
144+
std::cout << "Reference checksum = " << reference << "\n"
145+
<< "Actual checksum = " << checksum << std::endl;
146+
#endif
147+
std::cout << "Solution validates" << std::endl;
148+
auto avgtime = gemm_time/iterations;
149+
auto nflops = 2.0 * prk::pow(forder,3);
150+
std::cout << "FP" << 8*sizeof(T)
151+
<< "Rate (MF/s): " << 1.0e-6 * nflops/avgtime
152+
<< " Avg time (s): " << avgtime << std::endl;
153+
} else {
154+
std::cout << "Reference checksum = " << reference << "\n"
155+
<< "Residuum = " << residuum << std::endl;
156+
}
157+
158+
syclx::free(h_c, q);
159+
}
160+
161+
int main(int argc, char * argv[])
162+
{
163+
std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
164+
std::cout << "C++11/oneMKL Dense matrix-matrix multiplication: C += A x B" << std::endl;
165+
166+
//////////////////////////////////////////////////////////////////////
167+
/// Read and test input parameters
168+
//////////////////////////////////////////////////////////////////////
169+
170+
int iterations;
171+
int order;
172+
try {
173+
if (argc < 2) {
174+
throw "Usage: <# iterations> <matrix order>";
175+
}
176+
177+
iterations = std::atoi(argv[1]);
178+
if (iterations < 1) {
179+
throw "ERROR: iterations must be >= 1";
180+
}
181+
182+
order = std::atoi(argv[2]);
183+
if (order <= 0) {
184+
throw "ERROR: Matrix Order must be greater than 0";
185+
} else if (order > prk::get_max_matrix_size()) {
186+
throw "ERROR: matrix dimension too large - overflow risk";
187+
}
188+
}
189+
catch (const char * e) {
190+
std::cout << e << std::endl;
191+
return 1;
192+
}
193+
194+
std::cout << "Number of iterations = " << iterations << std::endl;
195+
std::cout << "Matrix order = " << order << std::endl;
196+
197+
//////////////////////////////////////////////////////////////////////
198+
/// Setup SYCL environment
199+
//////////////////////////////////////////////////////////////////////
200+
201+
try {
202+
sycl::queue q{sycl::host_selector{}};
203+
prk::SYCL::print_device_platform(q);
204+
run<float>(q, iterations, order);
205+
run<double>(q, iterations, order);
206+
}
207+
catch (sycl::exception & e) {
208+
std::cout << e.what() << std::endl;
209+
prk::SYCL::print_exception_details(e);
210+
}
211+
catch (std::exception & e) {
212+
std::cout << e.what() << std::endl;
213+
}
214+
catch (const char * e) {
215+
std::cout << e << std::endl;
216+
}
217+
218+
try {
219+
sycl::queue q{sycl::cpu_selector{}};
220+
prk::SYCL::print_device_platform(q);
221+
run<float>(q, iterations, order);
222+
run<double>(q, iterations, order);
223+
}
224+
catch (sycl::exception & e) {
225+
std::cout << e.what() << std::endl;
226+
prk::SYCL::print_exception_details(e);
227+
}
228+
catch (std::exception & e) {
229+
std::cout << e.what() << std::endl;
230+
}
231+
catch (const char * e) {
232+
std::cout << e << std::endl;
233+
}
234+
235+
try {
236+
sycl::queue q{sycl::gpu_selector{}};
237+
prk::SYCL::print_device_platform(q);
238+
bool has_fp64 = prk::SYCL::has_fp64(q);
239+
if (has_fp64) {
240+
if (prk::SYCL::print_gen12lp_helper(q)) return 1;
241+
}
242+
run<float>(q, iterations, order);
243+
if (has_fp64) {
244+
run<double>(q, iterations, order);
245+
} else {
246+
std::cout << "SYCL GPU device lacks FP64 support." << std::endl;
247+
}
248+
}
249+
catch (sycl::exception & e) {
250+
std::cout << e.what() << std::endl;
251+
prk::SYCL::print_exception_details(e);
252+
}
253+
catch (std::exception & e) {
254+
std::cout << e.what() << std::endl;
255+
}
256+
catch (const char * e) {
257+
std::cout << e << std::endl;
258+
}
259+
260+
return 0;
261+
}
262+
263+

0 commit comments

Comments
 (0)