#include <cstdint>

- #ifndef BARE_METAL
- #include <thread>
- #endif // ifndef BARE_METAL
-
namespace arm_compute
{
namespace test
@@ -41,169 +37,114 @@ namespace validation
{
namespace
{
- constexpr int NUM_THREADS = 3;

template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d, bool reinterpret_output_as_3d, typename OutputType, bool is_fused = false, bool run_twice = false>
- void compute_cpugemmlowp_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo,
+ TensorType compute_cpugemmlowp_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo,
                                      const QuantizationInfo& output_qinfo, DataType data_type_a = DataType::QASYMM8, DataType data_type_b = DataType::QASYMM8,
                                      GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo(), bool reshape_b_only_on_first_run = false, const TensorFillInfo& finfo = TensorFillInfo(),
-                                      bool accumulate = false, bool dynamic_qinfo = false, DataType data_type_output = DataType::UNKNOWN, int num_parallel_runs = 1, TensorType targets[NUM_THREADS] = {})
+                                      bool accumulate = false, bool dynamic_qinfo = false, DataType data_type_output = DataType::UNKNOWN)
{
    ARM_COMPUTE_ASSERT(is_data_type_quantized_asymmetric(data_type_a));
-     ARM_COMPUTE_ASSERT(num_parallel_runs > 1 ? run_twice == false : true);

    // If unknown, set to sensible defaults
    if (data_type_output == DataType::UNKNOWN) {
        data_type_output = output_stage.type == GEMMLowpOutputStageType::NONE ? DataType::S32 : data_type_a;
    }

    // Create tensors
-     TensorType a[NUM_THREADS];
-     TensorType b[NUM_THREADS];
-     TensorType output[NUM_THREADS];
-     TensorType *out_ptrs[NUM_THREADS];
-     TensorType bias[NUM_THREADS];
-
-     for (int i = 0; i < num_parallel_runs; ++i){
-         a[i] = create_tensor<TensorType>(shape_a, data_type_a, 1, dynamic_qinfo ? QuantizationInfo(1.0, 0, true) : a_qinfo);
-         b[i] = create_tensor<TensorType>(shape_b, data_type_b, 1, dynamic_qinfo ? QuantizationInfo(1.0, 0, true) : b_qinfo); // gemm output before output stage mismatch if i pass data_layout_output here. to be investigated
-         output[i] = create_tensor<TensorType>(shape_output, data_type_output, 1, output_qinfo /* output_qinfo will be ignored when output stage type is None */);
-         out_ptrs[i] = &output[i];
+     TensorType a = create_tensor<TensorType>(shape_a, data_type_a, 1, dynamic_qinfo ? QuantizationInfo(1.0, 0, true) : a_qinfo);
+     TensorType b = create_tensor<TensorType>(shape_b, data_type_b, 1, dynamic_qinfo ? QuantizationInfo(1.0, 0, true) : b_qinfo); // gemm output before output stage mismatch if i pass data_layout_output here. to be investigated
+     TensorType output = create_tensor<TensorType>(shape_output, data_type_output, 1, output_qinfo /* output_qinfo will be ignored when output stage type is None */);
+     TensorType bias;

-         if (is_fused)
-         {
-             TensorShape bias_shape(shape_b[0]);
-             bias[i] = create_tensor<TensorType>(bias_shape, data_type_output == DataType::F32 ? DataType::F32 : DataType::S32, 1);
-         }
+     if (is_fused)
+     {
+         TensorShape bias_shape(shape_b[0]);
+         bias = create_tensor<TensorType>(bias_shape, data_type_output == DataType::F32 ? DataType::F32 : DataType::S32, 1);
    }

    // Create and configure function
    // The GEMMinfo includes the values of the depth in case of reinterpreted 3d input/output
    FunctionType gemmlowp;
-     gemmlowp.configure(a[0].info(), b[0].info(), is_fused ? bias[0].info() : nullptr, out_ptrs[0]->info(), GEMMInfo(false, false, reshape_b_only_on_first_run, (reinterpret_output_as_3d ? shape_output[2] : 0), reinterpret_input_as_3d, false,
+     gemmlowp.configure(a.info(), b.info(), is_fused ? bias.info() : nullptr, output.info(), GEMMInfo(false, false, reshape_b_only_on_first_run, (reinterpret_output_as_3d ? shape_output[2] : 0), reinterpret_input_as_3d, false,
                       output_stage, false /* fp_mixed_precision */, false /* fast_math */, false /* broadcast_bias */,
                       arm_compute::ActivationLayerInfo(), false /* fixed_format */, arm_compute::WeightFormat::UNSPECIFIED,
                       false /* pretranspose_B */, accumulate));

-     for (int i = 0; i < num_parallel_runs; ++i)
+     // If the QuantizationInfo is dynamic, it needs to be settable after configure (note that we also force it to be dynamic)
+     if (dynamic_qinfo)
    {
-         // If the QuantizationInfo is dynamic, it needs to be settable after configure (note that we also force it to be dynamic)
-         if (dynamic_qinfo)
-         {
-             a[i].info()->set_quantization_info(QuantizationInfo(a_qinfo.scale(), a_qinfo.offset(), true));
-             b[i].info()->set_quantization_info(QuantizationInfo(b_qinfo.scale(), b_qinfo.offset(), true));
-             output[i].info()->set_quantization_info(QuantizationInfo(output_qinfo.scale(), output_qinfo.offset(), true));
-             gemmlowp.update_quantization_parameters(a[i].info()->quantization_info(),
-                                                     b[i].info()->quantization_info(),
-                                                     output[i].info()->quantization_info(),
-                                                     data_type_output,
-                                                     true, true);
-         }
+         a.info()->set_quantization_info(QuantizationInfo(a_qinfo.scale(), a_qinfo.offset(), true));
+         b.info()->set_quantization_info(QuantizationInfo(b_qinfo.scale(), b_qinfo.offset(), true));
+         output.info()->set_quantization_info(QuantizationInfo(output_qinfo.scale(), output_qinfo.offset(), true));
+         gemmlowp.update_quantization_parameters(a.info()->quantization_info(),
+                                                 b.info()->quantization_info(),
+                                                 output.info()->quantization_info(),
+                                                 data_type_output,
+                                                 true, true);
+     }

-         ARM_COMPUTE_ASSERT(a[i].info()->is_resizable());
-         ARM_COMPUTE_ASSERT(b[i].info()->is_resizable());
-         ARM_COMPUTE_ASSERT(output[i].info()->is_resizable());
+     ARM_COMPUTE_ASSERT(a.info()->is_resizable());
+     ARM_COMPUTE_ASSERT(b.info()->is_resizable());
+     ARM_COMPUTE_ASSERT(output.info()->is_resizable());

-         add_padding_x({ &a[i], &b[i], &output[i] });
+     add_padding_x({ &a, &b, &output });

-         // Allocate tensors
-         a[i].allocator()->allocate();
-         b[i].allocator()->allocate();
-         output[i].allocator()->allocate();
+     // Allocate tensors
+     a.allocator()->allocate();
+     b.allocator()->allocate();
+     output.allocator()->allocate();

-         ARM_COMPUTE_ASSERT(!a[i].info()->is_resizable());
-         ARM_COMPUTE_ASSERT(!b[i].info()->is_resizable());
-         ARM_COMPUTE_ASSERT(!output[i].info()->is_resizable());
-     }
+     ARM_COMPUTE_ASSERT(!a.info()->is_resizable());
+     ARM_COMPUTE_ASSERT(!b.info()->is_resizable());
+     ARM_COMPUTE_ASSERT(!output.info()->is_resizable());

-     ITensorPack pack[NUM_THREADS];
+     ITensorPack pack =
+     {
+         { arm_compute::TensorType::ACL_SRC_0, &a },
+         { arm_compute::TensorType::ACL_SRC_1, &b },
+         { arm_compute::TensorType::ACL_DST, &output }
+     };

- #ifndef BARE_METAL
-     std::vector<std::thread> threads;
+     // Fill tensors
+     fill_quantized(AccessorType(a), 0 + finfo.hash);
+     fill_quantized(AccessorType(b), 1 + finfo.hash);

-     if (num_parallel_runs > 1)
+     if (accumulate)
    {
-         threads.reserve(num_parallel_runs);
+         ARM_COMPUTE_ASSERT(accumulate != run_twice);
+         fill(AccessorType(output), 6 + finfo.hash, finfo.min_output, finfo.max_output);
    }
- #endif // ifndef BARE_METAL

-     for (int i = 0; i < num_parallel_runs; ++i)
+     if (is_fused)
    {
-         // these are newly created every call of this lambda function
-         pack[i] =
-         {
-             { arm_compute::TensorType::ACL_SRC_0, &a[i] },
-             { arm_compute::TensorType::ACL_SRC_1, &b[i] },
-             { arm_compute::TensorType::ACL_DST, out_ptrs[i] }
-         };
+         ARM_COMPUTE_ASSERT(bias.info()->is_resizable());
+         bias.allocator()->allocate();
+         ARM_COMPUTE_ASSERT(!bias.info()->is_resizable());
+         fill(AccessorType(bias), 2 + finfo.hash, finfo.min_bias, finfo.max_bias);
+         pack.add_tensor(arm_compute::TensorType::ACL_SRC_2, &bias);
+     }

-         // Fill tensors
-         fill_quantized(AccessorType(a[i]), 0 + finfo.hash);
-         fill_quantized(AccessorType(b[i]), 1 + finfo.hash);
+     auto mg = MemoryGroup{};
+     auto ws = manage_workspace<Tensor>(gemmlowp.workspace(), mg, pack, pack);

-         if (accumulate)
-         {
-             ARM_COMPUTE_ASSERT(accumulate != run_twice);
-             fill(AccessorType(output[i]), 6 + finfo.hash, finfo.min_output, finfo.max_output);
-         }
+     // Run with variable inputs.
+     if (run_twice)
+     {
+         gemmlowp.run(pack);
+         fill_quantized(AccessorType(a), 3 + finfo.hash); // Fill tensors with new seed after run
+         fill_quantized(AccessorType(b), 4 + finfo.hash);

        if (is_fused)
        {
-             ARM_COMPUTE_ASSERT(bias[i].info()->is_resizable());
-             bias[i].allocator()->allocate();
-             ARM_COMPUTE_ASSERT(!bias[i].info()->is_resizable());
-             fill(AccessorType(bias[i]), 2 + finfo.hash, finfo.min_bias, finfo.max_bias);
-             pack[i].add_tensor(arm_compute::TensorType::ACL_SRC_2, &bias[i]);
-         }
-
-         // Run with variable inputs.
-         if (run_twice)
-         {
-             auto mg = MemoryGroup{};
-             auto ws = manage_workspace<Tensor>(gemmlowp.workspace(), mg, pack[i], pack[i]);
-
-             gemmlowp.run(pack[i]);
-             fill_quantized(AccessorType(a[i]), 3 + finfo.hash); // Fill tensors with new seed after run
-             fill_quantized(AccessorType(b[i]), 4 + finfo.hash);
-             if (is_fused)
-             {
-                 fill(AccessorType(bias[i]), 5 + finfo.hash, finfo.min_bias, finfo.max_bias);
-             }
-         }
-
-         // Compute GEMM function
- #ifndef BARE_METAL
-         if (num_parallel_runs > 1)
-         {
-             threads.emplace_back([&, i]
-             {
-                 auto mg = MemoryGroup{};
-                 auto ws = manage_workspace<Tensor>(gemmlowp.workspace(), mg, pack[i], pack[i]);
-
-                 gemmlowp.run(pack[i]);
-                 targets[i] = std::move(*(out_ptrs[i]));
-             });
-         }
-         else
- #endif // ifndef BARE_METAL
-         {
-             auto mg = MemoryGroup{};
-             auto ws = manage_workspace<Tensor>(gemmlowp.workspace(), mg, pack[i], pack[i]);
-
-             gemmlowp.run(pack[i]);
-             targets[i] = std::move(*(out_ptrs[i]));
+             fill(AccessorType(bias), 5 + finfo.hash, finfo.min_bias, finfo.max_bias);
        }
    }

- #ifndef BARE_METAL
-     if (num_parallel_runs > 1)
-     {
-         for (int i = 0; i < num_parallel_runs; ++i)
-         {
-             threads[i].join();
-         }
-     }
- #endif // ifndef BARE_METAL
+     // Compute GEMM function
+     gemmlowp.run(pack);
+
+     return output;
}
} // namespace

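For readers skimming the diff, the helper above now follows the plain single-run operator pattern: bind the operands in an ITensorPack, keep the operator's workspace alive, then call run(). The sketch below condenses that pattern; the wrapper name run_once and its template signature are illustrative only, while every call inside it (ITensorPack, MemoryGroup, manage_workspace, workspace(), run()) is taken directly from the hunk above and assumes the same headers and using-declarations as the fixture file.

// Minimal sketch of the single-run pattern used by compute_cpugemmlowp_target.
// run_once is a hypothetical wrapper; FunctionType/TensorType mirror the
// fixture's template parameters.
template <typename TensorType, typename FunctionType>
void run_once(FunctionType &gemmlowp, TensorType &a, TensorType &b, TensorType &output)
{
    // Bind the operands by accessor ID, exactly as the helper does.
    ITensorPack pack =
    {
        { arm_compute::TensorType::ACL_SRC_0, &a },
        { arm_compute::TensorType::ACL_SRC_1, &b },
        { arm_compute::TensorType::ACL_DST, &output }
    };

    // Auxiliary tensors requested by the operator must stay alive across run().
    auto mg = MemoryGroup{};
    auto ws = manage_workspace<Tensor>(gemmlowp.workspace(), mg, pack, pack);

    gemmlowp.run(pack);
}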
@@ -219,28 +160,23 @@ class CpuGEMMLowpMatrixMultiplyCoreValidationFixture : protected GEMMLowpGeneric

        bool accumulate = false;
        bool dynamic_qinfo = false;
-         this->_num_parallel_runs = 1;
-         compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, accumulate, dynamic_qinfo);
-         this->_references[0] = this->compute_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, accumulate);
+         this->_target = compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, accumulate, dynamic_qinfo);
+         this->_reference = this->compute_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, accumulate);
    }

protected:
-     void compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const TensorFillInfo& finfo, const bool accumulate, const bool dynamic_qinfo)
+     TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const TensorFillInfo& finfo, const bool accumulate, const bool dynamic_qinfo)
    {
        const auto output_qinfo = QuantizationInfo(); // No output stage
-         compute_cpugemmlowp_target<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, int32_t, false, run_twice>(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo, DataType::QASYMM8, DataType::QASYMM8, GEMMLowpOutputStageInfo(), false, finfo, accumulate, dynamic_qinfo, DataType::UNKNOWN, this->_num_parallel_runs, this->_targets);
+         return compute_cpugemmlowp_target<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, int32_t, false, run_twice>(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo, DataType::QASYMM8, DataType::QASYMM8, GEMMLowpOutputStageInfo(), false, finfo, accumulate, dynamic_qinfo, DataType::UNKNOWN);
    }
-
-     int _num_parallel_runs{};
-     TensorType _targets[NUM_THREADS];
-     SimpleTensor<int32_t> _references[NUM_THREADS];
};

template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, bool run_twice = false>
class CpuGEMMLowpStaticQuantMatrixMultiplyCoreValidationFixture : protected CpuGEMMLowpMatrixMultiplyCoreValidationFixture<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, run_twice>
{
public:
-     void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset, DataType data_type, bool is_multithreaded)
+     void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset, DataType data_type)
    {
        ARM_COMPUTE_ASSERT(data_type == DataType::QASYMM8_SIGNED || data_type == DataType::QASYMM8);
        const auto a_qinfo = QuantizationInfo(1.0f / 255, a_offset);
@@ -249,30 +185,26 @@ class CpuGEMMLowpStaticQuantMatrixMultiplyCoreValidationFixture : protected CpuG

        bool accumulate = false;
        bool dynamic_qinfo = true;
-         this->_num_parallel_runs = is_multithreaded ? NUM_THREADS : 1;
-         compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, accumulate, dynamic_qinfo, data_type);
-         compute_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, data_type);
+         this->_target = compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, accumulate, dynamic_qinfo, data_type);
+         this->_reference = compute_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, data_type);
    }

protected:
-     void compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const TensorFillInfo& finfo, const bool accumulate, const bool dynamic_qinfo, const DataType data_type)
+     TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const TensorFillInfo& finfo, const bool accumulate, const bool dynamic_qinfo, const DataType data_type)
    {
        const auto output_qinfo = QuantizationInfo(a_qinfo.scale(), a_qinfo.offset()); // No output stage
-         compute_cpugemmlowp_target<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, int32_t, false, run_twice>(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo, data_type, data_type, GEMMLowpOutputStageInfo(), false, finfo, accumulate, dynamic_qinfo, DataType::UNKNOWN, this->_num_parallel_runs, this->_targets);
+         return compute_cpugemmlowp_target<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, int32_t, false, run_twice>(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo, data_type, data_type, GEMMLowpOutputStageInfo(), false, finfo, accumulate, dynamic_qinfo, DataType::UNKNOWN);
    }

-     void compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const TensorFillInfo& finfo, const DataType data_type)
+     SimpleTensor<int32_t> compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const TensorFillInfo& finfo, const DataType data_type)
    {
-         for (int i = 0; i < this->_num_parallel_runs; ++i)
+         if (data_type == DataType::QASYMM8)
+         {
+             return compute_gemmlowp_reference<reinterpret_input_as_3d, uint8_t, uint8_t, false, false, run_twice>(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, data_type, data_type, finfo);
+         }
+         else
        {
-             if (data_type == DataType::QASYMM8)
-             {
-                 this->_references[i] = compute_gemmlowp_reference<reinterpret_input_as_3d, uint8_t, uint8_t, false, false, run_twice>(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, data_type, data_type, finfo);
-             }
-             else
-             {
-                 this->_references[i] = compute_gemmlowp_reference<reinterpret_input_as_3d, int8_t, int8_t, false, false, run_twice>(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, data_type, data_type, finfo);
-             }
+             return compute_gemmlowp_reference<reinterpret_input_as_3d, int8_t, int8_t, false, false, run_twice>(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, data_type, data_type, finfo);
        }
    }
};
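As a hedged usage note: after this change the fixtures assume their validation-fixture base holds a single _target/_reference pair, which setup() fills and a test case later compares. The sketch below only spells out that assumed state; the member names and types are inferred from the this->_target / this->_reference assignments and the new return types in the hunks above, not copied from this patch.

// Assumed shape of the base-fixture state the refactor relies on (inferred
// from the assignments above; reproduced here purely for illustration).
template <typename TensorType>
class ValidationFixtureStateSketch
{
protected:
    TensorType            _target{};    // filled by compute_target()
    SimpleTensor<int32_t> _reference{}; // filled by compute_reference()
};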