@@ -160,7 +160,225 @@ FORCE_INLINE void rx_set_rounding_mode(uint32_t mode) {
 	_mm_setcsr(rx_mxcsr_default | (mode << 13));
 }
 
+#elif defined(__PPC64__) && defined(__ALTIVEC__) && defined(__VSX__) // sadly, only POWER7 and newer can use SIMD acceleration; earlier processors can't use doubles or 64-bit integers with SIMD
+#include <cstdint>
+#include <stdexcept>
+#include <cstdlib>
+#include <altivec.h>
+#undef vector
+#undef pixel
+#undef bool
+
+typedef __vector uint8_t __m128i;
+typedef __vector uint32_t __m128l;
+typedef __vector int __m128li;
+typedef __vector uint64_t __m128ll;
+typedef __vector double __m128d;
+
+typedef __m128i rx_vec_i128;
+typedef __m128d rx_vec_f128;
+typedef union {
+	rx_vec_i128 i;
+	rx_vec_f128 d;
+	uint64_t u64[2];
+	double d64[2];
+	uint32_t u32[4];
+	int i32[4];
+} vec_u;
+
+#define rx_aligned_alloc(a, b) malloc(a)
+#define rx_aligned_free(a) free(a)
+#define rx_prefetch_nta(x)
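+// Note: the requested alignment (the second argument of rx_aligned_alloc) is ignored here,
+// since plain malloc is used, and rx_prefetch_nta is a no-op on this platform.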
+
+
+/* Splat 64-bit long long to 2 64-bit long longs */
+FORCE_INLINE __m128i vec_splat2sd(int64_t scalar)
+{ return (__m128i)vec_splats(scalar); }
+
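+// The big-endian branches below go through load64/store64 instead of direct vector
+// loads and stores, presumably so that the in-memory byte order stays identical to the
+// little-endian layout the rest of RandomX expects.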
+FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+	return (rx_vec_f128)vec_vsx_ld(0, pd);
+#else
+	vec_u t;
+	t.u64[0] = load64(pd + 0);
+	t.u64[1] = load64(pd + 1);
+	return (rx_vec_f128)t.d;
+#endif
+}
+
+FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 a) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+	vec_vsx_st(a, 0, (rx_vec_f128*)mem_addr);
+#else
+	vec_u _a;
+	_a.d = a;
+	store64(mem_addr + 0, _a.u64[0]);
+	store64(mem_addr + 1, _a.u64[1]);
+#endif
+}
+
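+// Swaps the two 64-bit lanes of the vector by permuting the upper and lower 8-byte halves.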
+FORCE_INLINE rx_vec_f128 rx_swap_vec_f128(rx_vec_f128 a) {
+	return (rx_vec_f128)vec_perm((__m128i)a, (__m128i)a, (__m128i){8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7});
+}
+
+FORCE_INLINE rx_vec_f128 rx_add_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return (rx_vec_f128)vec_add(a, b);
+}
+
+FORCE_INLINE rx_vec_f128 rx_sub_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return (rx_vec_f128)vec_sub(a, b);
+}
+
+FORCE_INLINE rx_vec_f128 rx_mul_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return (rx_vec_f128)vec_mul(a, b);
+}
+
+FORCE_INLINE rx_vec_f128 rx_div_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return (rx_vec_f128)vec_div(a, b);
+}
+
+FORCE_INLINE rx_vec_f128 rx_sqrt_vec_f128(rx_vec_f128 a) {
+	return (rx_vec_f128)vec_sqrt(a);
+}
+
+FORCE_INLINE rx_vec_i128 rx_set1_long_vec_i128(uint64_t a) {
+	return (rx_vec_i128)vec_splat2sd(a);
+}
+
+FORCE_INLINE rx_vec_f128 rx_vec_i128_vec_f128(rx_vec_i128 a) {
+	return (rx_vec_f128)a;
+}
+
+FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) {
+	return (rx_vec_f128)(__m128ll){x0, x1};
+}
+
+FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
+	return (rx_vec_f128)vec_splat2sd(x);
+}
+
+FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return (rx_vec_f128)vec_xor(a, b);
+}
+
+FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return (rx_vec_f128)vec_and(a, b);
+}
+
+FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return (rx_vec_f128)vec_or(a, b);
+}
+#if defined(__CRYPTO__)
+
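+// Byte-order helper for the AES builtins: rx_aesenc/rx_aesdec apply it to their inputs and
+// to the result of vcipher/vncipher, intended to keep the output consistent with the x86
+// AES-NI path (full 16-byte reversal on little-endian, per-32-bit-word reversal on big-endian).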
+FORCE_INLINE __m128ll vrev(__m128i v) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+	return (__m128ll)vec_perm((__m128i)v, (__m128i){0}, (__m128i){15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0});
 #else
+	return (__m128ll)vec_perm((__m128i)v, (__m128i){0}, (__m128i){3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12});
+#endif
+}
+
+FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
+	__m128ll _v = vrev(v);
+	__m128ll _rkey = vrev(rkey);
+	__m128ll result = vrev((__m128i)__builtin_crypto_vcipher(_v, _rkey));
+	return (rx_vec_i128)result;
+}
+
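+// For the inverse round, vncipher is given an all-zero round key and the real round key is
+// XORed in afterwards, so the key addition lands after the inverse MixColumns step, matching
+// the x86 aesdec round ordering.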
+FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
+	__m128ll _v = vrev(v);
+	__m128ll zero = (__m128ll){0};
+	__m128ll out = vrev((__m128i)__builtin_crypto_vncipher(_v, zero));
+	return (rx_vec_i128)vec_xor((__m128i)out, rkey);
+}
+#else
+static const char* platformError = "Platform doesn't support hardware AES";
+
+FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
+	throw std::runtime_error(platformError);
+}
+
+FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
+	throw std::runtime_error(platformError);
+}
+#endif
+
+
+FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) {
+	vec_u _a;
+	_a.i = a;
+	return _a.i32[0];
+}
+
+FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) {
+	vec_u _a;
+	_a.i = a;
+	return _a.i32[1];
+}
+
+FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) {
+	vec_u _a;
+	_a.i = a;
+	return _a.i32[2];
+}
+
+FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) {
+	vec_u _a;
+	_a.i = a;
+	return _a.i32[3];
+}
+
+FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int _I3, int _I2, int _I1, int _I0) {
+	return (rx_vec_i128)((__m128li){_I0, _I1, _I2, _I3});
+}
+
+FORCE_INLINE rx_vec_i128 rx_xor_vec_i128(rx_vec_i128 _A, rx_vec_i128 _B) {
+	return (rx_vec_i128)vec_xor(_A, _B);
+}
+
+FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const* _P) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+	return *_P;
+#else
+	uint32_t* ptr = (uint32_t*)_P;
+	vec_u c;
+	c.u32[0] = load32(ptr + 0);
+	c.u32[1] = load32(ptr + 1);
+	c.u32[2] = load32(ptr + 2);
+	c.u32[3] = load32(ptr + 3);
+	return (rx_vec_i128)c.i;
+#endif
+}
+
+FORCE_INLINE void rx_store_vec_i128(rx_vec_i128* _P, rx_vec_i128 _B) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+	*_P = _B;
+#else
+	uint32_t* ptr = (uint32_t*)_P;
+	vec_u B;
+	B.i = _B;
+	store32(ptr + 0, B.u32[0]);
+	store32(ptr + 1, B.u32[1]);
+	store32(ptr + 2, B.u32[2]);
+	store32(ptr + 3, B.u32[3]);
+#endif
+}
+
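+// Loads two packed 32-bit integers from addr, reinterprets them as signed values and
+// converts each one to a double (element 0 from the first word, element 1 from the second).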
+FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
+	vec_u x;
+	x.d64[0] = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0));
+	x.d64[1] = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4));
+	return (rx_vec_f128)x.d;
+}
+
+#define RANDOMX_DEFAULT_FENV
+
+void rx_reset_float_state();
+
+void rx_set_rounding_mode(uint32_t mode);
+
+#else // end altivec
+
 #include <cstdint>
 #include <stdexcept>
 #include <cstdlib>