Commit 776723d

nioroso-x3 authored and tevador committed
POWER7+ VSX support plus AES hardware support for POWER8 and newer. (#41)
1 parent 8ff1bf0 commit 776723d

2 files changed: +226 -0

makefile (+8)

@@ -22,6 +22,14 @@ ifeq ($(PLATFORM),x86_64)
 CXXFLAGS += -maes
 endif
 
+ifeq ($(PLATFORM),ppc64)
+CXXFLAGS += -mcpu=native
+endif
+
+ifeq ($(PLATFORM),ppc64le)
+CXXFLAGS += -mcpu=native
+endif
+
 release: CXXFLAGS += -O3 -flto
 release: CCFLAGS += -O3 -flto
 release: LDFLAGS += -flto
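For reference, `-mcpu=native` lets GCC target the build host, which on POWER7 and newer also defines the feature macros (`__ALTIVEC__`, `__VSX__`, and `__CRYPTO__` on POWER8 and newer) that the new code in src/intrin_portable.h tests. The following minimal probe is not part of this patch and the file name is only an example; it can confirm which path a given toolchain would compile:

// probe_ppc_features.cpp -- hypothetical helper, not part of this commit.
// Build with the same flag the makefile adds, e.g.: g++ -mcpu=native probe_ppc_features.cpp
#include <cstdio>

int main() {
#if defined(__PPC64__) && defined(__ALTIVEC__) && defined(__VSX__)
	std::puts("VSX path available (POWER7 or newer): the SIMD implementations will be used");
#else
	std::puts("VSX path not available: the portable fallback will be used");
#endif
#if defined(__CRYPTO__)
	std::puts("Hardware AES available (POWER8 or newer)");
#else
	std::puts("No hardware AES: rx_aesenc/rx_aesdec would throw at runtime");
#endif
	return 0;
}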

src/intrin_portable.h (+218)

@@ -160,7 +160,225 @@ FORCE_INLINE void rx_set_rounding_mode(uint32_t mode) {
 	_mm_setcsr(rx_mxcsr_default | (mode << 13));
 }
 
+#elif defined(__PPC64__) && defined(__ALTIVEC__) && defined(__VSX__) //sadly, only POWER7 and newer can use SIMD acceleration; earlier processors can't use doubles or 64-bit integers with SIMD
+#include <cstdint>
+#include <stdexcept>
+#include <cstdlib>
+#include <altivec.h>
+#undef vector
+#undef pixel
+#undef bool
+
+typedef __vector uint8_t __m128i;
+typedef __vector uint32_t __m128l;
+typedef __vector int __m128li;
+typedef __vector uint64_t __m128ll;
+typedef __vector double __m128d;
+
+typedef __m128i rx_vec_i128;
+typedef __m128d rx_vec_f128;
+typedef union{
+	rx_vec_i128 i;
+	rx_vec_f128 d;
+	uint64_t u64[2];
+	double d64[2];
+	uint32_t u32[4];
+	int i32[4];
+} vec_u;
+
+#define rx_aligned_alloc(a, b) malloc(a)
+#define rx_aligned_free(a) free(a)
+#define rx_prefetch_nta(x)
+
+
+/* Splat 64-bit long long to 2 64-bit long longs */
+FORCE_INLINE __m128i vec_splat2sd (int64_t scalar)
+{ return (__m128i) vec_splats (scalar); }
+
+FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+	return (rx_vec_f128)vec_vsx_ld(0,pd);
+#else
+	vec_u t;
+	t.u64[0] = load64(pd + 0);
+	t.u64[1] = load64(pd + 1);
+	return (rx_vec_f128)t.d;
+#endif
+}
+
+FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 a) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+	vec_vsx_st(a,0,(rx_vec_f128*)mem_addr);
+#else
+	vec_u _a;
+	_a.d = a;
+	store64(mem_addr + 0, _a.u64[0]);
+	store64(mem_addr + 1, _a.u64[1]);
+#endif
+}
+
+FORCE_INLINE rx_vec_f128 rx_swap_vec_f128(rx_vec_f128 a) {
+	return (rx_vec_f128)vec_perm((__m128i)a,(__m128i)a,(__m128i){8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7});
+}
+
+FORCE_INLINE rx_vec_f128 rx_add_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return (rx_vec_f128)vec_add(a,b);
+}
+
+FORCE_INLINE rx_vec_f128 rx_sub_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return (rx_vec_f128)vec_sub(a,b);
+}
+
+FORCE_INLINE rx_vec_f128 rx_mul_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return (rx_vec_f128)vec_mul(a,b);
+}
+
+FORCE_INLINE rx_vec_f128 rx_div_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return (rx_vec_f128)vec_div(a,b);
+}
+
+FORCE_INLINE rx_vec_f128 rx_sqrt_vec_f128(rx_vec_f128 a) {
+	return (rx_vec_f128)vec_sqrt(a);
+}
+
+FORCE_INLINE rx_vec_i128 rx_set1_long_vec_i128(uint64_t a) {
+	return (rx_vec_i128)vec_splat2sd(a);
+}
+
+FORCE_INLINE rx_vec_f128 rx_vec_i128_vec_f128(rx_vec_i128 a) {
+	return (rx_vec_f128)a;
+}
+
+FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) {
+	return (rx_vec_f128)(__m128ll){x0,x1};
+}
+
+FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
+	return (rx_vec_f128)vec_splat2sd(x);
+}
+
+FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return (rx_vec_f128)vec_xor(a,b);
+}
+
+FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return (rx_vec_f128)vec_and(a,b);
+}
+
+FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return (rx_vec_f128)vec_or(a,b);
+}
+#if defined(__CRYPTO__)
+
+FORCE_INLINE __m128ll vrev(__m128i v){
+#if defined(NATIVE_LITTLE_ENDIAN)
+	return (__m128ll)vec_perm((__m128i)v,(__m128i){0},(__m128i){15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0});
 #else
+	return (__m128ll)vec_perm((__m128i)v,(__m128i){0},(__m128i){3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12});
+#endif
+}
+
+FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
+	__m128ll _v = vrev(v);
+	__m128ll _rkey = vrev(rkey);
+	__m128ll result = vrev((__m128i)__builtin_crypto_vcipher(_v,_rkey));
+	return (rx_vec_i128)result;
+}
+
+FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
+	__m128ll _v = vrev(v);
+	__m128ll zero = (__m128ll){0};
+	__m128ll out = vrev((__m128i)__builtin_crypto_vncipher(_v,zero));
+	return (rx_vec_i128)vec_xor((__m128i)out,rkey);
+}
+#else
+static const char* platformError = "Platform doesn't support hardware AES";
+
+FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
+	throw std::runtime_error(platformError);
+}
+
+FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
+	throw std::runtime_error(platformError);
+}
+#endif
+
+
+FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) {
+	vec_u _a;
+	_a.i = a;
+	return _a.i32[0];
+}
+
+FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) {
+	vec_u _a;
+	_a.i = a;
+	return _a.i32[1];
+}
+
+FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) {
+	vec_u _a;
+	_a.i = a;
+	return _a.i32[2];
+}
+
+FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) {
+	vec_u _a;
+	_a.i = a;
+	return _a.i32[3];
+}
+
+FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int _I3, int _I2, int _I1, int _I0) {
+	return (rx_vec_i128)((__m128li){_I0,_I1,_I2,_I3});
+};
+
+FORCE_INLINE rx_vec_i128 rx_xor_vec_i128(rx_vec_i128 _A, rx_vec_i128 _B) {
+	return (rx_vec_i128)vec_xor(_A,_B);
+}
+
+FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const *_P) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+	return *_P;
+#else
+	uint32_t* ptr = (uint32_t*)_P;
+	vec_u c;
+	c.u32[0] = load32(ptr + 0);
+	c.u32[1] = load32(ptr + 1);
+	c.u32[2] = load32(ptr + 2);
+	c.u32[3] = load32(ptr + 3);
+	return (rx_vec_i128)c.i;
+#endif
+}
+
+FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *_P, rx_vec_i128 _B) {
+#if defined(NATIVE_LITTLE_ENDIAN)
+	*_P = _B;
+#else
+	uint32_t* ptr = (uint32_t*)_P;
+	vec_u B;
+	B.i = _B;
+	store32(ptr + 0, B.u32[0]);
+	store32(ptr + 1, B.u32[1]);
+	store32(ptr + 2, B.u32[2]);
+	store32(ptr + 3, B.u32[3]);
+#endif
+}
+
+FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
+	vec_u x;
+	x.d64[0] = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0));
+	x.d64[1] = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4));
+	return (rx_vec_f128)x.d;
+}
+
+#define RANDOMX_DEFAULT_FENV
+
+void rx_reset_float_state();
+
+void rx_set_rounding_mode(uint32_t mode);
+
+#else //end altivec
+
 #include <cstdint>
 #include <stdexcept>
 #include <cstdlib>
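The patch keeps the portable rx_* interface unchanged and only swaps in VSX-backed implementations, so existing callers are untouched. A rough usage sketch, assuming the header is included the same way as elsewhere in the tree (it relies on FORCE_INLINE, load64/store64 and the endianness macros defined by the surrounding RandomX headers); the raw constants are simply 1.0 and 2.0 as IEEE-754 bit patterns, which is the form rx_set_vec_f128 expects:

// Hypothetical sketch, not part of this commit: exercising the new
// implementations through the portable rx_* interface.
#include "intrin_portable.h"
#include <cstdio>

int main() {
	double in[2] = {1.5, 2.5};
	rx_vec_f128 a = rx_load_vec_f128(in);                    // a = {1.5, 2.5}
	rx_vec_f128 b = rx_set_vec_f128(0x4000000000000000ULL,   // x1 = bits of 2.0
	                                0x3FF0000000000000ULL);  // x0 = bits of 1.0
	rx_vec_f128 sum = rx_add_vec_f128(a, b);                 // {2.5, 4.5}
	double out[2];
	rx_store_vec_f128(out, rx_swap_vec_f128(sum));           // swap lanes, then store
	std::printf("%.1f %.1f\n", out[0], out[1]);              // expect: 4.5 2.5
	return 0;
}

On the AES side, the vrev helper appears to reorder bytes so that vcipher/vncipher, which operate on the AES state in the register byte order POWER expects, produce the same results as x86 AESENC/AESDEC on the little-endian layout the rest of RandomX assumes; the zero key passed to vncipher plus the explicit vec_xor afterwards presumably compensates for vncipher mixing the round key in at a different point of the round than AESDEC does.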
