Skip to content

Commit

Permalink
[WebAssembly] Add intrinsics to wasm_simd128.h for all FP16 instructions (llvm#106465)
Browse files Browse the repository at this point in the history

Getting this to work required a few additional changes:
- Add builtins for any instructions that can't be done with plain C
currently.
- Add support for the saturating versions of fp_to_<s,u>int for I16x8
(signed and unsigned). Other vector sizes supported this already.
- Support bitcast of f16x8 to v128. Needed to return a __f16x8 as
v128_t.
  • Loading branch information
brendandahl authored Aug 30, 2024
1 parent 206b5af commit 5703d85
Show file tree
Hide file tree
Showing 7 changed files with 348 additions and 13 deletions.
9 changes: 9 additions & 0 deletions clang/include/clang/Basic/BuiltinsWebAssembly.def
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ TARGET_BUILTIN(__builtin_wasm_bitmask_i16x8, "UiV8s", "nc", "simd128")
TARGET_BUILTIN(__builtin_wasm_bitmask_i32x4, "UiV4i", "nc", "simd128")
TARGET_BUILTIN(__builtin_wasm_bitmask_i64x2, "UiV2LLi", "nc", "simd128")

// Vector absolute-value builtins. The f16x8 variant is gated on the "fp16"
// target feature; the f32x4 and f64x2 variants are gated on "simd128".
TARGET_BUILTIN(__builtin_wasm_abs_f16x8, "V8hV8h", "nc", "fp16")
TARGET_BUILTIN(__builtin_wasm_abs_f32x4, "V4fV4f", "nc", "simd128")
TARGET_BUILTIN(__builtin_wasm_abs_f64x2, "V2dV2d", "nc", "simd128")

Expand All @@ -140,6 +141,10 @@ TARGET_BUILTIN(__builtin_wasm_max_f16x8, "V8hV8hV8h", "nc", "fp16")
TARGET_BUILTIN(__builtin_wasm_pmin_f16x8, "V8hV8hV8h", "nc", "fp16")
TARGET_BUILTIN(__builtin_wasm_pmax_f16x8, "V8hV8hV8h", "nc", "fp16")

// f16x8 rounding builtins (ceil, floor, trunc, nearest). All four share the
// "V8hV8h" signature string and are gated on the "fp16" target feature.
TARGET_BUILTIN(__builtin_wasm_ceil_f16x8, "V8hV8h", "nc", "fp16")
TARGET_BUILTIN(__builtin_wasm_floor_f16x8, "V8hV8h", "nc", "fp16")
TARGET_BUILTIN(__builtin_wasm_trunc_f16x8, "V8hV8h", "nc", "fp16")
TARGET_BUILTIN(__builtin_wasm_nearest_f16x8, "V8hV8h", "nc", "fp16")
TARGET_BUILTIN(__builtin_wasm_ceil_f32x4, "V4fV4f", "nc", "simd128")
TARGET_BUILTIN(__builtin_wasm_floor_f32x4, "V4fV4f", "nc", "simd128")
TARGET_BUILTIN(__builtin_wasm_trunc_f32x4, "V4fV4f", "nc", "simd128")
Expand All @@ -151,9 +156,13 @@ TARGET_BUILTIN(__builtin_wasm_nearest_f64x2, "V2dV2d", "nc", "simd128")

TARGET_BUILTIN(__builtin_wasm_dot_s_i32x4_i16x8, "V4iV8sV8s", "nc", "simd128")

// Vector square-root builtins. The f16x8 variant is gated on the "fp16"
// target feature; the f32x4 and f64x2 variants are gated on "simd128".
TARGET_BUILTIN(__builtin_wasm_sqrt_f16x8, "V8hV8h", "nc", "fp16")
TARGET_BUILTIN(__builtin_wasm_sqrt_f32x4, "V4fV4f", "nc", "simd128")
TARGET_BUILTIN(__builtin_wasm_sqrt_f64x2, "V2dV2d", "nc", "simd128")

// Saturating f16x8 -> i16x8 truncation, signed and unsigned. These consume a
// half-precision vector, so they are gated on "fp16" like every other f16x8
// builtin in this table (the header wrappers that call them are already
// compiled with __target__("fp16")), not on plain "simd128".
TARGET_BUILTIN(__builtin_wasm_trunc_saturate_s_i16x8_f16x8, "V8sV8h", "nc", "fp16")
TARGET_BUILTIN(__builtin_wasm_trunc_saturate_u_i16x8_f16x8, "V8sV8h", "nc", "fp16")

TARGET_BUILTIN(__builtin_wasm_trunc_saturate_s_i32x4_f32x4, "V4iV4f", "nc", "simd128")
TARGET_BUILTIN(__builtin_wasm_trunc_saturate_u_i32x4_f32x4, "V4iV4f", "nc", "simd128")

Expand Down
12 changes: 12 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21211,6 +21211,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32_f64:
case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i64_f32:
case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i64_f64:
case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i16x8_f16x8:
case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32x4_f32x4: {
Value *Src = EmitScalarExpr(E->getArg(0));
llvm::Type *ResT = ConvertType(E->getType());
Expand All @@ -21222,6 +21223,7 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32_f64:
case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i64_f32:
case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i64_f64:
case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i16x8_f16x8:
case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32x4_f32x4: {
Value *Src = EmitScalarExpr(E->getArg(0));
llvm::Type *ResT = ConvertType(E->getType());
Expand Down Expand Up @@ -21269,6 +21271,10 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
CGM.getIntrinsic(Intrinsic::wasm_pmax, ConvertType(E->getType()));
return Builder.CreateCall(Callee, {LHS, RHS});
}
case WebAssembly::BI__builtin_wasm_ceil_f16x8:
case WebAssembly::BI__builtin_wasm_floor_f16x8:
case WebAssembly::BI__builtin_wasm_trunc_f16x8:
case WebAssembly::BI__builtin_wasm_nearest_f16x8:
case WebAssembly::BI__builtin_wasm_ceil_f32x4:
case WebAssembly::BI__builtin_wasm_floor_f32x4:
case WebAssembly::BI__builtin_wasm_trunc_f32x4:
Expand All @@ -21279,18 +21285,22 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
case WebAssembly::BI__builtin_wasm_nearest_f64x2: {
unsigned IntNo;
switch (BuiltinID) {
case WebAssembly::BI__builtin_wasm_ceil_f16x8:
case WebAssembly::BI__builtin_wasm_ceil_f32x4:
case WebAssembly::BI__builtin_wasm_ceil_f64x2:
IntNo = Intrinsic::ceil;
break;
case WebAssembly::BI__builtin_wasm_floor_f16x8:
case WebAssembly::BI__builtin_wasm_floor_f32x4:
case WebAssembly::BI__builtin_wasm_floor_f64x2:
IntNo = Intrinsic::floor;
break;
case WebAssembly::BI__builtin_wasm_trunc_f16x8:
case WebAssembly::BI__builtin_wasm_trunc_f32x4:
case WebAssembly::BI__builtin_wasm_trunc_f64x2:
IntNo = Intrinsic::trunc;
break;
case WebAssembly::BI__builtin_wasm_nearest_f16x8:
case WebAssembly::BI__builtin_wasm_nearest_f32x4:
case WebAssembly::BI__builtin_wasm_nearest_f64x2:
IntNo = Intrinsic::nearbyint;
Expand Down Expand Up @@ -21489,12 +21499,14 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
CGM.getIntrinsic(Intrinsic::wasm_bitmask, Vec->getType());
return Builder.CreateCall(Callee, {Vec});
}
case WebAssembly::BI__builtin_wasm_abs_f16x8:
case WebAssembly::BI__builtin_wasm_abs_f32x4:
case WebAssembly::BI__builtin_wasm_abs_f64x2: {
Value *Vec = EmitScalarExpr(E->getArg(0));
Function *Callee = CGM.getIntrinsic(Intrinsic::fabs, Vec->getType());
return Builder.CreateCall(Callee, {Vec});
}
case WebAssembly::BI__builtin_wasm_sqrt_f16x8:
case WebAssembly::BI__builtin_wasm_sqrt_f32x4:
case WebAssembly::BI__builtin_wasm_sqrt_f64x2: {
Value *Vec = EmitScalarExpr(E->getArg(0));
Expand Down
147 changes: 147 additions & 0 deletions clang/lib/Headers/wasm_simd128.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ typedef unsigned long long __u64x2
__attribute__((__vector_size__(16), __aligned__(16)));
typedef float __f32x4 __attribute__((__vector_size__(16), __aligned__(16)));
typedef double __f64x2 __attribute__((__vector_size__(16), __aligned__(16)));
// Vector of __fp16 lanes occupying 16 bytes with 16-byte alignment; used to
// view a v128_t as eight half-precision lanes.
typedef __fp16 __f16x8 __attribute__((__vector_size__(16), __aligned__(16)));

typedef signed char __i8x8 __attribute__((__vector_size__(8), __aligned__(8)));
typedef unsigned char __u8x8
Expand Down Expand Up @@ -1878,6 +1879,152 @@ wasm_i32x4_relaxed_dot_i8x16_i7x16_add(v128_t __a, v128_t __b, v128_t __c) {
(__i8x16)__a, (__i8x16)__b, (__i32x4)__c);
}

// FP16 intrinsics
//
// Attribute set applied to every f16x8 intrinsic in this section: always
// inlined, no debug info, compiled for the "fp16" target feature, and a
// minimum vector width of 128 bits.
#define __FP16_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("fp16"), \
__min_vector_width__(128)))

// Builds an f16x8 vector from the scalar __a via __builtin_wasm_splat_f16x8
// and returns it reinterpreted as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_splat(float __a) {
  __f16x8 __res = __builtin_wasm_splat_f16x8(__a);
  return (v128_t)__res;
}

// Returns lane __i of __a (viewed as __f16x8) as a float, using
// __builtin_wasm_extract_lane_f16x8. The lane index is annotated with
// __REQUIRE_CONSTANT.
static __inline__ float __FP16_FN_ATTRS wasm_f16x8_extract_lane(v128_t __a,
                                                                int __i)
    __REQUIRE_CONSTANT(__i) {
  __f16x8 __va = (__f16x8)__a;
  return __builtin_wasm_extract_lane_f16x8(__va, __i);
}

// Returns __a (viewed as __f16x8) with lane __i replaced by __b, using
// __builtin_wasm_replace_lane_f16x8. The lane index is annotated with
// __REQUIRE_CONSTANT.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_replace_lane(v128_t __a,
                                                                 int __i,
                                                                 float __b)
    __REQUIRE_CONSTANT(__i) {
  __f16x8 __va = (__f16x8)__a;
  return (v128_t)__builtin_wasm_replace_lane_f16x8(__va, __i, __b);
}

// Applies __builtin_wasm_abs_f16x8 to __a viewed as __f16x8 and returns the
// result as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_abs(v128_t __a) {
  __f16x8 __va = (__f16x8)__a;
  return (v128_t)__builtin_wasm_abs_f16x8(__va);
}

// Negates __a viewed as __f16x8 with the vector unary minus operator and
// returns the result as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_neg(v128_t __a) {
  __f16x8 __va = (__f16x8)__a;
  return (v128_t)(-__va);
}

// Applies __builtin_wasm_sqrt_f16x8 to __a viewed as __f16x8 and returns the
// result as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_sqrt(v128_t __a) {
  __f16x8 __va = (__f16x8)__a;
  return (v128_t)__builtin_wasm_sqrt_f16x8(__va);
}

// Applies __builtin_wasm_ceil_f16x8 to __a viewed as __f16x8 and returns the
// result as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_ceil(v128_t __a) {
  __f16x8 __va = (__f16x8)__a;
  return (v128_t)__builtin_wasm_ceil_f16x8(__va);
}

// Applies __builtin_wasm_floor_f16x8 to __a viewed as __f16x8 and returns the
// result as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_floor(v128_t __a) {
  __f16x8 __va = (__f16x8)__a;
  return (v128_t)__builtin_wasm_floor_f16x8(__va);
}

// Applies __builtin_wasm_trunc_f16x8 to __a viewed as __f16x8 and returns the
// result as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_trunc(v128_t __a) {
  __f16x8 __va = (__f16x8)__a;
  return (v128_t)__builtin_wasm_trunc_f16x8(__va);
}

// Applies __builtin_wasm_nearest_f16x8 to __a viewed as __f16x8 and returns
// the result as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_nearest(v128_t __a) {
  __f16x8 __va = (__f16x8)__a;
  return (v128_t)__builtin_wasm_nearest_f16x8(__va);
}

// Compares __a and __b as __f16x8 vectors with the vector == operator and
// returns the comparison result as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_eq(v128_t __a, v128_t __b) {
  __f16x8 __va = (__f16x8)__a;
  __f16x8 __vb = (__f16x8)__b;
  return (v128_t)(__va == __vb);
}

// Compares __a and __b as __f16x8 vectors with the vector != operator and
// returns the comparison result as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_ne(v128_t __a, v128_t __b) {
  __f16x8 __va = (__f16x8)__a;
  __f16x8 __vb = (__f16x8)__b;
  return (v128_t)(__va != __vb);
}

// Compares __a and __b as __f16x8 vectors with the vector < operator and
// returns the comparison result as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_lt(v128_t __a, v128_t __b) {
  __f16x8 __va = (__f16x8)__a;
  __f16x8 __vb = (__f16x8)__b;
  return (v128_t)(__va < __vb);
}

// Compares __a and __b as __f16x8 vectors with the vector > operator and
// returns the comparison result as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_gt(v128_t __a, v128_t __b) {
  __f16x8 __va = (__f16x8)__a;
  __f16x8 __vb = (__f16x8)__b;
  return (v128_t)(__va > __vb);
}

// Compares __a and __b as __f16x8 vectors with the vector <= operator and
// returns the comparison result as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_le(v128_t __a, v128_t __b) {
  __f16x8 __va = (__f16x8)__a;
  __f16x8 __vb = (__f16x8)__b;
  return (v128_t)(__va <= __vb);
}

// Compares __a and __b as __f16x8 vectors with the vector >= operator and
// returns the comparison result as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_ge(v128_t __a, v128_t __b) {
  __f16x8 __va = (__f16x8)__a;
  __f16x8 __vb = (__f16x8)__b;
  return (v128_t)(__va >= __vb);
}

// Adds __a and __b as __f16x8 vectors with the vector + operator and returns
// the sum as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_add(v128_t __a,
                                                        v128_t __b) {
  __f16x8 __va = (__f16x8)__a;
  __f16x8 __vb = (__f16x8)__b;
  return (v128_t)(__va + __vb);
}

// Subtracts __b from __a as __f16x8 vectors with the vector - operator and
// returns the difference as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_sub(v128_t __a,
                                                        v128_t __b) {
  __f16x8 __va = (__f16x8)__a;
  __f16x8 __vb = (__f16x8)__b;
  return (v128_t)(__va - __vb);
}

// Multiplies __a and __b as __f16x8 vectors with the vector * operator and
// returns the product as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_mul(v128_t __a,
                                                        v128_t __b) {
  __f16x8 __va = (__f16x8)__a;
  __f16x8 __vb = (__f16x8)__b;
  return (v128_t)(__va * __vb);
}

// Divides __a by __b as __f16x8 vectors with the vector / operator and
// returns the quotient as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_div(v128_t __a,
                                                        v128_t __b) {
  __f16x8 __va = (__f16x8)__a;
  __f16x8 __vb = (__f16x8)__b;
  return (v128_t)(__va / __vb);
}

// Forwards __a and __b, viewed as __f16x8, to __builtin_wasm_min_f16x8 and
// returns the result as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_min(v128_t __a,
                                                        v128_t __b) {
  __f16x8 __va = (__f16x8)__a;
  __f16x8 __vb = (__f16x8)__b;
  return (v128_t)__builtin_wasm_min_f16x8(__va, __vb);
}

// Forwards __a and __b, viewed as __f16x8, to __builtin_wasm_max_f16x8 and
// returns the result as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_max(v128_t __a,
                                                        v128_t __b) {
  __f16x8 __va = (__f16x8)__a;
  __f16x8 __vb = (__f16x8)__b;
  return (v128_t)__builtin_wasm_max_f16x8(__va, __vb);
}

// Forwards __a and __b, viewed as __f16x8, to __builtin_wasm_pmin_f16x8 and
// returns the result as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_pmin(v128_t __a,
                                                         v128_t __b) {
  __f16x8 __va = (__f16x8)__a;
  __f16x8 __vb = (__f16x8)__b;
  return (v128_t)__builtin_wasm_pmin_f16x8(__va, __vb);
}

// Forwards __a and __b, viewed as __f16x8, to __builtin_wasm_pmax_f16x8 and
// returns the result as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_pmax(v128_t __a,
                                                         v128_t __b) {
  __f16x8 __va = (__f16x8)__a;
  __f16x8 __vb = (__f16x8)__b;
  return (v128_t)__builtin_wasm_pmax_f16x8(__va, __vb);
}

// Converts __a, viewed as __f16x8, with the signed saturating truncation
// builtin __builtin_wasm_trunc_saturate_s_i16x8_f16x8 and returns the result
// as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS
wasm_i16x8_trunc_sat_f16x8(v128_t __a) {
  __f16x8 __va = (__f16x8)__a;
  return (v128_t)__builtin_wasm_trunc_saturate_s_i16x8_f16x8(__va);
}

// Converts __a, viewed as __f16x8, with the unsigned saturating truncation
// builtin __builtin_wasm_trunc_saturate_u_i16x8_f16x8 and returns the result
// as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS
wasm_u16x8_trunc_sat_f16x8(v128_t __a) {
  __f16x8 __va = (__f16x8)__a;
  return (v128_t)__builtin_wasm_trunc_saturate_u_i16x8_f16x8(__va);
}

// Converts __a, viewed as __i16x8, to __f16x8 with __builtin_convertvector
// and returns the result as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_convert_i16x8(v128_t __a) {
  __i16x8 __src = (__i16x8)__a;
  return (v128_t)__builtin_convertvector(__src, __f16x8);
}

// Converts __a, viewed as __u16x8, to __f16x8 with __builtin_convertvector
// and returns the result as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_convert_u16x8(v128_t __a) {
  __u16x8 __src = (__u16x8)__a;
  return (v128_t)__builtin_convertvector(__src, __f16x8);
}

// Forwards __a, __b and __c, each viewed as __f16x8, to
// __builtin_wasm_relaxed_madd_f16x8 (in that argument order) and returns the
// result as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_relaxed_madd(v128_t __a,
                                                                 v128_t __b,
                                                                 v128_t __c) {
  __f16x8 __va = (__f16x8)__a;
  __f16x8 __vb = (__f16x8)__b;
  __f16x8 __vc = (__f16x8)__c;
  return (v128_t)__builtin_wasm_relaxed_madd_f16x8(__va, __vb, __vc);
}

// Forwards __a, __b and __c, each viewed as __f16x8, to
// __builtin_wasm_relaxed_nmadd_f16x8 (in that argument order) and returns the
// result as v128_t.
static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_relaxed_nmadd(v128_t __a,
                                                                  v128_t __b,
                                                                  v128_t __c) {
  __f16x8 __va = (__f16x8)__a;
  __f16x8 __vb = (__f16x8)__b;
  __f16x8 __vc = (__f16x8)__c;
  return (v128_t)__builtin_wasm_relaxed_nmadd_f16x8(__va, __vb, __vc);
}

// Deprecated intrinsics

static __inline__ v128_t __DEPRECATED_FN_ATTRS("wasm_i8x16_swizzle")
Expand Down
Loading

0 comments on commit 5703d85

Please sign in to comment.