From c89624555f75f170000ffa1185a4c0b83b3527a4 Mon Sep 17 00:00:00 2001 From: SakiTakamachi Date: Thu, 24 Apr 2025 12:27:47 +0900 Subject: [PATCH 01/10] Added zend_simd.h --- Zend/zend_simd.h | 106 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 Zend/zend_simd.h diff --git a/Zend/zend_simd.h b/Zend/zend_simd.h new file mode 100644 index 0000000000000..359b7f36acbc9 --- /dev/null +++ b/Zend/zend_simd.h @@ -0,0 +1,106 @@ +/* + +----------------------------------------------------------------------+ + | Zend Engine | + +----------------------------------------------------------------------+ + | Copyright (c) Zend Technologies Ltd. (http://www.zend.com) | + +----------------------------------------------------------------------+ + | This source file is subject to version 2.00 of the Zend license, | + | that is bundled with this package in the file LICENSE, and is | + | available through the world-wide-web at the following url: | + | http://www.zend.com/license/2_00.txt. | + | If you did not receive a copy of the Zend license and are unable to | + | obtain it through the world-wide-web, please send a note to | + | license@zend.com so we can mail you a copy immediately. | + +----------------------------------------------------------------------+ + | Authors: Saki Takamachi | + +----------------------------------------------------------------------+ +*/ + +#ifndef ZEND_SIMD_H +#define ZEND_SIMD_H + +#ifdef __SSE2__ +#include +#define ZEND_HAVE_VECTOR_128 + +typedef __m128i zend_vec_8x16_t; +typedef __m128i zend_vec_16x8_t; +typedef __m128i zend_vec_32x4_t; +typedef __m128i zend_vec_64x2_t; + +#define zend_vec_setzero_8x16() _mm_setzero_si128() +#define zend_vec_set_8x16(x) _mm_set1_epi8(x) +#define zend_vec_set_8x16_from_16x8(x0, x1, x2, x3, x4, x5, x6, x7) _mm_set_epi16(x0, x1, x2, x3, x4, x5, x6, x7) +#define zend_vec_set_8x16_from_32x4(x0, x1, x2, x3) _mm_set_epi32(x0, x1, x2, x3) +#define zend_vec_set_8x16_from_64x2(x0, x1) _mm_set_epi64(x0, x1) +#define zend_vec_load_8x16(x) _mm_load_si128((const __m128i *) (x)) +#define zend_vec_loadu_8x16(x) _mm_loadu_si128((const __m128i *) (x)) +#define zend_vec_store_8x16(to, x) _mm_store_si128((__m128i *) (to), x) +#define zend_vec_storeu_8x16(to, x) _mm_storeu_si128((__m128i *) (to), x) + +#define zend_vec_or_8x16(a, b) _mm_or_si128(a, b) +#define zend_vec_xor_8x16(a, b) _mm_xor_si128(a, b) +#define zend_vec_and_8x16(a, b) _mm_and_si128(a, b) +#define zend_vec_rshift_128_from_8x16(x, bytes) _mm_srli_si128(x, bytes) +#define zend_vec_lshift_128_from_8x16(x, bytes) _mm_slli_si128(x, bytes) + +#define zend_vec_add_8x16(a, b) _mm_add_epi8(a, b) + +#define zend_vec_cmpeq_8x16(a, b) _mm_cmpeq_epi8(a, b) +#define zend_vec_cmplt_8x16(a, b) _mm_cmplt_epi8(a, b) +#define zend_vec_cmpgt_8x16(a, b) _mm_cmpgt_epi8(a, b) + +#define zend_vec_movemask_8x16(x) _mm_movemask_epi8(x) + + +#elif defined(__aarch64__) || defined(_M_ARM64) +#include +#define ZEND_HAVE_VECTOR_128 + +typedef int8x16_t zend_vec_8x16_t; +typedef int16x8_t zend_vec_16x8_t; +typedef int32x4_t zend_vec_32x4_t; +typedef int64x2_t zend_vec_64x2_t; + +#define zend_vec_setzero_8x16() vdupq_n_s8(0) +#define zend_vec_set_8x16(x) vdupq_n_s8(x) +#define zend_vec_set_8x16_from_16x8(x0, x1, x2, x3, x4, x5, x6, x7) \ + vreinterpretq_s8_s16((int16x8_t) { \ + (int16_t) (x7), (int16_t) (x6), (int16_t) (x5), (int16_t) (x4), \ + (int16_t) (x3), (int16_t) (x2), (int16_t) (x1), (int16_t) (x0) }) +#define zend_vec_set_8x16_from_32x4(x0, x1, x2, x3) \ + vreinterpretq_s8_s32((int32x4_t) { (int32_t) (x3), (int32_t) (x2), (int32_t) (x1), (int32_t) (x0) }) +#define zend_vec_set_8x16_from_64x2(x0, x1) vreinterpretq_s8_s64((int64x2_t) { (int64_t) (x1), (int64_t) (x0) }) +#define zend_vec_load_8x16(x) vld1q_s8((const int8_t *) (x)) +#define zend_vec_loadu_8x16(x) zend_vec_load_8x16(x) +#define zend_vec_store_8x16(to, x) vst1q_s8((int8_t *) (to), x) +#define zend_vec_storeu_8x16(to, x) zend_vec_store_8x16(to, x) + +#define zend_vec_or_8x16(a, b) vorrq_s8(a, b) +#define zend_vec_xor_8x16(a, b) veorq_s8(a, b) +#define zend_vec_and_8x16(a, b) vandq_s8(a, b) +#define zend_vec_rshift_128_from_8x16(x, bytes) vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), bytes)) +#define zend_vec_lshift_128_from_8x16(x, bytes) vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 16 - bytes)) + +#define zend_vec_add_8x16(a, b) vaddq_s8(a, b) + +#define zend_vec_cmpeq_8x16(a, b) (vreinterpretq_s8_u8(vceqq_s8(a, b))) +#define zend_vec_cmplt_8x16(a, b) (vreinterpretq_s8_u8(vcltq_s8(a, b))) +#define zend_vec_cmpgt_8x16(a, b) (vreinterpretq_s8_u8(vcgtq_s8(a, b))) + +static zend_always_inline int zend_vec_movemask_8x16(int8x16_t x) +{ + /** + * based on code from + * https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon + */ + uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_s8(x), 7)); + uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); + uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); + uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); + return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); +} + +#endif + +#endif /* ZEND_SIMD_H */ From 178000fc869929dedeb180d464e3ef7adb5429b3 Mon Sep 17 00:00:00 2001 From: SakiTakamachi Date: Thu, 24 Apr 2025 12:28:09 +0900 Subject: [PATCH 02/10] use zend_simd in url.c --- ext/standard/url.c | 77 ++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/ext/standard/url.c b/ext/standard/url.c index da2ddea067314..82f00d1223dac 100644 --- a/ext/standard/url.c +++ b/ext/standard/url.c @@ -19,14 +19,11 @@ #include #include -#ifdef __SSE2__ -#include -#endif - #include "php.h" #include "url.h" #include "file.h" +#include "zend_simd.h" /* {{{ free_url */ PHPAPI void php_url_free(php_url *theurl) @@ -460,53 +457,53 @@ static zend_always_inline zend_string *php_url_encode_impl(const char *s, size_t start = zend_string_safe_alloc(3, len, 0, 0); to = (unsigned char*)ZSTR_VAL(start); -#ifdef __SSE2__ +#ifdef ZEND_HAVE_VECTOR_128 while (from + 16 < end) { - __m128i mask; + zend_vec_8x16_t mask; uint32_t bits; - const __m128i _A = _mm_set1_epi8('A' - 1); - const __m128i Z_ = _mm_set1_epi8('Z' + 1); - const __m128i _a = _mm_set1_epi8('a' - 1); - const __m128i z_ = _mm_set1_epi8('z' + 1); - const __m128i _zero = _mm_set1_epi8('0' - 1); - const __m128i nine_ = _mm_set1_epi8('9' + 1); - const __m128i dot = _mm_set1_epi8('.'); - const __m128i minus = _mm_set1_epi8('-'); - const __m128i under = _mm_set1_epi8('_'); - - __m128i in = _mm_loadu_si128((__m128i *)from); - - __m128i gt = _mm_cmpgt_epi8(in, _A); - __m128i lt = _mm_cmplt_epi8(in, Z_); - mask = _mm_and_si128(lt, gt); /* upper */ - gt = _mm_cmpgt_epi8(in, _a); - lt = _mm_cmplt_epi8(in, z_); - mask = _mm_or_si128(mask, _mm_and_si128(lt, gt)); /* lower */ - gt = _mm_cmpgt_epi8(in, _zero); - lt = _mm_cmplt_epi8(in, nine_); - mask = _mm_or_si128(mask, _mm_and_si128(lt, gt)); /* number */ - mask = _mm_or_si128(mask, _mm_cmpeq_epi8(in, dot)); - mask = _mm_or_si128(mask, _mm_cmpeq_epi8(in, minus)); - mask = _mm_or_si128(mask, _mm_cmpeq_epi8(in, under)); + const zend_vec_8x16_t _A = zend_vec_set_8x16('A' - 1); + const zend_vec_8x16_t Z_ = zend_vec_set_8x16('Z' + 1); + const zend_vec_8x16_t _a = zend_vec_set_8x16('a' - 1); + const zend_vec_8x16_t z_ = zend_vec_set_8x16('z' + 1); + const zend_vec_8x16_t _zero = zend_vec_set_8x16('0' - 1); + const zend_vec_8x16_t nine_ = zend_vec_set_8x16('9' + 1); + const zend_vec_8x16_t dot = zend_vec_set_8x16('.'); + const zend_vec_8x16_t minus = zend_vec_set_8x16('-'); + const zend_vec_8x16_t under = zend_vec_set_8x16('_'); + + zend_vec_8x16_t in = zend_vec_loadu_8x16(from); + + zend_vec_8x16_t gt = zend_vec_cmpgt_8x16(in, _A); + zend_vec_8x16_t lt = zend_vec_cmplt_8x16(in, Z_); + mask = zend_vec_and_8x16(lt, gt); /* upper */ + gt = zend_vec_cmpgt_8x16(in, _a); + lt = zend_vec_cmplt_8x16(in, z_); + mask = zend_vec_or_8x16(mask, zend_vec_and_8x16(lt, gt)); /* lower */ + gt = zend_vec_cmpgt_8x16(in, _zero); + lt = zend_vec_cmplt_8x16(in, nine_); + mask = zend_vec_or_8x16(mask, zend_vec_and_8x16(lt, gt)); /* number */ + mask = zend_vec_or_8x16(mask, zend_vec_cmpeq_8x16(in, dot)); + mask = zend_vec_or_8x16(mask, zend_vec_cmpeq_8x16(in, minus)); + mask = zend_vec_or_8x16(mask, zend_vec_cmpeq_8x16(in, under)); if (!raw) { - const __m128i blank = _mm_set1_epi8(' '); - __m128i eq = _mm_cmpeq_epi8(in, blank); - if (_mm_movemask_epi8(eq)) { - in = _mm_add_epi8(in, _mm_and_si128(eq, _mm_set1_epi8('+' - ' '))); - mask = _mm_or_si128(mask, eq); + const zend_vec_8x16_t blank = zend_vec_set_8x16(' '); + zend_vec_8x16_t eq = zend_vec_cmpeq_8x16(in, blank); + if (zend_vec_movemask_8x16(eq)) { + in = zend_vec_add_8x16(in, zend_vec_and_8x16(eq, zend_vec_set_8x16('+' - ' '))); + mask = zend_vec_or_8x16(mask, eq); } } if (raw) { - const __m128i wavy = _mm_set1_epi8('~'); - mask = _mm_or_si128(mask, _mm_cmpeq_epi8(in, wavy)); + const zend_vec_8x16_t wavy = zend_vec_set_8x16('~'); + mask = zend_vec_or_8x16(mask, zend_vec_cmpeq_8x16(in, wavy)); } - if (((bits = _mm_movemask_epi8(mask)) & 0xffff) == 0xffff) { - _mm_storeu_si128((__m128i*)to, in); + if (((bits = zend_vec_movemask_8x16(mask)) & 0xffff) == 0xffff) { + zend_vec_storeu_8x16(to, in); to += 16; } else { unsigned char xmm[16]; - _mm_storeu_si128((__m128i*)xmm, in); + zend_vec_storeu_8x16(xmm, in); for (size_t i = 0; i < sizeof(xmm); i++) { if ((bits & (0x1 << i))) { *to++ = xmm[i]; From 37f66bc0bb6089e6b67f703c2aa3eb8a3fe2d529 Mon Sep 17 00:00:00 2001 From: SakiTakamachi Date: Thu, 24 Apr 2025 12:28:39 +0900 Subject: [PATCH 03/10] use zend_simd.h in ZendAccelerator.c --- ext/opcache/ZendAccelerator.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/ext/opcache/ZendAccelerator.c b/ext/opcache/ZendAccelerator.c index 704846c4a860f..7a4cce9739eb1 100644 --- a/ext/opcache/ZendAccelerator.c +++ b/ext/opcache/ZendAccelerator.c @@ -98,6 +98,8 @@ typedef int gid_t; #include #endif +#include "zend_simd.h" + ZEND_EXTENSION(); #ifndef ZTS @@ -171,16 +173,16 @@ static void bzero_aligned(void *mem, size_t size) _mm256_store_si256((__m256i*)(p+32), ymm0); p += 64; } -#elif defined(__SSE2__) +#elif defined(ZEND_HAVE_VECTOR_128) char *p = (char*)mem; char *end = p + size; - __m128i xmm0 = _mm_setzero_si128(); + zend_vec_8x16_t xmm0 = zend_vec_setzero_8x16(); while (p < end) { - _mm_store_si128((__m128i*)p, xmm0); - _mm_store_si128((__m128i*)(p+16), xmm0); - _mm_store_si128((__m128i*)(p+32), xmm0); - _mm_store_si128((__m128i*)(p+48), xmm0); + zend_vec_store_8x16(p, xmm0); + zend_vec_store_8x16((p+16), xmm0); + zend_vec_store_8x16((p+32), xmm0); + zend_vec_store_8x16((p+48), xmm0); p += 64; } #else From 004123aeff80961961c48a747459ac1ec118fe58 Mon Sep 17 00:00:00 2001 From: SakiTakamachi Date: Mon, 28 Apr 2025 20:50:15 +0900 Subject: [PATCH 04/10] Use zend_simd.h as a wrapper for neon --- Zend/zend_simd.h | 72 ++++++++++------------------------- ext/opcache/ZendAccelerator.c | 10 ++--- ext/standard/url.c | 70 +++++++++++++++++----------------- 3 files changed, 60 insertions(+), 92 deletions(-) diff --git a/Zend/zend_simd.h b/Zend/zend_simd.h index 359b7f36acbc9..71ec02a52a064 100644 --- a/Zend/zend_simd.h +++ b/Zend/zend_simd.h @@ -23,72 +23,40 @@ #include #define ZEND_HAVE_VECTOR_128 -typedef __m128i zend_vec_8x16_t; -typedef __m128i zend_vec_16x8_t; -typedef __m128i zend_vec_32x4_t; -typedef __m128i zend_vec_64x2_t; - -#define zend_vec_setzero_8x16() _mm_setzero_si128() -#define zend_vec_set_8x16(x) _mm_set1_epi8(x) -#define zend_vec_set_8x16_from_16x8(x0, x1, x2, x3, x4, x5, x6, x7) _mm_set_epi16(x0, x1, x2, x3, x4, x5, x6, x7) -#define zend_vec_set_8x16_from_32x4(x0, x1, x2, x3) _mm_set_epi32(x0, x1, x2, x3) -#define zend_vec_set_8x16_from_64x2(x0, x1) _mm_set_epi64(x0, x1) -#define zend_vec_load_8x16(x) _mm_load_si128((const __m128i *) (x)) -#define zend_vec_loadu_8x16(x) _mm_loadu_si128((const __m128i *) (x)) -#define zend_vec_store_8x16(to, x) _mm_store_si128((__m128i *) (to), x) -#define zend_vec_storeu_8x16(to, x) _mm_storeu_si128((__m128i *) (to), x) - -#define zend_vec_or_8x16(a, b) _mm_or_si128(a, b) -#define zend_vec_xor_8x16(a, b) _mm_xor_si128(a, b) -#define zend_vec_and_8x16(a, b) _mm_and_si128(a, b) -#define zend_vec_rshift_128_from_8x16(x, bytes) _mm_srli_si128(x, bytes) -#define zend_vec_lshift_128_from_8x16(x, bytes) _mm_slli_si128(x, bytes) - -#define zend_vec_add_8x16(a, b) _mm_add_epi8(a, b) - -#define zend_vec_cmpeq_8x16(a, b) _mm_cmpeq_epi8(a, b) -#define zend_vec_cmplt_8x16(a, b) _mm_cmplt_epi8(a, b) -#define zend_vec_cmpgt_8x16(a, b) _mm_cmpgt_epi8(a, b) - -#define zend_vec_movemask_8x16(x) _mm_movemask_epi8(x) - #elif defined(__aarch64__) || defined(_M_ARM64) #include #define ZEND_HAVE_VECTOR_128 -typedef int8x16_t zend_vec_8x16_t; -typedef int16x8_t zend_vec_16x8_t; -typedef int32x4_t zend_vec_32x4_t; -typedef int64x2_t zend_vec_64x2_t; +typedef int8x16_t __m128i; -#define zend_vec_setzero_8x16() vdupq_n_s8(0) -#define zend_vec_set_8x16(x) vdupq_n_s8(x) -#define zend_vec_set_8x16_from_16x8(x0, x1, x2, x3, x4, x5, x6, x7) \ +#define _mm_setzero_si128() vdupq_n_s8(0) +#define _mm_set1_epi8(x) vdupq_n_s8(x) +#define _mm_set_epi16(x0, x1, x2, x3, x4, x5, x6, x7) \ vreinterpretq_s8_s16((int16x8_t) { \ (int16_t) (x7), (int16_t) (x6), (int16_t) (x5), (int16_t) (x4), \ (int16_t) (x3), (int16_t) (x2), (int16_t) (x1), (int16_t) (x0) }) -#define zend_vec_set_8x16_from_32x4(x0, x1, x2, x3) \ +#define _mm_set_epi32(x0, x1, x2, x3) \ vreinterpretq_s8_s32((int32x4_t) { (int32_t) (x3), (int32_t) (x2), (int32_t) (x1), (int32_t) (x0) }) -#define zend_vec_set_8x16_from_64x2(x0, x1) vreinterpretq_s8_s64((int64x2_t) { (int64_t) (x1), (int64_t) (x0) }) -#define zend_vec_load_8x16(x) vld1q_s8((const int8_t *) (x)) -#define zend_vec_loadu_8x16(x) zend_vec_load_8x16(x) -#define zend_vec_store_8x16(to, x) vst1q_s8((int8_t *) (to), x) -#define zend_vec_storeu_8x16(to, x) zend_vec_store_8x16(to, x) +#define _mm_set_epi64(x0, x1) vreinterpretq_s8_s64((int64x2_t) { (int64_t) (x1), (int64_t) (x0) }) +#define _mm_load_si128(x) vld1q_s8((const int8_t *) (x)) +#define _mm_loadu_si128(x) _mm_load_si128(x) +#define _mm_store_si128(to, x) vst1q_s8((int8_t *) (to), x) +#define _mm_storeu_si128(to, x) _mm_store_si128(to, x) -#define zend_vec_or_8x16(a, b) vorrq_s8(a, b) -#define zend_vec_xor_8x16(a, b) veorq_s8(a, b) -#define zend_vec_and_8x16(a, b) vandq_s8(a, b) -#define zend_vec_rshift_128_from_8x16(x, bytes) vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), bytes)) -#define zend_vec_lshift_128_from_8x16(x, bytes) vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 16 - bytes)) +#define _mm_or_si128(a, b) vorrq_s8(a, b) +#define _mm_xor_si128(a, b) veorq_s8(a, b) +#define _mm_and_si128(a, b) vandq_s8(a, b) +#define _mm_srli_si128(x, bytes) vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), bytes)) +#define _mm_slli_si128(x, bytes) vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 16 - bytes)) -#define zend_vec_add_8x16(a, b) vaddq_s8(a, b) +#define _mm_add_epi8(a, b) vaddq_s8(a, b) -#define zend_vec_cmpeq_8x16(a, b) (vreinterpretq_s8_u8(vceqq_s8(a, b))) -#define zend_vec_cmplt_8x16(a, b) (vreinterpretq_s8_u8(vcltq_s8(a, b))) -#define zend_vec_cmpgt_8x16(a, b) (vreinterpretq_s8_u8(vcgtq_s8(a, b))) +#define _mm_cmpeq_epi8(a, b) (vreinterpretq_s8_u8(vceqq_s8(a, b))) +#define _mm_cmplt_epi8(a, b) (vreinterpretq_s8_u8(vcltq_s8(a, b))) +#define _mm_cmpgt_epi8(a, b) (vreinterpretq_s8_u8(vcgtq_s8(a, b))) -static zend_always_inline int zend_vec_movemask_8x16(int8x16_t x) +static zend_always_inline int _mm_movemask_epi8(int8x16_t x) { /** * based on code from diff --git a/ext/opcache/ZendAccelerator.c b/ext/opcache/ZendAccelerator.c index 7a4cce9739eb1..bffb72fb59c81 100644 --- a/ext/opcache/ZendAccelerator.c +++ b/ext/opcache/ZendAccelerator.c @@ -176,13 +176,13 @@ static void bzero_aligned(void *mem, size_t size) #elif defined(ZEND_HAVE_VECTOR_128) char *p = (char*)mem; char *end = p + size; - zend_vec_8x16_t xmm0 = zend_vec_setzero_8x16(); + __m128i xmm0 = _mm_setzero_si128(); while (p < end) { - zend_vec_store_8x16(p, xmm0); - zend_vec_store_8x16((p+16), xmm0); - zend_vec_store_8x16((p+32), xmm0); - zend_vec_store_8x16((p+48), xmm0); + _mm_store_si128((__m128i*)p, xmm0); + _mm_store_si128((__m128i*)(p+16), xmm0); + _mm_store_si128((__m128i*)(p+32), xmm0); + _mm_store_si128((__m128i*)(p+48), xmm0); p += 64; } #else diff --git a/ext/standard/url.c b/ext/standard/url.c index 82f00d1223dac..e5d8a58966db9 100644 --- a/ext/standard/url.c +++ b/ext/standard/url.c @@ -459,51 +459,51 @@ static zend_always_inline zend_string *php_url_encode_impl(const char *s, size_t #ifdef ZEND_HAVE_VECTOR_128 while (from + 16 < end) { - zend_vec_8x16_t mask; + __m128i mask; uint32_t bits; - const zend_vec_8x16_t _A = zend_vec_set_8x16('A' - 1); - const zend_vec_8x16_t Z_ = zend_vec_set_8x16('Z' + 1); - const zend_vec_8x16_t _a = zend_vec_set_8x16('a' - 1); - const zend_vec_8x16_t z_ = zend_vec_set_8x16('z' + 1); - const zend_vec_8x16_t _zero = zend_vec_set_8x16('0' - 1); - const zend_vec_8x16_t nine_ = zend_vec_set_8x16('9' + 1); - const zend_vec_8x16_t dot = zend_vec_set_8x16('.'); - const zend_vec_8x16_t minus = zend_vec_set_8x16('-'); - const zend_vec_8x16_t under = zend_vec_set_8x16('_'); - - zend_vec_8x16_t in = zend_vec_loadu_8x16(from); - - zend_vec_8x16_t gt = zend_vec_cmpgt_8x16(in, _A); - zend_vec_8x16_t lt = zend_vec_cmplt_8x16(in, Z_); - mask = zend_vec_and_8x16(lt, gt); /* upper */ - gt = zend_vec_cmpgt_8x16(in, _a); - lt = zend_vec_cmplt_8x16(in, z_); - mask = zend_vec_or_8x16(mask, zend_vec_and_8x16(lt, gt)); /* lower */ - gt = zend_vec_cmpgt_8x16(in, _zero); - lt = zend_vec_cmplt_8x16(in, nine_); - mask = zend_vec_or_8x16(mask, zend_vec_and_8x16(lt, gt)); /* number */ - mask = zend_vec_or_8x16(mask, zend_vec_cmpeq_8x16(in, dot)); - mask = zend_vec_or_8x16(mask, zend_vec_cmpeq_8x16(in, minus)); - mask = zend_vec_or_8x16(mask, zend_vec_cmpeq_8x16(in, under)); + const __m128i _A = _mm_set1_epi8('A' - 1); + const __m128i Z_ = _mm_set1_epi8('Z' + 1); + const __m128i _a = _mm_set1_epi8('a' - 1); + const __m128i z_ = _mm_set1_epi8('z' + 1); + const __m128i _zero = _mm_set1_epi8('0' - 1); + const __m128i nine_ = _mm_set1_epi8('9' + 1); + const __m128i dot = _mm_set1_epi8('.'); + const __m128i minus = _mm_set1_epi8('-'); + const __m128i under = _mm_set1_epi8('_'); + + __m128i in = _mm_loadu_si128((__m128i *)from); + + __m128i gt = _mm_cmpgt_epi8(in, _A); + __m128i lt = _mm_cmplt_epi8(in, Z_); + mask = _mm_and_si128(lt, gt); /* upper */ + gt = _mm_cmpgt_epi8(in, _a); + lt = _mm_cmplt_epi8(in, z_); + mask = _mm_or_si128(mask, _mm_and_si128(lt, gt)); /* lower */ + gt = _mm_cmpgt_epi8(in, _zero); + lt = _mm_cmplt_epi8(in, nine_); + mask = _mm_or_si128(mask, _mm_and_si128(lt, gt)); /* number */ + mask = _mm_or_si128(mask, _mm_cmpeq_epi8(in, dot)); + mask = _mm_or_si128(mask, _mm_cmpeq_epi8(in, minus)); + mask = _mm_or_si128(mask, _mm_cmpeq_epi8(in, under)); if (!raw) { - const zend_vec_8x16_t blank = zend_vec_set_8x16(' '); - zend_vec_8x16_t eq = zend_vec_cmpeq_8x16(in, blank); - if (zend_vec_movemask_8x16(eq)) { - in = zend_vec_add_8x16(in, zend_vec_and_8x16(eq, zend_vec_set_8x16('+' - ' '))); - mask = zend_vec_or_8x16(mask, eq); + const __m128i blank = _mm_set1_epi8(' '); + __m128i eq = _mm_cmpeq_epi8(in, blank); + if (_mm_movemask_epi8(eq)) { + in = _mm_add_epi8(in, _mm_and_si128(eq, _mm_set1_epi8('+' - ' '))); + mask = _mm_or_si128(mask, eq); } } if (raw) { - const zend_vec_8x16_t wavy = zend_vec_set_8x16('~'); - mask = zend_vec_or_8x16(mask, zend_vec_cmpeq_8x16(in, wavy)); + const __m128i wavy = _mm_set1_epi8('~'); + mask = _mm_or_si128(mask, _mm_cmpeq_epi8(in, wavy)); } - if (((bits = zend_vec_movemask_8x16(mask)) & 0xffff) == 0xffff) { - zend_vec_storeu_8x16(to, in); + if (((bits = _mm_movemask_epi8(mask)) & 0xffff) == 0xffff) { + _mm_storeu_si128((__m128i*)to, in); to += 16; } else { unsigned char xmm[16]; - zend_vec_storeu_8x16(xmm, in); + _mm_storeu_si128((__m128i*)xmm, in); for (size_t i = 0; i < sizeof(xmm); i++) { if ((bits & (0x1 << i))) { *to++ = xmm[i]; From 59efacfa12255cc7fab48a6afe8567f6ba3c1006 Mon Sep 17 00:00:00 2001 From: SakiTakamachi Date: Mon, 28 Apr 2025 20:56:23 +0900 Subject: [PATCH 05/10] use zend_simd.h in string.c --- ext/standard/string.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ext/standard/string.c b/ext/standard/string.c index 1e20791eb61ce..7a282442659e4 100644 --- a/ext/standard/string.c +++ b/ext/standard/string.c @@ -46,10 +46,11 @@ #include "ext/random/php_random.h" #ifdef __SSE2__ -#include #include "Zend/zend_bitset.h" #endif +#include "zend_simd.h" + /* this is read-only, so it's ok */ ZEND_SET_ALIGNED(16, static const char hexconvtab[]) = "0123456789abcdef"; @@ -2817,7 +2818,7 @@ static zend_string *php_strtr_ex(zend_string *str, const char *str_from, const c char *input = ZSTR_VAL(str); size_t len = ZSTR_LEN(str); -#ifdef __SSE2__ +#ifdef ZEND_HAVE_VECTOR_128 if (ZSTR_LEN(str) >= sizeof(__m128i)) { __m128i search = _mm_set1_epi8(ch_from); __m128i delta = _mm_set1_epi8(ch_to - ch_from); @@ -3037,7 +3038,7 @@ static zend_always_inline zend_long count_chars(const char *p, zend_long length, zend_long count = 0; const char *endp; -#ifdef __SSE2__ +#ifdef ZEND_HAVE_VECTOR_128 if (length >= sizeof(__m128i)) { __m128i search = _mm_set1_epi8(ch); @@ -5835,7 +5836,7 @@ static zend_string *php_str_rot13(zend_string *str) e = p + ZSTR_LEN(str); target = ZSTR_VAL(ret); -#ifdef __SSE2__ +#ifdef ZEND_HAVE_VECTOR_128 if (e - p > 15) { const __m128i a_minus_1 = _mm_set1_epi8('a' - 1); const __m128i m_plus_1 = _mm_set1_epi8('m' + 1); From f142b6e6032c08b8fb8102fff8c7710390e68dbe Mon Sep 17 00:00:00 2001 From: SakiTakamachi Date: Mon, 28 Apr 2025 21:05:26 +0900 Subject: [PATCH 06/10] use zend_simd.h in bcmath --- ext/bcmath/libbcmath/src/convert.c | 20 +++++----- ext/bcmath/libbcmath/src/simd.h | 59 ------------------------------ ext/bcmath/libbcmath/src/str2num.c | 36 +++++++++--------- 3 files changed, 28 insertions(+), 87 deletions(-) delete mode 100644 ext/bcmath/libbcmath/src/simd.h diff --git a/ext/bcmath/libbcmath/src/convert.c b/ext/bcmath/libbcmath/src/convert.c index 5438b4c1c44e5..f57f33fda043f 100644 --- a/ext/bcmath/libbcmath/src/convert.c +++ b/ext/bcmath/libbcmath/src/convert.c @@ -17,22 +17,22 @@ #include "bcmath.h" #include "convert.h" #include "private.h" -#include "simd.h" +#include "zend_simd.h" char *bc_copy_and_toggle_bcd(char *restrict dest, const char *source, const char *source_end) { const size_t bulk_shift = SWAR_REPEAT('0'); -#ifdef HAVE_BC_SIMD_128 +#ifdef ZEND_HAVE_VECTOR_128 /* SIMD SSE2 or NEON bulk shift + copy */ - bc_simd_128_t shift_vector = bc_simd_set_8x16('0'); - while (source + sizeof(bc_simd_128_t) <= source_end) { - bc_simd_128_t bytes = bc_simd_load_8x16((const bc_simd_128_t *) source); - bytes = bc_simd_xor_8x16(bytes, shift_vector); - bc_simd_store_8x16((bc_simd_128_t *) dest, bytes); - - source += sizeof(bc_simd_128_t); - dest += sizeof(bc_simd_128_t); + __m128i shift_vector = _mm_set1_epi8('0'); + while (source + sizeof(__m128i) <= source_end) { + __m128i bytes = _mm_loadu_si128((const __m128i *) source); + bytes = _mm_xor_si128(bytes, shift_vector); + _mm_storeu_si128((__m128i *) dest, bytes); + + source += sizeof(__m128i); + dest += sizeof(__m128i); } #endif diff --git a/ext/bcmath/libbcmath/src/simd.h b/ext/bcmath/libbcmath/src/simd.h deleted file mode 100644 index af38f8349618c..0000000000000 --- a/ext/bcmath/libbcmath/src/simd.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - +----------------------------------------------------------------------+ - | Copyright (c) The PHP Group | - +----------------------------------------------------------------------+ - | This source file is subject to version 3.01 of the PHP license, | - | that is bundled with this package in the file LICENSE, and is | - | available through the world-wide-web at the following url: | - | https://www.php.net/license/3_01.txt | - | If you did not receive a copy of the PHP license and are unable to | - | obtain it through the world-wide-web, please send a note to | - | license@php.net so we can mail you a copy immediately. | - +----------------------------------------------------------------------+ - | Authors: Saki Takamachi | - +----------------------------------------------------------------------+ -*/ - - -#ifndef _BCMATH_SIMD_H_ -#define _BCMATH_SIMD_H_ - -#ifdef __SSE2__ -# include - typedef __m128i bc_simd_128_t; -# define HAVE_BC_SIMD_128 -# define bc_simd_set_8x16(x) _mm_set1_epi8(x) -# define bc_simd_load_8x16(ptr) _mm_loadu_si128((const __m128i *) (ptr)) -# define bc_simd_xor_8x16(a, b) _mm_xor_si128(a, b) -# define bc_simd_store_8x16(ptr, val) _mm_storeu_si128((__m128i *) (ptr), val) -# define bc_simd_add_8x16(a, b) _mm_add_epi8(a, b) -# define bc_simd_cmpeq_8x16(a, b) _mm_cmpeq_epi8(a, b) -# define bc_simd_cmplt_8x16(a, b) _mm_cmplt_epi8(a, b) -# define bc_simd_movemask_8x16(a) _mm_movemask_epi8(a) - -#elif defined(__aarch64__) || defined(_M_ARM64) -# include - typedef int8x16_t bc_simd_128_t; -# define HAVE_BC_SIMD_128 -# define bc_simd_set_8x16(x) vdupq_n_s8(x) -# define bc_simd_load_8x16(ptr) vld1q_s8((const int8_t *) (ptr)) -# define bc_simd_xor_8x16(a, b) veorq_s8(a, b) -# define bc_simd_store_8x16(ptr, val) vst1q_s8((int8_t *) (ptr), val) -# define bc_simd_add_8x16(a, b) vaddq_s8(a, b) -# define bc_simd_cmpeq_8x16(a, b) (vreinterpretq_s8_u8(vceqq_s8(a, b))) -# define bc_simd_cmplt_8x16(a, b) (vreinterpretq_s8_u8(vcltq_s8(a, b))) - static inline int bc_simd_movemask_8x16(int8x16_t vec) - { - /** - * based on code from - * https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon - */ - uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(vreinterpretq_u8_s8(vec), 7)); - uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); - uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); - uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); - return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); - } -#endif - -#endif diff --git a/ext/bcmath/libbcmath/src/str2num.c b/ext/bcmath/libbcmath/src/str2num.c index 945de0cf60003..1e1be35f066f1 100644 --- a/ext/bcmath/libbcmath/src/str2num.c +++ b/ext/bcmath/libbcmath/src/str2num.c @@ -32,7 +32,7 @@ #include "bcmath.h" #include "convert.h" #include "private.h" -#include "simd.h" +#include "zend_simd.h" #include #include @@ -40,20 +40,20 @@ static inline const char *bc_count_digits(const char *str, const char *end) { /* Process in bulk */ -#ifdef HAVE_BC_SIMD_128 - const bc_simd_128_t offset = bc_simd_set_8x16((signed char) (SCHAR_MIN - '0')); +#ifdef ZEND_HAVE_VECTOR_128 + const __m128i offset = _mm_set1_epi8((signed char) (SCHAR_MIN - '0')); /* we use the less than comparator, so add 1 */ - const bc_simd_128_t threshold = bc_simd_set_8x16(SCHAR_MIN + ('9' + 1 - '0')); + const __m128i threshold = _mm_set1_epi8(SCHAR_MIN + ('9' + 1 - '0')); - while (str + sizeof(bc_simd_128_t) <= end) { - bc_simd_128_t bytes = bc_simd_load_8x16((const bc_simd_128_t *) str); + while (str + sizeof(__m128i) <= end) { + __m128i bytes = _mm_loadu_si128((const __m128i *) str); /* Wrapping-add the offset to the bytes, such that all bytes below '0' are positive and others are negative. * More specifically, '0' will be -128 and '9' will be -119. */ - bytes = bc_simd_add_8x16(bytes, offset); + bytes = _mm_add_epi8(bytes, offset); /* Now mark all bytes that are <= '9', i.e. <= -119, i.e. < -118, i.e. the threshold. */ - bytes = bc_simd_cmplt_8x16(bytes, threshold); + bytes = _mm_cmplt_epi8(bytes, threshold); - int mask = bc_simd_movemask_8x16(bytes); + int mask = _mm_movemask_epi8(bytes); if (mask != 0xffff) { /* At least one of the bytes is not within range. Move to the first offending byte. */ #ifdef PHP_HAVE_BUILTIN_CTZL @@ -63,7 +63,7 @@ static inline const char *bc_count_digits(const char *str, const char *end) #endif } - str += sizeof(bc_simd_128_t); + str += sizeof(__m128i); } #endif @@ -77,19 +77,19 @@ static inline const char *bc_count_digits(const char *str, const char *end) static inline const char *bc_skip_zero_reverse(const char *scanner, const char *stop) { /* Check in bulk */ -#ifdef HAVE_BC_SIMD_128 - const bc_simd_128_t c_zero_repeat = bc_simd_set_8x16('0'); - while (scanner - sizeof(bc_simd_128_t) >= stop) { - scanner -= sizeof(bc_simd_128_t); - bc_simd_128_t bytes = bc_simd_load_8x16((const bc_simd_128_t *) scanner); +#ifdef ZEND_HAVE_VECTOR_128 + const __m128i c_zero_repeat = _mm_set1_epi8('0'); + while (scanner - sizeof(__m128i) >= stop) { + scanner -= sizeof(__m128i); + __m128i bytes = _mm_loadu_si128((const __m128i *) scanner); /* Checks if all numeric strings are equal to '0'. */ - bytes = bc_simd_cmpeq_8x16(bytes, c_zero_repeat); + bytes = _mm_cmpeq_epi8(bytes, c_zero_repeat); - int mask = bc_simd_movemask_8x16(bytes); + int mask = _mm_movemask_epi8(bytes); /* The probability of having 16 trailing 0s in a row is very low, so we use EXPECTED. */ if (EXPECTED(mask != 0xffff)) { /* Move the pointer back and check each character in loop. */ - scanner += sizeof(bc_simd_128_t); + scanner += sizeof(__m128i); break; } } From 4f0fc896c0b5dd53a9e56f11147c35c8ad8a0a1c Mon Sep 17 00:00:00 2001 From: SakiTakamachi Date: Tue, 29 Apr 2025 10:33:19 +0900 Subject: [PATCH 07/10] Changed argument type from `int8x16_t` to `__m128i` for type hinting. --- Zend/zend_simd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Zend/zend_simd.h b/Zend/zend_simd.h index 71ec02a52a064..0739cdb4204e7 100644 --- a/Zend/zend_simd.h +++ b/Zend/zend_simd.h @@ -56,7 +56,7 @@ typedef int8x16_t __m128i; #define _mm_cmplt_epi8(a, b) (vreinterpretq_s8_u8(vcltq_s8(a, b))) #define _mm_cmpgt_epi8(a, b) (vreinterpretq_s8_u8(vcgtq_s8(a, b))) -static zend_always_inline int _mm_movemask_epi8(int8x16_t x) +static zend_always_inline int _mm_movemask_epi8(__m128i x) { /** * based on code from From d00d75d8e33d019c27ecff35f0db1aee9d87999d Mon Sep 17 00:00:00 2001 From: SakiTakamachi Date: Tue, 29 Apr 2025 11:03:48 +0900 Subject: [PATCH 08/10] fixed `_mm_set_epi64` to `_mm_set_epi64x` --- Zend/zend_simd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Zend/zend_simd.h b/Zend/zend_simd.h index 0739cdb4204e7..61cdf1dbca36e 100644 --- a/Zend/zend_simd.h +++ b/Zend/zend_simd.h @@ -38,7 +38,7 @@ typedef int8x16_t __m128i; (int16_t) (x3), (int16_t) (x2), (int16_t) (x1), (int16_t) (x0) }) #define _mm_set_epi32(x0, x1, x2, x3) \ vreinterpretq_s8_s32((int32x4_t) { (int32_t) (x3), (int32_t) (x2), (int32_t) (x1), (int32_t) (x0) }) -#define _mm_set_epi64(x0, x1) vreinterpretq_s8_s64((int64x2_t) { (int64_t) (x1), (int64_t) (x0) }) +#define _mm_set_epi64x(x0, x1) vreinterpretq_s8_s64((int64x2_t) { (int64_t) (x1), (int64_t) (x0) }) #define _mm_load_si128(x) vld1q_s8((const int8_t *) (x)) #define _mm_loadu_si128(x) _mm_load_si128(x) #define _mm_store_si128(to, x) vst1q_s8((int8_t *) (to), x) From 22c505a47860e3045dd2a742449a48dba9ea96a9 Mon Sep 17 00:00:00 2001 From: SakiTakamachi Date: Tue, 29 Apr 2025 22:58:19 +0900 Subject: [PATCH 09/10] fixed `_mm_srli_si128` and `_mm_slli_si128` --- Zend/zend_simd.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Zend/zend_simd.h b/Zend/zend_simd.h index 61cdf1dbca36e..76971ae439cf5 100644 --- a/Zend/zend_simd.h +++ b/Zend/zend_simd.h @@ -47,8 +47,13 @@ typedef int8x16_t __m128i; #define _mm_or_si128(a, b) vorrq_s8(a, b) #define _mm_xor_si128(a, b) veorq_s8(a, b) #define _mm_and_si128(a, b) vandq_s8(a, b) -#define _mm_srli_si128(x, bytes) vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), bytes)) -#define _mm_slli_si128(x, bytes) vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 16 - bytes)) + +#define _mm_slli_si128(x, imm) \ + ((imm) >= 16 ? vdupq_n_s8(0) : \ + vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), 16 - (imm)))) +#define _mm_srli_si128(x, imm) \ + ((imm) >= 16 ? vdupq_n_s8(0) : \ + vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), (imm)))) #define _mm_add_epi8(a, b) vaddq_s8(a, b) From 0f73ba4329dcdd0798d8f5bc390cf723b9fe89d6 Mon Sep 17 00:00:00 2001 From: SakiTakamachi Date: Tue, 29 Apr 2025 23:11:28 +0900 Subject: [PATCH 10/10] fixed `_mm_add_epi8` --- Zend/zend_simd.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Zend/zend_simd.h b/Zend/zend_simd.h index 76971ae439cf5..9ce5e073cc204 100644 --- a/Zend/zend_simd.h +++ b/Zend/zend_simd.h @@ -55,7 +55,11 @@ typedef int8x16_t __m128i; ((imm) >= 16 ? vdupq_n_s8(0) : \ vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), (imm)))) -#define _mm_add_epi8(a, b) vaddq_s8(a, b) +/** + * In practice, there is no problem, but a runtime error for signed integer overflow is triggered by UBSAN, + * so perform the calculation as unsigned. Since it is optimized at compile time, there are no unnecessary casts at runtime. + */ +#define _mm_add_epi8(a, b) vreinterpretq_s8_u8(vaddq_u8(vreinterpretq_u8_s8(a), vreinterpretq_u8_s8(b))) #define _mm_cmpeq_epi8(a, b) (vreinterpretq_s8_u8(vceqq_s8(a, b))) #define _mm_cmplt_epi8(a, b) (vreinterpretq_s8_u8(vcltq_s8(a, b)))