diff --git a/clang/lib/Headers/aiev2_upd_ext.h b/clang/lib/Headers/aiev2_upd_ext.h
index 610083b07152..dacaac2feb7b 100644
--- a/clang/lib/Headers/aiev2_upd_ext.h
+++ b/clang/lib/Headers/aiev2_upd_ext.h
@@ -1,35 +1,35 @@
-//===- aiev2_upd_ext.h ------------------------------------------*- C++ -*-===//
-//
+//===-------------------- AIEngine AIE2 intrinsics ------------------------===
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates
+// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
 //
 //===----------------------------------------------------------------------===//
+//* Automatically generated file, do not edit! *
+//
+
 #ifndef __AIEV2_UPD_EXT_H__
 #define __AIEV2_UPD_EXT_H__
 
-inline int get_idx(int idx, unsigned int elems) { return (idx & (elems - 1)); }
-
 // Small vector datatypes
 inline unsigned int set_w32(int idx, unsigned int val, unsigned int elems,
                             int step, unsigned int elem_mask) {
-  idx = get_idx(idx, elems);
+  idx = idx & (elems - 1);
   return ((val & elem_mask) << (idx * step));
 }
 
 inline mask64 set_w64(int idx, unsigned int val, unsigned int elems, int step,
                       unsigned int elem_mask) {
-  idx = get_idx(idx, elems);
+  idx = idx & (elems - 1);
   return (v2uint32)(((unsigned long long)(val & elem_mask)) << (idx * step));
 }
 
 inline unsigned int upd_w32(unsigned int a, int idx, unsigned int val,
                             unsigned int elems, int step,
                             unsigned int elem_mask) {
-  idx = get_idx(idx, elems);
+  idx = idx & (elems - 1);
   unsigned int mask = unsigned(elem_mask) << (idx * step);
   return (a & ~mask) | ((val & elem_mask) << (idx * step));
 }
@@ -37,36 +37,36 @@ inline unsigned int upd_w32(unsigned int a, int idx, unsigned int val,
 inline mask64 upd_w64(mask64 a_, int idx, unsigned int val, unsigned int elems,
                       int step, unsigned int elem_mask) {
   unsigned long long a = (unsigned long long)(a_);
-  idx = get_idx(idx, elems);
+  idx = idx & (elems - 1);
   unsigned long long mask = ((unsigned long long)(elem_mask)) << (idx * step);
   return (v2uint32)((a & ~mask) |
                     (((unsigned long long)(val & elem_mask)) << (idx * step)));
 }
 
 inline int ext_w32(int a, int idx, unsigned int elems, int step,
                    unsigned int elem_mask) {
-  idx = get_idx(idx, elems);
+  idx = idx & (elems - 1);
   return ((a << (32 - (idx + 1) * step)) >> (32 - step));
 }
 
 inline int ext_w64(mask64 a_, int idx, unsigned int elems, int step,
                    unsigned int elem_mask) {
-  long long a = (long long)(a_);
-  idx = get_idx(idx, elems);
+  long long a = (unsigned long long)(a_);
+  idx = idx & (elems - 1);
   return ((v2int32)((a >> (idx * step)) & elem_mask))[0];
 }
 
 inline unsigned int ext_u32(unsigned int a, int idx, unsigned int elems,
                             int step, unsigned int elem_mask) {
-  idx = get_idx(idx, elems);
+  idx = idx & (elems - 1);
   return (a >> (idx * step)) & elem_mask;
 }
 
 inline mask64 ext_u64(mask64 a_, int idx, unsigned int elems, int step,
                       unsigned int elem_mask) {
   unsigned long long a = (unsigned long long)(a_);
-  idx = get_idx(idx, elems);
-  return (v2uint32)((a >> (idx * step)) & elem_mask);
+  idx = idx & (elems - 1);
+  return ((v2int32)((a >> (idx * step)) & elem_mask));
 }
 
 inline unsigned int set_v2w4(int idx, unsigned int val) {
@@ -177,7 +177,7 @@ inline unsigned int ext_v8u4(unsigned int a, int idx) {
   return ext_u32(a, idx, 8, 4, 0xf);
 }
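A minimal usage sketch of the sub-word helpers above (editor's illustration, not part of the generated header or of this patch; `subword_helpers_example` is a hypothetical function and assumes this header is already included):

// Worked example for 4 byte-wide lanes packed in a 32-bit word
// (elems = 4, step = 8, elem_mask = 0xff).
inline void subword_helpers_example() {
  // upd_w32 replaces lane `idx` and leaves the other lanes untouched:
  // lane 2 of 0xAABBCCDD becomes 0x11, giving 0xAA11CCDD.
  unsigned int u = upd_w32(0xAABBCCDDu, /*idx=*/2, /*val=*/0x11u,
                           /*elems=*/4, /*step=*/8, /*elem_mask=*/0xffu);
  // ext_w32 is the signed extract: lane 3 of 0xABCD1234 holds 0xAB,
  // which sign-extends to 0xFFFFFFAB.
  int s = ext_w32((int)0xABCD1234u, /*idx=*/3, /*elems=*/4, /*step=*/8,
                  /*elem_mask=*/0xff);
  // ext_u32 is the unsigned extract: the same lane zero-extends to 0xAB.
  unsigned int z = ext_u32(0xABCD1234u, /*idx=*/3, /*elems=*/4, /*step=*/8,
                           /*elem_mask=*/0xffu);
  (void)u;
  (void)s;
  (void)z;
}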
inline unsigned int ext_v16u4(mask64 a, int idx) { - return (ext_u64(a, idx, 16, 4, 0xf))[0]; + return ext_u64(a, idx, 16, 4, 0xf)[0]; } inline unsigned int ext_v2u8(unsigned int a, int idx) { @@ -187,20 +187,22 @@ inline unsigned int ext_v4u8(unsigned int a, int idx) { return ext_u32(a, idx, 4, 8, 0xff); } inline unsigned int ext_v8u8(mask64 a, int idx) { - return (ext_u64(a, idx, 8, 8, 0xff))[0]; + return ext_u64(a, idx, 8, 8, 0xff)[0]; } inline unsigned int ext_v2u16(unsigned int a, int idx) { return ext_u32(a, idx, 2, 16, 0xffff); } inline unsigned int ext_v4u16(mask64 a, int idx) { - return (ext_u64(a, idx, 4, 16, 0xffff))[0]; + return ext_u64(a, idx, 4, 16, 0xffff)[0]; } inline unsigned int ext_v2u32(mask64 a, int idx) { - return (ext_u64(a, idx, 2, 32, 0xffffffff))[0]; + return ext_u64(a, idx, 2, 32, 0xffffffff)[0]; } +// Vector datatypes + // Scalar updates and extracts INTRINSIC(unsigned long long) insert(unsigned long long a, int idx, unsigned int b) { @@ -225,2636 +227,1855 @@ INTRINSIC(unsigned long long) concat(unsigned int a, unsigned int b) { return insert(set_u64(a, 0), 1, b); } -// Extract 256-bit portion from 512-bit register -INTRINSIC(v64uint4) extract_v64uint4(v128uint4 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I256_I512(a, 0); - else - return __builtin_aiev2_ext_I256_I512(a, 1); +// Generic extract primitives + +inline v8int32 extract_256_512(v16int32 a, int idx) { + if (idx % 2 == 0) { + return __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7); + } else { + return __builtin_shufflevector(a, a, 8, 9, 10, 11, 12, 13, 14, 15); + } } -// Insert 256-bit in 512-bit register -INTRINSIC(v128uint4) insert(v128uint4 a, int idx, v64uint4 b) { - if (idx == 0) - return __builtin_aiev2_upd_I512_I256(a, b, 0); - else - return __builtin_aiev2_upd_I512_I256(a, b, 1); +inline v16int32 extract_512_1024(v32int32 a, int idx) { + if (idx % 2 == 0) { + return __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15); + } else { + return __builtin_shufflevector(a, a, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31); + } } -// Set 256-bit portion of 512-bit register -INTRINSIC(v128uint4) set_v128uint4(int idx, v64uint4 b) { - if (idx == 0) - return __builtin_aiev2_set_I512_I256(b, 0); - else - return __builtin_aiev2_set_I512_I256(b, 1); +inline v8int32 extract_256_1024(v32int32 a, int idx) { + if (idx % 4 == 0) { + return __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7); + } + if (idx % 4 == 1) { + return __builtin_shufflevector(a, a, 8, 9, 10, 11, 12, 13, 14, 15); + } + if (idx % 4 == 2) { + return __builtin_shufflevector(a, a, 16, 17, 18, 19, 20, 21, 22, 23); + } else { + return __builtin_shufflevector(a, a, 24, 25, 26, 27, 28, 29, 30, 31); + } +} + +// Generic insert primitives + +inline v16int32 insert_256_512(v16int32 a, int idx, v8int32 b) { + v8int32 undef_256; + + v16int32 tmp_512; + + tmp_512 = __builtin_shufflevector(b, undef_256, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15); + + if (idx % 2 == 0) { + return __builtin_shufflevector(tmp_512, a, 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, + 26, 27, 28, 29, 30, 31); + } else { + return __builtin_shufflevector(tmp_512, a, 16, 17, 18, 19, 20, 21, 22, 23, + 0, 1, 2, 3, 4, 5, 6, 7); + } +} + +inline v32int32 insert_512_1024(v32int32 a, int idx, v16int32 b) { + v16int32 undef_512; + + v32int32 tmp_1024; + + tmp_1024 = __builtin_shufflevector( + b, undef_512, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 
30, 31); + + if (idx % 2 == 0) { + return __builtin_shufflevector(tmp_1024, a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 48, 49, 50, 51, 52, + 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + } else { + return __builtin_shufflevector(tmp_1024, a, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 0, 1, 2, 3, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + } +} + +inline v32int32 insert_256_1024(v32int32 a, int idx, v8int32 b) { + v8int32 undef_256; + v16int32 undef_512; + + v16int32 tmp_512; + v32int32 tmp_1024; + + tmp_512 = __builtin_shufflevector(b, undef_256, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15); + tmp_1024 = __builtin_shufflevector( + tmp_512, undef_512, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + + if (idx % 4 == 0) { + return __builtin_shufflevector(tmp_1024, a, 0, 1, 2, 3, 4, 5, 6, 7, 40, 41, + 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, + 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + } + if (idx % 4 == 1) { + return __builtin_shufflevector(tmp_1024, a, 32, 33, 34, 35, 36, 37, 38, 39, + 0, 1, 2, 3, 4, 5, 6, 7, 48, 49, 50, 51, 52, + 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); + } + if (idx % 4 == 2) { + return __builtin_shufflevector(tmp_1024, a, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 0, 1, 2, 3, + 4, 5, 6, 7, 56, 57, 58, 59, 60, 61, 62, 63); + } else { + return __builtin_shufflevector(tmp_1024, a, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 0, 1, 2, 3, 4, 5, 6, 7); + } +} + +// Generic set primitives + +inline v16int32 set_256_512(int idx, v8int32 b) { + v8int32 tmp0; + if (idx % 2 == 0) { + return __builtin_shufflevector(b, tmp0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15); + } else { + + return __builtin_shufflevector(tmp0, b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15); + } +} + +inline v32int32 set_512_1024(int idx, v16int32 b) { + v16int32 tmp0; + if (idx % 2 == 0) { + return __builtin_shufflevector(b, tmp0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + } else { + + return __builtin_shufflevector(tmp0, b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + } +} + +inline v32int32 set_256_1024(int idx, v8int32 b) { + v8int32 tmp0; + v16int32 tmp1; + v16int32 undef1; + if (idx % 4 == 0) { + tmp1 = __builtin_shufflevector(b, tmp0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15); + return __builtin_shufflevector(tmp1, undef1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + } + if (idx % 4 == 1) { + tmp1 = __builtin_shufflevector(tmp0, b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15); + return __builtin_shufflevector(tmp1, undef1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + } + if (idx % 4 == 2) { + tmp1 = __builtin_shufflevector(b, tmp0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15); + return __builtin_shufflevector(undef1, tmp1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + } else { + + tmp1 = __builtin_shufflevector(tmp0, b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15); + return __builtin_shufflevector(undef1, tmp1, 0, 1, 2, 3, 4, 
5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + } +} + +// Generic concat primitives + +inline v16int32 concat_256_512(v8int32 a0, v8int32 a1) { + + return __builtin_shufflevector(a0, a1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15); +} + +inline v32int32 concat_512_1024(v16int32 a0, v16int32 a1) { + + return __builtin_shufflevector(a0, a1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31); +} + +inline v32int32 concat_256_1024(v8int32 a0, v8int32 a1, v8int32 a2, + v8int32 a3) { + v16int32 tmp0; + v16int32 tmp1; + + tmp0 = __builtin_shufflevector(a0, a1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15); + tmp1 = __builtin_shufflevector(a2, a3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15); + + return __builtin_shufflevector(tmp0, tmp1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, 31); +} + +// Conversions + +// v128uint4 + +//! @name Extract 256-bit portion from 512-bit register +inline INTRINSIC(v64uint4) extract_v64uint4(v128uint4 a, int idx) { + return extract_256_512(a, idx); +} + +//! @name Insert 256-bit in 512-bit register +inline INTRINSIC(v128uint4) insert(v128uint4 a, int idx, v64uint4 b) { + return insert_256_512(a, idx, b); +} + +//! @name Set 256-bit portion of 512-bit register +inline INTRINSIC(v128uint4) set_v128uint4(int idx, v64uint4 b) { + return set_256_512(idx, b); } INTRINSIC(v128uint4) concat(v64uint4 a0, v64uint4 a1) { - return __builtin_aiev2_concat_I512_I256(a0, a1); + return concat_256_512(a0, a1); } -// Extract 256-bit portion from 512-bit register -INTRINSIC(v64int4) extract_v64int4(v128int4 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I256_I512(a, 0); - else - return __builtin_aiev2_ext_I256_I512(a, 1); +// v128int4 + +//! @name Extract 256-bit portion from 512-bit register +inline INTRINSIC(v64int4) extract_v64int4(v128int4 a, int idx) { + return extract_256_512(a, idx); } -// Insert 256-bit in 512-bit register -INTRINSIC(v128int4) insert(v128int4 a, int idx, v64int4 b) { - if (idx == 0) - return __builtin_aiev2_upd_I512_I256(a, b, 0); - else - return __builtin_aiev2_upd_I512_I256(a, b, 1); +//! @name Insert 256-bit in 512-bit register +inline INTRINSIC(v128int4) insert(v128int4 a, int idx, v64int4 b) { + return insert_256_512(a, idx, b); } -// Set 256-bit portion of 512-bit register -INTRINSIC(v128int4) set_v128int4(int idx, v64int4 b) { - if (idx == 0) - return __builtin_aiev2_set_I512_I256(b, 0); - else - return __builtin_aiev2_set_I512_I256(b, 1); +//! @name Set 256-bit portion of 512-bit register +inline INTRINSIC(v128int4) set_v128int4(int idx, v64int4 b) { + return set_256_512(idx, b); } INTRINSIC(v128int4) concat(v64int4 a0, v64int4 a1) { - return __builtin_aiev2_concat_I512_I256(a0, a1); + return concat_256_512(a0, a1); } -// Extract 256-bit portion from 512-bit register -INTRINSIC(v32uint8) extract_v32uint8(v64uint8 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I256_I512(a, 0); - else - return __builtin_aiev2_ext_I256_I512(a, 1); +// v64uint8 + +//! 
@name Extract 256-bit portion from 512-bit register +inline INTRINSIC(v32uint8) extract_v32uint8(v64uint8 a, int idx) { + return extract_256_512(a, idx); } -// Insert 256-bit in 512-bit register -INTRINSIC(v64uint8) insert(v64uint8 a, int idx, v32uint8 b) { - if (idx == 0) - return __builtin_aiev2_upd_I512_I256(a, b, 0); - else - return __builtin_aiev2_upd_I512_I256(a, b, 1); +//! @name Insert 256-bit in 512-bit register +inline INTRINSIC(v64uint8) insert(v64uint8 a, int idx, v32uint8 b) { + return insert_256_512(a, idx, b); } -// Set 256-bit portion of 512-bit register -INTRINSIC(v64uint8) set_v64uint8(int idx, v32uint8 b) { - if (idx == 0) - return __builtin_aiev2_set_I512_I256(b, 0); - else - return __builtin_aiev2_set_I512_I256(b, 1); +//! @name Set 256-bit portion of 512-bit register +inline INTRINSIC(v64uint8) set_v64uint8(int idx, v32uint8 b) { + return set_256_512(idx, b); } INTRINSIC(v64uint8) concat(v32uint8 a0, v32uint8 a1) { - return __builtin_aiev2_concat_I512_I256(a0, a1); + return concat_256_512(a0, a1); } -// Extract 256-bit portion from 512-bit register -INTRINSIC(v32int8) extract_v32int8(v64int8 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I256_I512(a, 0); - else - return __builtin_aiev2_ext_I256_I512(a, 1); +// v64int8 + +//! @name Extract 256-bit portion from 512-bit register +inline INTRINSIC(v32int8) extract_v32int8(v64int8 a, int idx) { + return extract_256_512(a, idx); } -// Insert 256-bit in 512-bit register -INTRINSIC(v64int8) insert(v64int8 a, int idx, v32int8 b) { - if (idx == 0) - return __builtin_aiev2_upd_I512_I256(a, b, 0); - else - return __builtin_aiev2_upd_I512_I256(a, b, 1); +//! @name Insert 256-bit in 512-bit register +inline INTRINSIC(v64int8) insert(v64int8 a, int idx, v32int8 b) { + return insert_256_512(a, idx, b); } -// Set 256-bit portion of 512-bit register -INTRINSIC(v64int8) set_v64int8(int idx, v32int8 b) { - if (idx == 0) - return __builtin_aiev2_set_I512_I256(b, 0); - else - return __builtin_aiev2_set_I512_I256(b, 1); +//! @name Set 256-bit portion of 512-bit register +inline INTRINSIC(v64int8) set_v64int8(int idx, v32int8 b) { + return set_256_512(idx, b); } INTRINSIC(v64int8) concat(v32int8 a0, v32int8 a1) { - return __builtin_aiev2_concat_I512_I256(a0, a1); + return concat_256_512(a0, a1); } -// Extract 256-bit portion from 512-bit register -INTRINSIC(v16uint16) extract_v16uint16(v32uint16 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I256_I512(a, 0); - else - return __builtin_aiev2_ext_I256_I512(a, 1); +// v32uint16 + +//! @name Extract 256-bit portion from 512-bit register +inline INTRINSIC(v16uint16) extract_v16uint16(v32uint16 a, int idx) { + return extract_256_512(a, idx); } -// Insert 256-bit in 512-bit register -INTRINSIC(v32uint16) insert(v32uint16 a, int idx, v16uint16 b) { - if (idx == 0) - return __builtin_aiev2_upd_I512_I256(a, b, 0); - else - return __builtin_aiev2_upd_I512_I256(a, b, 1); +//! @name Insert 256-bit in 512-bit register +inline INTRINSIC(v32uint16) insert(v32uint16 a, int idx, v16uint16 b) { + return insert_256_512(a, idx, b); } -// Set 256-bit portion of 512-bit register -INTRINSIC(v32uint16) set_v32uint16(int idx, v16uint16 b) { - if (idx == 0) - return __builtin_aiev2_set_I512_I256(b, 0); - else - return __builtin_aiev2_set_I512_I256(b, 1); +//! 
@name Set 256-bit portion of 512-bit register +inline INTRINSIC(v32uint16) set_v32uint16(int idx, v16uint16 b) { + return set_256_512(idx, b); } INTRINSIC(v32uint16) concat(v16uint16 a0, v16uint16 a1) { - return __builtin_aiev2_concat_I512_I256(a0, a1); + return concat_256_512(a0, a1); } -// Extract 256-bit portion from 512-bit register -INTRINSIC(v16int16) extract_v16int16(v32int16 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I256_I512(a, 0); - else - return __builtin_aiev2_ext_I256_I512(a, 1); +// v32int16 + +//! @name Extract 256-bit portion from 512-bit register +inline INTRINSIC(v16int16) extract_v16int16(v32int16 a, int idx) { + return extract_256_512(a, idx); } -// Insert 256-bit in 512-bit register -INTRINSIC(v32int16) insert(v32int16 a, int idx, v16int16 b) { - if (idx == 0) - return __builtin_aiev2_upd_I512_I256(a, b, 0); - else - return __builtin_aiev2_upd_I512_I256(a, b, 1); +//! @name Insert 256-bit in 512-bit register +inline INTRINSIC(v32int16) insert(v32int16 a, int idx, v16int16 b) { + return insert_256_512(a, idx, b); } -// Set 256-bit portion of 512-bit register -INTRINSIC(v32int16) set_v32int16(int idx, v16int16 b) { - if (idx == 0) - return __builtin_aiev2_set_I512_I256(b, 0); - else - return __builtin_aiev2_set_I512_I256(b, 1); +//! @name Set 256-bit portion of 512-bit register +inline INTRINSIC(v32int16) set_v32int16(int idx, v16int16 b) { + return set_256_512(idx, b); } INTRINSIC(v32int16) concat(v16int16 a0, v16int16 a1) { - return __builtin_aiev2_concat_I512_I256(a0, a1); + return concat_256_512(a0, a1); } +// v16cint16 + #if 0 -// Extract 256-bit portion from 512-bit register -INTRINSIC(v8cint16) extract_v8cint16(v16cint16 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I256_I512(a, 0); - else - return __builtin_aiev2_ext_I256_I512(a, 1); + +//! @name Extract 256-bit portion from 512-bit register +inline INTRINSIC(v8cint16) extract_v8cint16 (v16cint16 a, int idx) +{ + return extract_256_512(a, idx); } -// Insert 256-bit in 512-bit register -INTRINSIC(v16cint16) insert(v16cint16 a, int idx, v8cint16 b) { - if (idx == 0) - return __builtin_aiev2_upd_I512_I256(a, b, 0); - else - return __builtin_aiev2_upd_I512_I256(a, b, 1); +//! @name Insert 256-bit in 512-bit register +inline INTRINSIC(v16cint16) insert (v16cint16 a, int idx, v8cint16 b) +{ + return insert_256_512(a, idx, b); } -// Set 256-bit portion of 512-bit register -INTRINSIC(v16cint16) set_v16cint16(int idx, v8cint16 b) { - if (idx == 0) - return __builtin_aiev2_set_I512_I256(b, 0); - else - return __builtin_aiev2_set_I512_I256(b, 1); +//! @name Set 256-bit portion of 512-bit register +inline INTRINSIC(v16cint16) set_v16cint16 (int idx, v8cint16 b) +{ + return set_256_512(idx, b); } -INTRINSIC(v16cint16) concat(v8cint16 a0, v8cint16 a1) { - return __builtin_aiev2_concat_I512_I256(a0, a1); +INTRINSIC(v16cint16) concat (v8cint16 a0, v8cint16 a1) +{ + return concat_256_512(a0, a1); } + #endif -// Extract 256-bit portion from 512-bit register -INTRINSIC(v8uint32) extract_v8uint32(v16uint32 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I256_I512(a, 0); - else - return __builtin_aiev2_ext_I256_I512(a, 1); +// v16uint32 + +//! 
@name Extract 256-bit portion from 512-bit register +inline INTRINSIC(v8uint32) extract_v8uint32(v16uint32 a, int idx) { + return extract_256_512(a, idx); } -// Insert 256-bit in 512-bit register -INTRINSIC(v16uint32) insert(v16uint32 a, int idx, v8uint32 b) { - if (idx == 0) - return __builtin_aiev2_upd_I512_I256(a, b, 0); - else - return __builtin_aiev2_upd_I512_I256(a, b, 1); +//! @name Insert 256-bit in 512-bit register +inline INTRINSIC(v16uint32) insert(v16uint32 a, int idx, v8uint32 b) { + return insert_256_512(a, idx, b); } -// Set 256-bit portion of 512-bit register -INTRINSIC(v16uint32) set_v16uint32(int idx, v8uint32 b) { - if (idx == 0) - return __builtin_aiev2_set_I512_I256(b, 0); - else - return __builtin_aiev2_set_I512_I256(b, 1); +//! @name Set 256-bit portion of 512-bit register +inline INTRINSIC(v16uint32) set_v16uint32(int idx, v8uint32 b) { + return set_256_512(idx, b); } INTRINSIC(v16uint32) concat(v8uint32 a0, v8uint32 a1) { - return __builtin_aiev2_concat_I512_I256(a0, a1); + return concat_256_512(a0, a1); } -// Extract 256-bit portion from 512-bit register -INTRINSIC(v8int32) extract_v8int32(v16int32 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I256_I512(a, 0); - else - return __builtin_aiev2_ext_I256_I512(a, 1); +// v16int32 + +//! @name Extract 256-bit portion from 512-bit register +inline INTRINSIC(v8int32) extract_v8int32(v16int32 a, int idx) { + return extract_256_512(a, idx); } -// Insert 256-bit in 512-bit register -INTRINSIC(v16int32) insert(v16int32 a, int idx, v8int32 b) { - if (idx == 0) - return __builtin_aiev2_upd_I512_I256(a, b, 0); - else - return __builtin_aiev2_upd_I512_I256(a, b, 1); +//! @name Insert 256-bit in 512-bit register +inline INTRINSIC(v16int32) insert(v16int32 a, int idx, v8int32 b) { + return insert_256_512(a, idx, b); } -// Set 256-bit portion of 512-bit register -INTRINSIC(v16int32) set_v16int32(int idx, v8int32 b) { - if (idx == 0) - return __builtin_aiev2_set_I512_I256(b, 0); - else - return __builtin_aiev2_set_I512_I256(b, 1); +//! @name Set 256-bit portion of 512-bit register +inline INTRINSIC(v16int32) set_v16int32(int idx, v8int32 b) { + return set_256_512(idx, b); } INTRINSIC(v16int32) concat(v8int32 a0, v8int32 a1) { - return __builtin_aiev2_concat_I512_I256(a0, a1); + return concat_256_512(a0, a1); } +// v8cint32 + #if 0 -// Extract 256-bit portion from 512-bit register -INTRINSIC(v4cint32) extract_v4cint32(v8cint32 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I256_I512(a, 0); - else - return __builtin_aiev2_ext_I256_I512(a, 1); + +//! @name Extract 256-bit portion from 512-bit register +inline INTRINSIC(v4cint32) extract_v4cint32 (v8cint32 a, int idx) +{ + return extract_256_512(a, idx); } -// Insert 256-bit in 512-bit register -INTRINSIC(v8cint32) insert(v8cint32 a, int idx, v4cint32 b) { - if (idx == 0) - return __builtin_aiev2_upd_I512_I256(a, b, 0); - else - return __builtin_aiev2_upd_I512_I256(a, b, 1); +//! @name Insert 256-bit in 512-bit register +inline INTRINSIC(v8cint32) insert (v8cint32 a, int idx, v4cint32 b) +{ + return insert_256_512(a, idx, b); } -// Set 256-bit portion of 512-bit register -INTRINSIC(v8cint32) set_v8cint32(int idx, v4cint32 b) { - if (idx == 0) - return __builtin_aiev2_set_I512_I256(b, 0); - else - return __builtin_aiev2_set_I512_I256(b, 1); +//! 
@name Set 256-bit portion of 512-bit register +inline INTRINSIC(v8cint32) set_v8cint32 (int idx, v4cint32 b) +{ + return set_256_512(idx, b); } -INTRINSIC(v8cint32) concat(v4cint32 a0, v4cint32 a1) { - return __builtin_aiev2_concat_I512_I256(a0, a1); +INTRINSIC(v8cint32) concat (v4cint32 a0, v4cint32 a1) +{ + return concat_256_512(a0, a1); } + #endif -// Extract 256-bit portion from 512-bit register -INTRINSIC(v16bfloat16) extract_v16bfloat16(v32bfloat16 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_bf256_bf512(a, 0); - else - return __builtin_aiev2_ext_bf256_bf512(a, 1); + +// v32bfloat16 + +//! @name Extract 256-bit portion from 512-bit register +inline INTRINSIC(v16bfloat16) extract_v16bfloat16(v32bfloat16 a, int idx) { + return extract_256_512(a, idx); } -// Insert 256-bit in 512-bit register -INTRINSIC(v32bfloat16) insert(v32bfloat16 a, int idx, v16bfloat16 b) { - if (idx == 0) - return __builtin_aiev2_upd_bf512_bf256(a, b, 0); - else - return __builtin_aiev2_upd_bf512_bf256(a, b, 1); +//! @name Insert 256-bit in 512-bit register +inline INTRINSIC(v32bfloat16) insert(v32bfloat16 a, int idx, v16bfloat16 b) { + return insert_256_512(a, idx, b); } -// Set 256-bit portion of 512-bit register -INTRINSIC(v32bfloat16) set_v32bfloat16(int idx, v16bfloat16 b) { - if (idx == 0) - return __builtin_aiev2_set_bf512_bf256(b, 0); - else - return __builtin_aiev2_set_bf512_bf256(b, 1); +//! @name Set 256-bit portion of 512-bit register +inline INTRINSIC(v32bfloat16) set_v32bfloat16(int idx, v16bfloat16 b) { + return set_256_512(idx, b); } INTRINSIC(v32bfloat16) concat(v16bfloat16 a0, v16bfloat16 a1) { - return __builtin_aiev2_concat_bf512_bf256(a0, a1); + return concat_256_512(a0, a1); } -// Extract 256-bit portion from 512-bit register -INTRINSIC(v8accfloat) extract_v8accfloat(v16accfloat a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_ACC256_ACC512(a, 0); - else - return __builtin_aiev2_ext_ACC256_ACC512(a, 1); +// v16accfloat + +//! @name Extract 256-bit portion from 512-bit register +inline INTRINSIC(v8accfloat) extract_v8accfloat(v16accfloat a, int idx) { + return extract_256_512(a, idx); } -// Insert 256-bit in 512-bit register -INTRINSIC(v16accfloat) insert(v16accfloat a, int idx, v8accfloat b) { - if (idx == 0) - return __builtin_aiev2_upd_ACC512_ACC256(a, b, 0); - else - return __builtin_aiev2_upd_ACC512_ACC256(a, b, 1); +//! @name Insert 256-bit in 512-bit register +inline INTRINSIC(v16accfloat) insert(v16accfloat a, int idx, v8accfloat b) { + return insert_256_512(a, idx, b); } -// Set 256-bit portion of 512-bit register -INTRINSIC(v16accfloat) set_v16accfloat(int idx, v8accfloat b) { - if (idx == 0) - return __builtin_aiev2_set_ACC512_ACC256(b, 0); - else - return __builtin_aiev2_set_ACC512_ACC256(b, 1); +//! @name Set 256-bit portion of 512-bit register +inline INTRINSIC(v16accfloat) set_v16accfloat(int idx, v8accfloat b) { + return set_256_512(idx, b); } INTRINSIC(v16accfloat) concat(v8accfloat a0, v8accfloat a1) { - return __builtin_aiev2_concat_ACC512_ACC256(a0, a1); + return concat_256_512(a0, a1); } -// Extract 256-bit portion from 512-bit register -INTRINSIC(v8float) extract_v8float(v16float a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I256_I512(a, 0); - else - return __builtin_aiev2_ext_I256_I512(a, 1); +// v16float + +//! 
@name Extract 256-bit portion from 512-bit register +inline INTRINSIC(v8float) extract_v8float(v16float a, int idx) { + return extract_256_512(a, idx); } -// Insert 256-bit in 512-bit register -INTRINSIC(v16float) insert(v16float a, int idx, v8float b) { - if (idx == 0) - return __builtin_aiev2_upd_I512_I256(a, b, 0); - else - return __builtin_aiev2_upd_I512_I256(a, b, 1); +//! @name Insert 256-bit in 512-bit register +inline INTRINSIC(v16float) insert(v16float a, int idx, v8float b) { + return insert_256_512(a, idx, b); } -// Set 256-bit portion of 512-bit register -INTRINSIC(v16float) set_v16float(int idx, v8float b) { - if (idx == 0) - return __builtin_aiev2_set_I512_I256(b, 0); - else - return __builtin_aiev2_set_I512_I256(b, 1); +//! @name Set 256-bit portion of 512-bit register +inline INTRINSIC(v16float) set_v16float(int idx, v8float b) { + return set_256_512(idx, b); } INTRINSIC(v16float) concat(v8float a0, v8float a1) { - return __builtin_aiev2_concat_I512_I256(a0, a1); + return concat_256_512(a0, a1); } -// Extract 256-bit portion from 512-bit register -INTRINSIC(v8acc32) extract_v8acc32(v16acc32 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_ACC256_ACC512(a, 0); - else - return __builtin_aiev2_ext_ACC256_ACC512(a, 1); +// v16acc32 + +//! @name Extract 256-bit portion from 512-bit register +inline INTRINSIC(v8acc32) extract_v8acc32(v16acc32 a, int idx) { + return extract_256_512(a, idx); } -// Insert 256-bit in 512-bit register -INTRINSIC(v16acc32) insert(v16acc32 a, int idx, v8acc32 b) { - if (idx == 0) - return __builtin_aiev2_upd_ACC512_ACC256(a, b, 0); - else - return __builtin_aiev2_upd_ACC512_ACC256(a, b, 1); +//! @name Insert 256-bit in 512-bit register +inline INTRINSIC(v16acc32) insert(v16acc32 a, int idx, v8acc32 b) { + return insert_256_512(a, idx, b); } -// Set 256-bit portion of 512-bit register -INTRINSIC(v16acc32) set_v16acc32(int idx, v8acc32 b) { - if (idx == 0) - return __builtin_aiev2_set_ACC512_ACC256(b, 0); - else - return __builtin_aiev2_set_ACC512_ACC256(b, 1); +//! @name Set 256-bit portion of 512-bit register +inline INTRINSIC(v16acc32) set_v16acc32(int idx, v8acc32 b) { + return set_256_512(idx, b); } INTRINSIC(v16acc32) concat(v8acc32 a0, v8acc32 a1) { - return __builtin_aiev2_concat_ACC512_ACC256(a0, a1); + return concat_256_512(a0, a1); } -// Extract 256-bit portion from 512-bit register -INTRINSIC(v4acc64) extract_v4acc64(v8acc64 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_ACC256_ACC512(a, 0); - else - return __builtin_aiev2_ext_ACC256_ACC512(a, 1); +// v8acc64 + +//! @name Extract 256-bit portion from 512-bit register +inline INTRINSIC(v4acc64) extract_v4acc64(v8acc64 a, int idx) { + return extract_256_512(a, idx); } -// Insert 256-bit in 512-bit register -INTRINSIC(v8acc64) insert(v8acc64 a, int idx, v4acc64 b) { - if (idx == 0) - return __builtin_aiev2_upd_ACC512_ACC256(a, b, 0); - else - return __builtin_aiev2_upd_ACC512_ACC256(a, b, 1); +//! @name Insert 256-bit in 512-bit register +inline INTRINSIC(v8acc64) insert(v8acc64 a, int idx, v4acc64 b) { + return insert_256_512(a, idx, b); } -// Set 256-bit portion of 512-bit register -INTRINSIC(v8acc64) set_v8acc64(int idx, v4acc64 b) { - if (idx == 0) - return __builtin_aiev2_set_ACC512_ACC256(b, 0); - else - return __builtin_aiev2_set_ACC512_ACC256(b, 1); +//! 
@name Set 256-bit portion of 512-bit register +inline INTRINSIC(v8acc64) set_v8acc64(int idx, v4acc64 b) { + return set_256_512(idx, b); } INTRINSIC(v8acc64) concat(v4acc64 a0, v4acc64 a1) { - return __builtin_aiev2_concat_ACC512_ACC256(a0, a1); + return concat_256_512(a0, a1); } +// v8cfloat + #if 0 -// Extract 256-bit portion from 512-bit register -INTRINSIC(v2cacc64) extract_v2cacc64(v4cacc64 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_b_256_512(a, 0); - else - return __builtin_aiev2_ext_b_256_512(a, 1); + +//! @name Extract 256-bit portion from 512-bit register +inline INTRINSIC(v4cfloat) extract_v4cfloat (v8cfloat a, int idx) +{ + return extract_256_512(a, idx); } -// Insert 256-bit in 512-bit register -INTRINSIC(v4cacc64) insert(v4cacc64 a, int idx, v2cacc64 b) { - if (idx == 0) - return __builtin_aiev2_upd_b_512_256(a, b, 0); - else - return __builtin_aiev2_upd_b_512_256(a, b, 1); +//! @name Insert 256-bit in 512-bit register +inline INTRINSIC(v8cfloat) insert (v8cfloat a, int idx, v4cfloat b) +{ + return insert_256_512(a, idx, b); } -// Set 256-bit portion of 512-bit register -INTRINSIC(v4cacc64) set_v4cacc64(int idx, v2cacc64 b) { - if (idx == 0) - return __builtin_aiev2_set_b_512_256(b, 0); - else - return __builtin_aiev2_set_b_512_256(b, 1); +//! @name Set 256-bit portion of 512-bit register +inline INTRINSIC(v8cfloat) set_v8cfloat (int idx, v4cfloat b) +{ + return set_256_512(idx, b); } -INTRINSIC(v4cacc64) concat(v2cacc64 a0, v2cacc64 a1) { - return __builtin_aiev2_concat_bm_am(a0, a1); +INTRINSIC(v8cfloat) concat (v4cfloat a0, v4cfloat a1) +{ + return concat_256_512(a0, a1); } + #endif -// Extract 256-bit portion from 1024-bit register -INTRINSIC(v64uint4) extract_v64uint4(v256uint4 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I256_I1024(a, 0); - else if (idx == 1) - return __builtin_aiev2_ext_I256_I1024(a, 1); - else if (idx == 2) - return __builtin_aiev2_ext_I256_I1024(a, 2); - else - return __builtin_aiev2_ext_I256_I1024(a, 3); +// v256uint4 + +//! @name Extract 256-bit portion from 1024-bit register +inline INTRINSIC(v64uint4) extract_v64uint4(v256uint4 a, int idx) { + return extract_256_1024(a, idx); } -// Insert 256-bit in 1024-bit register -INTRINSIC(v256uint4) insert(v256uint4 a, int idx, v64uint4 b) { - if (idx == 0) - return __builtin_aiev2_upd_I1024_I256(a, b, 0); - else if (idx == 1) - return __builtin_aiev2_upd_I1024_I256(a, b, 1); - else if (idx == 2) - return __builtin_aiev2_upd_I1024_I256(a, b, 2); - else - return __builtin_aiev2_upd_I1024_I256(a, b, 3); +//! @name Insert 256-bit in 1024-bit register +inline INTRINSIC(v256uint4) insert(v256uint4 a, int idx, v64uint4 b) { + return insert_256_1024(a, idx, b); } -// Set 256-bit portion of 1024-bit register -INTRINSIC(v256uint4) set_v256uint4(int idx, v64uint4 b) { - if (idx == 0) - return __builtin_aiev2_set_I1024_I256(b, 0); - else if (idx == 1) - return __builtin_aiev2_set_I1024_I256(b, 1); - else if (idx == 2) - return __builtin_aiev2_set_I1024_I256(b, 2); - else - return __builtin_aiev2_set_I1024_I256(b, 3); +//! 
@name Set 256-bit portion of 1024-bit register +inline INTRINSIC(v256uint4) set_v256uint4(int idx, v64uint4 b) { + return set_256_1024(idx, b); } INTRINSIC(v256uint4) concat(v64uint4 a0, v64uint4 a1, v64uint4 a2, v64uint4 a3) { - return __builtin_aiev2_concat_I1024_I256(a0, a1, a2, a3); + return concat_256_1024(a0, a1, a2, a3); } -// Extract 512-bit portion from 1024-bit register -INTRINSIC(v128uint4) extract_v128uint4(v256uint4 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I512_I1024(a, 0); - else - return __builtin_aiev2_ext_I512_I1024(a, 1); +// v256uint4 + +//! @name Extract 512-bit portion from 1024-bit register +inline INTRINSIC(v128uint4) extract_v128uint4(v256uint4 a, int idx) { + return extract_512_1024(a, idx); } -// Insert 512-bit in 1024-bit register -INTRINSIC(v256uint4) insert(v256uint4 a, int idx, v128uint4 b) { - if (idx == 0) - return __builtin_aiev2_upd_I1024_I512(a, b, 0); - else - return __builtin_aiev2_upd_I1024_I512(a, b, 1); +//! @name Insert 512-bit in 1024-bit register +inline INTRINSIC(v256uint4) insert(v256uint4 a, int idx, v128uint4 b) { + return insert_512_1024(a, idx, b); } -// Set 512-bit portion of 1024-bit register -INTRINSIC(v256uint4) set_v256uint4(int idx, v128uint4 b) { - if (idx == 0) - return __builtin_aiev2_set_I1024_I512(b, 0); - else - return __builtin_aiev2_set_I1024_I512(b, 1); +//! @name Set 512-bit portion of 1024-bit register +inline INTRINSIC(v256uint4) set_v256uint4(int idx, v128uint4 b) { + return set_512_1024(idx, b); } INTRINSIC(v256uint4) concat(v128uint4 a0, v128uint4 a1) { - return __builtin_aiev2_concat_I1024_I512(a0, a1); + return concat_512_1024(a0, a1); } -// Extract 256-bit portion from 1024-bit register -INTRINSIC(v64int4) extract_v64int4(v256int4 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I256_I1024(a, 0); - else if (idx == 1) - return __builtin_aiev2_ext_I256_I1024(a, 1); - else if (idx == 2) - return __builtin_aiev2_ext_I256_I1024(a, 2); - else - return __builtin_aiev2_ext_I256_I1024(a, 3); +// v256int4 + +//! @name Extract 256-bit portion from 1024-bit register +inline INTRINSIC(v64int4) extract_v64int4(v256int4 a, int idx) { + return extract_256_1024(a, idx); } -// Insert 256-bit in 1024-bit register -INTRINSIC(v256int4) insert(v256int4 a, int idx, v64int4 b) { - if (idx == 0) - return __builtin_aiev2_upd_I1024_I256(a, b, 0); - else if (idx == 1) - return __builtin_aiev2_upd_I1024_I256(a, b, 1); - else if (idx == 2) - return __builtin_aiev2_upd_I1024_I256(a, b, 2); - else - return __builtin_aiev2_upd_I1024_I256(a, b, 3); +//! @name Insert 256-bit in 1024-bit register +inline INTRINSIC(v256int4) insert(v256int4 a, int idx, v64int4 b) { + return insert_256_1024(a, idx, b); } -// Set 256-bit portion of 1024-bit register -INTRINSIC(v256int4) set_v256int4(int idx, v64int4 b) { - if (idx == 0) - return __builtin_aiev2_set_I1024_I256(b, 0); - else if (idx == 1) - return __builtin_aiev2_set_I1024_I256(b, 1); - else if (idx == 2) - return __builtin_aiev2_set_I1024_I256(b, 2); - else - return __builtin_aiev2_set_I1024_I256(b, 3); +//! 
@name Set 256-bit portion of 1024-bit register +inline INTRINSIC(v256int4) set_v256int4(int idx, v64int4 b) { + return set_256_1024(idx, b); } INTRINSIC(v256int4) concat(v64int4 a0, v64int4 a1, v64int4 a2, v64int4 a3) { - return __builtin_aiev2_concat_I1024_I256(a0, a1, a2, a3); + return concat_256_1024(a0, a1, a2, a3); } -// Extract 512-bit portion from 1024-bit register -INTRINSIC(v128int4) extract_v128int4(v256int4 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I512_I1024(a, 0); - else - return __builtin_aiev2_ext_I512_I1024(a, 1); +// v256int4 + +//! @name Extract 512-bit portion from 1024-bit register +inline INTRINSIC(v128int4) extract_v128int4(v256int4 a, int idx) { + return extract_512_1024(a, idx); } -// Insert 512-bit in 1024-bit register -INTRINSIC(v256int4) insert(v256int4 a, int idx, v128int4 b) { - if (idx == 0) - return __builtin_aiev2_upd_I1024_I512(a, b, 0); - else - return __builtin_aiev2_upd_I1024_I512(a, b, 1); +//! @name Insert 512-bit in 1024-bit register +inline INTRINSIC(v256int4) insert(v256int4 a, int idx, v128int4 b) { + return insert_512_1024(a, idx, b); } -// Set 512-bit portion of 1024-bit register -INTRINSIC(v256int4) set_v256int4(int idx, v128int4 b) { - if (idx == 0) - return __builtin_aiev2_set_I1024_I512(b, 0); - else - return __builtin_aiev2_set_I1024_I512(b, 1); +//! @name Set 512-bit portion of 1024-bit register +inline INTRINSIC(v256int4) set_v256int4(int idx, v128int4 b) { + return set_512_1024(idx, b); } INTRINSIC(v256int4) concat(v128int4 a0, v128int4 a1) { - return __builtin_aiev2_concat_I1024_I512(a0, a1); + return concat_512_1024(a0, a1); } -// Extract 256-bit portion from 1024-bit register -INTRINSIC(v32uint8) extract_v32uint8(v128uint8 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I256_I1024(a, 0); - else if (idx == 1) - return __builtin_aiev2_ext_I256_I1024(a, 1); - else if (idx == 2) - return __builtin_aiev2_ext_I256_I1024(a, 2); - else - return __builtin_aiev2_ext_I256_I1024(a, 3); +// v128uint8 + +//! @name Extract 256-bit portion from 1024-bit register +inline INTRINSIC(v32uint8) extract_v32uint8(v128uint8 a, int idx) { + return extract_256_1024(a, idx); } -// Insert 256-bit in 1024-bit register -INTRINSIC(v128uint8) insert(v128uint8 a, int idx, v32uint8 b) { - if (idx == 0) - return __builtin_aiev2_upd_I1024_I256(a, b, 0); - else if (idx == 1) - return __builtin_aiev2_upd_I1024_I256(a, b, 1); - else if (idx == 2) - return __builtin_aiev2_upd_I1024_I256(a, b, 2); - else - return __builtin_aiev2_upd_I1024_I256(a, b, 3); +//! @name Insert 256-bit in 1024-bit register +inline INTRINSIC(v128uint8) insert(v128uint8 a, int idx, v32uint8 b) { + return insert_256_1024(a, idx, b); } -// Set 256-bit portion of 1024-bit register -INTRINSIC(v128uint8) set_v128uint8(int idx, v32uint8 b) { - if (idx == 0) - return __builtin_aiev2_set_I1024_I256(b, 0); - else if (idx == 1) - return __builtin_aiev2_set_I1024_I256(b, 1); - else if (idx == 2) - return __builtin_aiev2_set_I1024_I256(b, 2); - else - return __builtin_aiev2_set_I1024_I256(b, 3); +//! 
@name Set 256-bit portion of 1024-bit register +inline INTRINSIC(v128uint8) set_v128uint8(int idx, v32uint8 b) { + return set_256_1024(idx, b); } INTRINSIC(v128uint8) concat(v32uint8 a0, v32uint8 a1, v32uint8 a2, v32uint8 a3) { - return __builtin_aiev2_concat_I1024_I256(a0, a1, a2, a3); + return concat_256_1024(a0, a1, a2, a3); } -// Extract 512-bit portion from 1024-bit register -INTRINSIC(v64uint8) extract_v64uint8(v128uint8 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I512_I1024(a, 0); - else - return __builtin_aiev2_ext_I512_I1024(a, 1); +// v128uint8 + +//! @name Extract 512-bit portion from 1024-bit register +inline INTRINSIC(v64uint8) extract_v64uint8(v128uint8 a, int idx) { + return extract_512_1024(a, idx); } -// Insert 512-bit in 1024-bit register -INTRINSIC(v128uint8) insert(v128uint8 a, int idx, v64uint8 b) { - if (idx == 0) - return __builtin_aiev2_upd_I1024_I512(a, b, 0); - else - return __builtin_aiev2_upd_I1024_I512(a, b, 1); +//! @name Insert 512-bit in 1024-bit register +inline INTRINSIC(v128uint8) insert(v128uint8 a, int idx, v64uint8 b) { + return insert_512_1024(a, idx, b); } -// Set 512-bit portion of 1024-bit register -INTRINSIC(v128uint8) set_v128uint8(int idx, v64uint8 b) { - if (idx == 0) - return __builtin_aiev2_set_I1024_I512(b, 0); - else - return __builtin_aiev2_set_I1024_I512(b, 1); +//! @name Set 512-bit portion of 1024-bit register +inline INTRINSIC(v128uint8) set_v128uint8(int idx, v64uint8 b) { + return set_512_1024(idx, b); } INTRINSIC(v128uint8) concat(v64uint8 a0, v64uint8 a1) { - return __builtin_aiev2_concat_I1024_I512(a0, a1); + return concat_512_1024(a0, a1); } -// Extract 256-bit portion from 1024-bit register -INTRINSIC(v32int8) extract_v32int8(v128int8 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I256_I1024(a, 0); - else if (idx == 1) - return __builtin_aiev2_ext_I256_I1024(a, 1); - else if (idx == 2) - return __builtin_aiev2_ext_I256_I1024(a, 2); - else - return __builtin_aiev2_ext_I256_I1024(a, 3); +// v128int8 + +//! @name Extract 256-bit portion from 1024-bit register +inline INTRINSIC(v32int8) extract_v32int8(v128int8 a, int idx) { + return extract_256_1024(a, idx); } -// Insert 256-bit in 1024-bit register -INTRINSIC(v128int8) insert(v128int8 a, int idx, v32int8 b) { - if (idx == 0) - return __builtin_aiev2_upd_I1024_I256(a, b, 0); - else if (idx == 1) - return __builtin_aiev2_upd_I1024_I256(a, b, 1); - else if (idx == 2) - return __builtin_aiev2_upd_I1024_I256(a, b, 2); - else - return __builtin_aiev2_upd_I1024_I256(a, b, 3); +//! @name Insert 256-bit in 1024-bit register +inline INTRINSIC(v128int8) insert(v128int8 a, int idx, v32int8 b) { + return insert_256_1024(a, idx, b); } -// Set 256-bit portion of 1024-bit register -INTRINSIC(v128int8) set_v128int8(int idx, v32int8 b) { - if (idx == 0) - return __builtin_aiev2_set_I1024_I256(b, 0); - else if (idx == 1) - return __builtin_aiev2_set_I1024_I256(b, 1); - else if (idx == 2) - return __builtin_aiev2_set_I1024_I256(b, 2); - else - return __builtin_aiev2_set_I1024_I256(b, 3); +//! 
@name Set 256-bit portion of 1024-bit register +inline INTRINSIC(v128int8) set_v128int8(int idx, v32int8 b) { + return set_256_1024(idx, b); } INTRINSIC(v128int8) concat(v32int8 a0, v32int8 a1, v32int8 a2, v32int8 a3) { - return __builtin_aiev2_concat_I1024_I256(a0, a1, a2, a3); + return concat_256_1024(a0, a1, a2, a3); } -// Extract 512-bit portion from 1024-bit register -INTRINSIC(v64int8) extract_v64int8(v128int8 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I512_I1024(a, 0); - else - return __builtin_aiev2_ext_I512_I1024(a, 1); +// v128int8 + +//! @name Extract 512-bit portion from 1024-bit register +inline INTRINSIC(v64int8) extract_v64int8(v128int8 a, int idx) { + return extract_512_1024(a, idx); } -// Insert 512-bit in 1024-bit register -INTRINSIC(v128int8) insert(v128int8 a, int idx, v64int8 b) { - if (idx == 0) - return __builtin_aiev2_upd_I1024_I512(a, b, 0); - else - return __builtin_aiev2_upd_I1024_I512(a, b, 1); +//! @name Insert 512-bit in 1024-bit register +inline INTRINSIC(v128int8) insert(v128int8 a, int idx, v64int8 b) { + return insert_512_1024(a, idx, b); } -// Set 512-bit portion of 1024-bit register -INTRINSIC(v128int8) set_v128int8(int idx, v64int8 b) { - if (idx == 0) - return __builtin_aiev2_set_I1024_I512(b, 0); - else - return __builtin_aiev2_set_I1024_I512(b, 1); +//! @name Set 512-bit portion of 1024-bit register +inline INTRINSIC(v128int8) set_v128int8(int idx, v64int8 b) { + return set_512_1024(idx, b); } INTRINSIC(v128int8) concat(v64int8 a0, v64int8 a1) { - return __builtin_aiev2_concat_I1024_I512(a0, a1); + return concat_512_1024(a0, a1); } -// Extract 256-bit portion from 1024-bit register -INTRINSIC(v16uint16) extract_v16uint16(v64uint16 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I256_I1024(a, 0); - else if (idx == 1) - return __builtin_aiev2_ext_I256_I1024(a, 1); - else if (idx == 2) - return __builtin_aiev2_ext_I256_I1024(a, 2); - else - return __builtin_aiev2_ext_I256_I1024(a, 3); +// v64uint16 + +//! @name Extract 256-bit portion from 1024-bit register +inline INTRINSIC(v16uint16) extract_v16uint16(v64uint16 a, int idx) { + return extract_256_1024(a, idx); } -// Insert 256-bit in 1024-bit register -INTRINSIC(v64uint16) insert(v64uint16 a, int idx, v16uint16 b) { - if (idx == 0) - return __builtin_aiev2_upd_I1024_I256(a, b, 0); - else if (idx == 1) - return __builtin_aiev2_upd_I1024_I256(a, b, 1); - else if (idx == 2) - return __builtin_aiev2_upd_I1024_I256(a, b, 2); - else - return __builtin_aiev2_upd_I1024_I256(a, b, 3); +//! @name Insert 256-bit in 1024-bit register +inline INTRINSIC(v64uint16) insert(v64uint16 a, int idx, v16uint16 b) { + return insert_256_1024(a, idx, b); } -// Set 256-bit portion of 1024-bit register -INTRINSIC(v64uint16) set_v64uint16(int idx, v16uint16 b) { - if (idx == 0) - return __builtin_aiev2_set_I1024_I256(b, 0); - else if (idx == 1) - return __builtin_aiev2_set_I1024_I256(b, 1); - else if (idx == 2) - return __builtin_aiev2_set_I1024_I256(b, 2); - else - return __builtin_aiev2_set_I1024_I256(b, 3); +//! 
@name Set 256-bit portion of 1024-bit register +inline INTRINSIC(v64uint16) set_v64uint16(int idx, v16uint16 b) { + return set_256_1024(idx, b); } INTRINSIC(v64uint16) concat(v16uint16 a0, v16uint16 a1, v16uint16 a2, v16uint16 a3) { - return __builtin_aiev2_concat_I1024_I256(a0, a1, a2, a3); + return concat_256_1024(a0, a1, a2, a3); } -// Extract 512-bit portion from 1024-bit register -INTRINSIC(v32uint16) extract_v32uint16(v64uint16 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I512_I1024(a, 0); - else - return __builtin_aiev2_ext_I512_I1024(a, 1); +// v64uint16 + +//! @name Extract 512-bit portion from 1024-bit register +inline INTRINSIC(v32uint16) extract_v32uint16(v64uint16 a, int idx) { + return extract_512_1024(a, idx); } -// Insert 512-bit in 1024-bit register -INTRINSIC(v64uint16) insert(v64uint16 a, int idx, v32uint16 b) { - if (idx == 0) - return __builtin_aiev2_upd_I1024_I512(a, b, 0); - else - return __builtin_aiev2_upd_I1024_I512(a, b, 1); +//! @name Insert 512-bit in 1024-bit register +inline INTRINSIC(v64uint16) insert(v64uint16 a, int idx, v32uint16 b) { + return insert_512_1024(a, idx, b); } -// Set 512-bit portion of 1024-bit register -INTRINSIC(v64uint16) set_v64uint16(int idx, v32uint16 b) { - if (idx == 0) - return __builtin_aiev2_set_I1024_I512(b, 0); - else - return __builtin_aiev2_set_I1024_I512(b, 1); +//! @name Set 512-bit portion of 1024-bit register +inline INTRINSIC(v64uint16) set_v64uint16(int idx, v32uint16 b) { + return set_512_1024(idx, b); } INTRINSIC(v64uint16) concat(v32uint16 a0, v32uint16 a1) { - return __builtin_aiev2_concat_I1024_I512(a0, a1); + return concat_512_1024(a0, a1); } -// Extract 256-bit portion from 1024-bit register -INTRINSIC(v16int16) extract_v16int16(v64int16 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I256_I1024(a, 0); - else if (idx == 1) - return __builtin_aiev2_ext_I256_I1024(a, 1); - else if (idx == 2) - return __builtin_aiev2_ext_I256_I1024(a, 2); - else - return __builtin_aiev2_ext_I256_I1024(a, 3); +// v64int16 + +//! @name Extract 256-bit portion from 1024-bit register +inline INTRINSIC(v16int16) extract_v16int16(v64int16 a, int idx) { + return extract_256_1024(a, idx); } -// Insert 256-bit in 1024-bit register -INTRINSIC(v64int16) insert(v64int16 a, int idx, v16int16 b) { - if (idx == 0) - return __builtin_aiev2_upd_I1024_I256(a, b, 0); - else if (idx == 1) - return __builtin_aiev2_upd_I1024_I256(a, b, 1); - else if (idx == 2) - return __builtin_aiev2_upd_I1024_I256(a, b, 2); - else - return __builtin_aiev2_upd_I1024_I256(a, b, 3); +//! @name Insert 256-bit in 1024-bit register +inline INTRINSIC(v64int16) insert(v64int16 a, int idx, v16int16 b) { + return insert_256_1024(a, idx, b); } -// Set 256-bit portion of 1024-bit register -INTRINSIC(v64int16) set_v64int16(int idx, v16int16 b) { - if (idx == 0) - return __builtin_aiev2_set_I1024_I256(b, 0); - else if (idx == 1) - return __builtin_aiev2_set_I1024_I256(b, 1); - else if (idx == 2) - return __builtin_aiev2_set_I1024_I256(b, 2); - else - return __builtin_aiev2_set_I1024_I256(b, 3); +//! 
@name Set 256-bit portion of 1024-bit register +inline INTRINSIC(v64int16) set_v64int16(int idx, v16int16 b) { + return set_256_1024(idx, b); } INTRINSIC(v64int16) concat(v16int16 a0, v16int16 a1, v16int16 a2, v16int16 a3) { - return __builtin_aiev2_concat_I1024_I256(a0, a1, a2, a3); + return concat_256_1024(a0, a1, a2, a3); } -// Extract 512-bit portion from 1024-bit register -INTRINSIC(v32int16) extract_v32int16(v64int16 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I512_I1024(a, 0); - else - return __builtin_aiev2_ext_I512_I1024(a, 1); +// v64int16 + +//! @name Extract 512-bit portion from 1024-bit register +inline INTRINSIC(v32int16) extract_v32int16(v64int16 a, int idx) { + return extract_512_1024(a, idx); } -// Insert 512-bit in 1024-bit register -INTRINSIC(v64int16) insert(v64int16 a, int idx, v32int16 b) { - if (idx == 0) - return __builtin_aiev2_upd_I1024_I512(a, b, 0); - else - return __builtin_aiev2_upd_I1024_I512(a, b, 1); +//! @name Insert 512-bit in 1024-bit register +inline INTRINSIC(v64int16) insert(v64int16 a, int idx, v32int16 b) { + return insert_512_1024(a, idx, b); } -// Set 512-bit portion of 1024-bit register -INTRINSIC(v64int16) set_v64int16(int idx, v32int16 b) { - if (idx == 0) - return __builtin_aiev2_set_I1024_I512(b, 0); - else - return __builtin_aiev2_set_I1024_I512(b, 1); +//! @name Set 512-bit portion of 1024-bit register +inline INTRINSIC(v64int16) set_v64int16(int idx, v32int16 b) { + return set_512_1024(idx, b); } INTRINSIC(v64int16) concat(v32int16 a0, v32int16 a1) { - return __builtin_aiev2_concat_I1024_I512(a0, a1); + return concat_512_1024(a0, a1); } +// v32cint16 + #if 0 -// Extract 256-bit portion from 1024-bit register -INTRINSIC(v8cint16) extract_v8cint16(v32cint16 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I256_I1024(a, 0); - else if (idx == 1) - return __builtin_aiev2_ext_I256_I1024(a, 1); - else if (idx == 2) - return __builtin_aiev2_ext_I256_I1024(a, 2); - else - return __builtin_aiev2_ext_I256_I1024(a, 3); + +//! @name Extract 256-bit portion from 1024-bit register +inline INTRINSIC(v8cint16) extract_v8cint16 (v32cint16 a, int idx) +{ + return extract_256_1024(a, idx); } -// Insert 256-bit in 1024-bit register -INTRINSIC(v32cint16) insert(v32cint16 a, int idx, v8cint16 b) { - if (idx == 0) - return __builtin_aiev2_upd_I1024_I256(a, b, 0); - else if (idx == 1) - return __builtin_aiev2_upd_I1024_I256(a, b, 1); - else if (idx == 2) - return __builtin_aiev2_upd_I1024_I256(a, b, 2); - else - return __builtin_aiev2_upd_I1024_I256(a, b, 3); +//! @name Insert 256-bit in 1024-bit register +inline INTRINSIC(v32cint16) insert (v32cint16 a, int idx, v8cint16 b) +{ + return insert_256_1024(a, idx, b); } -// Set 256-bit portion of 1024-bit register -INTRINSIC(v32cint16) set_v32cint16(int idx, v8cint16 b) { - if (idx == 0) - return __builtin_aiev2_set_I1024_I256(b, 0); - else if (idx == 1) - return __builtin_aiev2_set_I1024_I256(b, 1); - else if (idx == 2) - return __builtin_aiev2_set_I1024_I256(b, 2); - else - return __builtin_aiev2_set_I1024_I256(b, 3); +//! 
@name Set 256-bit portion of 1024-bit register +inline INTRINSIC(v32cint16) set_v32cint16 (int idx, v8cint16 b) +{ + return set_256_1024(idx, b); } -INTRINSIC(v32cint16) -concat(v8cint16 a0, v8cint16 a1, v8cint16 a2, v8cint16 a3) { - return __builtin_aiev2_concat_I1024_I256(a0, a1, a2, a3); +INTRINSIC(v32cint16) concat (v8cint16 a0, v8cint16 a1, v8cint16 a2, v8cint16 a3) +{ + return concat_256_1024(a0, a1, a2, a3); } -// Extract 512-bit portion from 1024-bit register -INTRINSIC(v16cint16) extract_v16cint16(v32cint16 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I512_I1024(a, 0); - else - return __builtin_aiev2_ext_I512_I1024(a, 1); +#endif + +// v32cint16 + +#if 0 + +//! @name Extract 512-bit portion from 1024-bit register +inline INTRINSIC(v16cint16) extract_v16cint16 (v32cint16 a, int idx) +{ + return extract_512_1024(a, idx); } -// Insert 512-bit in 1024-bit register -INTRINSIC(v32cint16) insert(v32cint16 a, int idx, v16cint16 b) { - if (idx == 0) - return __builtin_aiev2_upd_I1024_I512(a, b, 0); - else - return __builtin_aiev2_upd_I1024_I512(a, b, 1); +//! @name Insert 512-bit in 1024-bit register +inline INTRINSIC(v32cint16) insert (v32cint16 a, int idx, v16cint16 b) +{ + return insert_512_1024(a, idx, b); } -// Set 512-bit portion of 1024-bit register -INTRINSIC(v32cint16) set_v32cint16(int idx, v16cint16 b) { - if (idx == 0) - return __builtin_aiev2_set_I1024_I512(b, 0); - else - return __builtin_aiev2_set_I1024_I512(b, 1); +//! @name Set 512-bit portion of 1024-bit register +inline INTRINSIC(v32cint16) set_v32cint16 (int idx, v16cint16 b) +{ + return set_512_1024(idx, b); } -INTRINSIC(v32cint16) concat(v16cint16 a0, v16cint16 a1) { - return __builtin_aiev2_concat_I1024_I512(a0, a1); +INTRINSIC(v32cint16) concat (v16cint16 a0, v16cint16 a1) +{ + return concat_512_1024(a0, a1); } + #endif -// Extract 256-bit portion from 1024-bit register -INTRINSIC(v8uint32) extract_v8uint32(v32uint32 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I256_I1024(a, 0); - else if (idx == 1) - return __builtin_aiev2_ext_I256_I1024(a, 1); - else if (idx == 2) - return __builtin_aiev2_ext_I256_I1024(a, 2); - else - return __builtin_aiev2_ext_I256_I1024(a, 3); +// v32uint32 + +//! @name Extract 256-bit portion from 1024-bit register +inline INTRINSIC(v8uint32) extract_v8uint32(v32uint32 a, int idx) { + return extract_256_1024(a, idx); } -// Insert 256-bit in 1024-bit register -INTRINSIC(v32uint32) insert(v32uint32 a, int idx, v8uint32 b) { - if (idx == 0) - return __builtin_aiev2_upd_I1024_I256(a, b, 0); - else if (idx == 1) - return __builtin_aiev2_upd_I1024_I256(a, b, 1); - else if (idx == 2) - return __builtin_aiev2_upd_I1024_I256(a, b, 2); - else - return __builtin_aiev2_upd_I1024_I256(a, b, 3); +//! @name Insert 256-bit in 1024-bit register +inline INTRINSIC(v32uint32) insert(v32uint32 a, int idx, v8uint32 b) { + return insert_256_1024(a, idx, b); } -// Set 256-bit portion of 1024-bit register -INTRINSIC(v32uint32) set_v32uint32(int idx, v8uint32 b) { - if (idx == 0) - return __builtin_aiev2_set_I1024_I256(b, 0); - else if (idx == 1) - return __builtin_aiev2_set_I1024_I256(b, 1); - else if (idx == 2) - return __builtin_aiev2_set_I1024_I256(b, 2); - else - return __builtin_aiev2_set_I1024_I256(b, 3); +//! 
@name Set 256-bit portion of 1024-bit register +inline INTRINSIC(v32uint32) set_v32uint32(int idx, v8uint32 b) { + return set_256_1024(idx, b); } INTRINSIC(v32uint32) concat(v8uint32 a0, v8uint32 a1, v8uint32 a2, v8uint32 a3) { - return __builtin_aiev2_concat_I1024_I256(a0, a1, a2, a3); + return concat_256_1024(a0, a1, a2, a3); } -// Extract 512-bit portion from 1024-bit register -INTRINSIC(v16uint32) extract_v16uint32(v32uint32 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I512_I1024(a, 0); - else - return __builtin_aiev2_ext_I512_I1024(a, 1); +// v32uint32 + +//! @name Extract 512-bit portion from 1024-bit register +inline INTRINSIC(v16uint32) extract_v16uint32(v32uint32 a, int idx) { + return extract_512_1024(a, idx); } -// Insert 512-bit in 1024-bit register -INTRINSIC(v32uint32) insert(v32uint32 a, int idx, v16uint32 b) { - if (idx == 0) - return __builtin_aiev2_upd_I1024_I512(a, b, 0); - else - return __builtin_aiev2_upd_I1024_I512(a, b, 1); +//! @name Insert 512-bit in 1024-bit register +inline INTRINSIC(v32uint32) insert(v32uint32 a, int idx, v16uint32 b) { + return insert_512_1024(a, idx, b); } -// Set 512-bit portion of 1024-bit register -INTRINSIC(v32uint32) set_v32uint32(int idx, v16uint32 b) { - if (idx == 0) - return __builtin_aiev2_set_I1024_I512(b, 0); - else - return __builtin_aiev2_set_I1024_I512(b, 1); +//! @name Set 512-bit portion of 1024-bit register +inline INTRINSIC(v32uint32) set_v32uint32(int idx, v16uint32 b) { + return set_512_1024(idx, b); } INTRINSIC(v32uint32) concat(v16uint32 a0, v16uint32 a1) { - return __builtin_aiev2_concat_I1024_I512(a0, a1); + return concat_512_1024(a0, a1); } -// Extract 256-bit portion from 1024-bit register -INTRINSIC(v8int32) extract_v8int32(v32int32 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I256_I1024(a, 0); - else if (idx == 1) - return __builtin_aiev2_ext_I256_I1024(a, 1); - else if (idx == 2) - return __builtin_aiev2_ext_I256_I1024(a, 2); - else - return __builtin_aiev2_ext_I256_I1024(a, 3); +// v32int32 + +//! @name Extract 256-bit portion from 1024-bit register +inline INTRINSIC(v8int32) extract_v8int32(v32int32 a, int idx) { + return extract_256_1024(a, idx); } -// Insert 256-bit in 1024-bit register -INTRINSIC(v32int32) insert(v32int32 a, int idx, v8int32 b) { - if (idx == 0) - return __builtin_aiev2_upd_I1024_I256(a, b, 0); - else if (idx == 1) - return __builtin_aiev2_upd_I1024_I256(a, b, 1); - else if (idx == 2) - return __builtin_aiev2_upd_I1024_I256(a, b, 2); - else - return __builtin_aiev2_upd_I1024_I256(a, b, 3); +//! @name Insert 256-bit in 1024-bit register +inline INTRINSIC(v32int32) insert(v32int32 a, int idx, v8int32 b) { + return insert_256_1024(a, idx, b); } -// Set 256-bit portion of 1024-bit register -INTRINSIC(v32int32) set_v32int32(int idx, v8int32 b) { - if (idx == 0) - return __builtin_aiev2_set_I1024_I256(b, 0); - else if (idx == 1) - return __builtin_aiev2_set_I1024_I256(b, 1); - else if (idx == 2) - return __builtin_aiev2_set_I1024_I256(b, 2); - else - return __builtin_aiev2_set_I1024_I256(b, 3); +//! 
@name Set 256-bit portion of 1024-bit register +inline INTRINSIC(v32int32) set_v32int32(int idx, v8int32 b) { + return set_256_1024(idx, b); } INTRINSIC(v32int32) concat(v8int32 a0, v8int32 a1, v8int32 a2, v8int32 a3) { - return __builtin_aiev2_concat_I1024_I256(a0, a1, a2, a3); + return concat_256_1024(a0, a1, a2, a3); } -// Extract 512-bit portion from 1024-bit register -INTRINSIC(v16int32) extract_v16int32(v32int32 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I512_I1024(a, 0); - else - return __builtin_aiev2_ext_I512_I1024(a, 1); +// v32int32 + +//! @name Extract 512-bit portion from 1024-bit register +inline INTRINSIC(v16int32) extract_v16int32(v32int32 a, int idx) { + return extract_512_1024(a, idx); } -// Insert 512-bit in 1024-bit register -INTRINSIC(v32int32) insert(v32int32 a, int idx, v16int32 b) { - if (idx == 0) - return __builtin_aiev2_upd_I1024_I512(a, b, 0); - else - return __builtin_aiev2_upd_I1024_I512(a, b, 1); +//! @name Insert 512-bit in 1024-bit register +inline INTRINSIC(v32int32) insert(v32int32 a, int idx, v16int32 b) { + return insert_512_1024(a, idx, b); } -// Set 512-bit portion of 1024-bit register -INTRINSIC(v32int32) set_v32int32(int idx, v16int32 b) { - if (idx == 0) - return __builtin_aiev2_set_I1024_I512(b, 0); - else - return __builtin_aiev2_set_I1024_I512(b, 1); +//! @name Set 512-bit portion of 1024-bit register +inline INTRINSIC(v32int32) set_v32int32(int idx, v16int32 b) { + return set_512_1024(idx, b); } INTRINSIC(v32int32) concat(v16int32 a0, v16int32 a1) { - return __builtin_aiev2_concat_I1024_I512(a0, a1); + return concat_512_1024(a0, a1); } +// v16cint32 + #if 0 -// Extract 256-bit portion from 1024-bit register -INTRINSIC(v4cint32) extract_v4cint32(v16cint32 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I256_I1024(a, 0); - else if (idx == 1) - return __builtin_aiev2_ext_I256_I1024(a, 1); - else if (idx == 2) - return __builtin_aiev2_ext_I256_I1024(a, 2); - else - return __builtin_aiev2_ext_I256_I1024(a, 3); + +//! @name Extract 256-bit portion from 1024-bit register +inline INTRINSIC(v4cint32) extract_v4cint32 (v16cint32 a, int idx) +{ + return extract_256_1024(a, idx); } -// Insert 256-bit in 1024-bit register -INTRINSIC(v16cint32) insert(v16cint32 a, int idx, v4cint32 b) { - if (idx == 0) - return __builtin_aiev2_upd_I1024_I256(a, b, 0); - else if (idx == 1) - return __builtin_aiev2_upd_I1024_I256(a, b, 1); - else if (idx == 2) - return __builtin_aiev2_upd_I1024_I256(a, b, 2); - else - return __builtin_aiev2_upd_I1024_I256(a, b, 3); +//! @name Insert 256-bit in 1024-bit register +inline INTRINSIC(v16cint32) insert (v16cint32 a, int idx, v4cint32 b) +{ + return insert_256_1024(a, idx, b); } -// Set 256-bit portion of 1024-bit register -INTRINSIC(v16cint32) set_v16cint32(int idx, v4cint32 b) { - if (idx == 0) - return __builtin_aiev2_set_I1024_I256(b, 0); - else if (idx == 1) - return __builtin_aiev2_set_I1024_I256(b, 1); - else if (idx == 2) - return __builtin_aiev2_set_I1024_I256(b, 2); - else - return __builtin_aiev2_set_I1024_I256(b, 3); +//! 
@name Set 256-bit portion of 1024-bit register +inline INTRINSIC(v16cint32) set_v16cint32 (int idx, v4cint32 b) +{ + return set_256_1024(idx, b); } -INTRINSIC(v16cint32) -concat(v4cint32 a0, v4cint32 a1, v4cint32 a2, v4cint32 a3) { - return __builtin_aiev2_concat_I1024_I256(a0, a1, a2, a3); +INTRINSIC(v16cint32) concat (v4cint32 a0, v4cint32 a1, v4cint32 a2, v4cint32 a3) +{ + return concat_256_1024(a0, a1, a2, a3); } -// Extract 512-bit portion from 1024-bit register -INTRINSIC(v8cint32) extract_v8cint32(v16cint32 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I512_I1024(a, 0); - else - return __builtin_aiev2_ext_I512_I1024(a, 1); +#endif + +// v16cint32 + +#if 0 + +//! @name Extract 512-bit portion from 1024-bit register +inline INTRINSIC(v8cint32) extract_v8cint32 (v16cint32 a, int idx) +{ + return extract_512_1024(a, idx); } -// Insert 512-bit in 1024-bit register -INTRINSIC(v16cint32) insert(v16cint32 a, int idx, v8cint32 b) { - if (idx == 0) - return __builtin_aiev2_upd_I1024_I512(a, b, 0); - else - return __builtin_aiev2_upd_I1024_I512(a, b, 1); +//! @name Insert 512-bit in 1024-bit register +inline INTRINSIC(v16cint32) insert (v16cint32 a, int idx, v8cint32 b) +{ + return insert_512_1024(a, idx, b); } -// Set 512-bit portion of 1024-bit register -INTRINSIC(v16cint32) set_v16cint32(int idx, v8cint32 b) { - if (idx == 0) - return __builtin_aiev2_set_I1024_I512(b, 0); - else - return __builtin_aiev2_set_I1024_I512(b, 1); +//! @name Set 512-bit portion of 1024-bit register +inline INTRINSIC(v16cint32) set_v16cint32 (int idx, v8cint32 b) +{ + return set_512_1024(idx, b); } -INTRINSIC(v16cint32) concat(v8cint32 a0, v8cint32 a1) { - return __builtin_aiev2_concat_I1024_I512(a0, a1); +INTRINSIC(v16cint32) concat (v8cint32 a0, v8cint32 a1) +{ + return concat_512_1024(a0, a1); } + #endif -// Extract 256-bit portion from 1024-bit register -INTRINSIC(v16bfloat16) extract_v16bfloat16(v64bfloat16 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_bf256_bf1024(a, 0); - else if (idx == 1) - return __builtin_aiev2_ext_bf256_bf1024(a, 1); - else if (idx == 2) - return __builtin_aiev2_ext_bf256_bf1024(a, 2); - else - return __builtin_aiev2_ext_bf256_bf1024(a, 3); +// v64bfloat16 + +//! @name Extract 256-bit portion from 1024-bit register +inline INTRINSIC(v16bfloat16) extract_v16bfloat16(v64bfloat16 a, int idx) { + return extract_256_1024(a, idx); } -// Insert 256-bit in 1024-bit register -INTRINSIC(v64bfloat16) insert(v64bfloat16 a, int idx, v16bfloat16 b) { - if (idx == 0) - return __builtin_aiev2_upd_bf1024_bf256(a, b, 0); - else if (idx == 1) - return __builtin_aiev2_upd_bf1024_bf256(a, b, 1); - else if (idx == 2) - return __builtin_aiev2_upd_bf1024_bf256(a, b, 2); - else - return __builtin_aiev2_upd_bf1024_bf256(a, b, 3); +//! @name Insert 256-bit in 1024-bit register +inline INTRINSIC(v64bfloat16) insert(v64bfloat16 a, int idx, v16bfloat16 b) { + return insert_256_1024(a, idx, b); } -// Set 256-bit portion of 1024-bit register -INTRINSIC(v64bfloat16) set_v64bfloat16(int idx, v16bfloat16 b) { - if (idx == 0) - return __builtin_aiev2_set_bf1024_bf256(b, 0); - else if (idx == 1) - return __builtin_aiev2_set_bf1024_bf256(b, 1); - else if (idx == 2) - return __builtin_aiev2_set_bf1024_bf256(b, 2); - else - return __builtin_aiev2_set_bf1024_bf256(b, 3); +//! 
@name Set 256-bit portion of 1024-bit register +inline INTRINSIC(v64bfloat16) set_v64bfloat16(int idx, v16bfloat16 b) { + return set_256_1024(idx, b); } INTRINSIC(v64bfloat16) concat(v16bfloat16 a0, v16bfloat16 a1, v16bfloat16 a2, v16bfloat16 a3) { - return __builtin_aiev2_concat_bf1024_bf256(a0, a1, a2, a3); + return concat_256_1024(a0, a1, a2, a3); } -// Extract 512-bit portion from 1024-bit register -INTRINSIC(v32bfloat16) extract_v32bfloat16(v64bfloat16 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_bf512_bf1024(a, 0); - else - return __builtin_aiev2_ext_bf512_bf1024(a, 1); +// v64bfloat16 + +//! @name Extract 512-bit portion from 1024-bit register +inline INTRINSIC(v32bfloat16) extract_v32bfloat16(v64bfloat16 a, int idx) { + return extract_512_1024(a, idx); } -// Insert 512-bit in 1024-bit register -INTRINSIC(v64bfloat16) insert(v64bfloat16 a, int idx, v32bfloat16 b) { - if (idx == 0) - return __builtin_aiev2_upd_bf1024_bf512(a, b, 0); - else - return __builtin_aiev2_upd_bf1024_bf512(a, b, 1); +//! @name Insert 512-bit in 1024-bit register +inline INTRINSIC(v64bfloat16) insert(v64bfloat16 a, int idx, v32bfloat16 b) { + return insert_512_1024(a, idx, b); } -// Set 512-bit portion of 1024-bit register -INTRINSIC(v64bfloat16) set_v64bfloat16(int idx, v32bfloat16 b) { - if (idx == 0) - return __builtin_aiev2_set_bf1024_bf512(b, 0); - else - return __builtin_aiev2_set_bf1024_bf512(b, 1); +//! @name Set 512-bit portion of 1024-bit register +inline INTRINSIC(v64bfloat16) set_v64bfloat16(int idx, v32bfloat16 b) { + return set_512_1024(idx, b); } INTRINSIC(v64bfloat16) concat(v32bfloat16 a0, v32bfloat16 a1) { - return __builtin_aiev2_concat_bf1024_bf512(a0, a1); + return concat_512_1024(a0, a1); } -// Extract 256-bit portion from 1024-bit register -INTRINSIC(v8accfloat) extract_v8accfloat(v32accfloat a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_ACC256_ACC1024(a, 0); - else if (idx == 1) - return __builtin_aiev2_ext_ACC256_ACC1024(a, 1); - else if (idx == 2) - return __builtin_aiev2_ext_ACC256_ACC1024(a, 2); - else - return __builtin_aiev2_ext_ACC256_ACC1024(a, 3); +// v32accfloat + +//! @name Extract 256-bit portion from 1024-bit register +inline INTRINSIC(v8accfloat) extract_v8accfloat(v32accfloat a, int idx) { + return extract_256_1024(a, idx); } -// Insert 256-bit in 1024-bit register -INTRINSIC(v32accfloat) insert(v32accfloat a, int idx, v8accfloat b) { - if (idx == 0) - return __builtin_aiev2_upd_ACC1024_ACC256(a, b, 0); - else if (idx == 1) - return __builtin_aiev2_upd_ACC1024_ACC256(a, b, 1); - else if (idx == 2) - return __builtin_aiev2_upd_ACC1024_ACC256(a, b, 2); - else - return __builtin_aiev2_upd_ACC1024_ACC256(a, b, 3); +//! @name Insert 256-bit in 1024-bit register +inline INTRINSIC(v32accfloat) insert(v32accfloat a, int idx, v8accfloat b) { + return insert_256_1024(a, idx, b); } -// Set 256-bit portion of 1024-bit register -INTRINSIC(v32accfloat) set_v32accfloat(int idx, v8accfloat b) { - if (idx == 0) - return __builtin_aiev2_set_ACC1024_ACC256(b, 0); - else if (idx == 1) - return __builtin_aiev2_set_ACC1024_ACC256(b, 1); - else if (idx == 2) - return __builtin_aiev2_set_ACC1024_ACC256(b, 2); - else - return __builtin_aiev2_set_ACC1024_ACC256(b, 3); +//! 
@name Set 256-bit portion of 1024-bit register +inline INTRINSIC(v32accfloat) set_v32accfloat(int idx, v8accfloat b) { + return set_256_1024(idx, b); } INTRINSIC(v32accfloat) concat(v8accfloat a0, v8accfloat a1, v8accfloat a2, v8accfloat a3) { - return __builtin_aiev2_concat_ACC1024_ACC256(a0, a1, a2, a3); + return concat_256_1024(a0, a1, a2, a3); } -// Extract 512-bit portion from 1024-bit register -INTRINSIC(v16accfloat) extract_v16accfloat(v32accfloat a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_ACC512_ACC1024(a, 0); - else - return __builtin_aiev2_ext_ACC512_ACC1024(a, 1); +// v32accfloat + +//! @name Extract 512-bit portion from 1024-bit register +inline INTRINSIC(v16accfloat) extract_v16accfloat(v32accfloat a, int idx) { + return extract_512_1024(a, idx); } -// Insert 512-bit in 1024-bit register -INTRINSIC(v32accfloat) insert(v32accfloat a, int idx, v16accfloat b) { - if (idx == 0) - return __builtin_aiev2_upd_ACC1024_ACC512(a, b, 0); - else - return __builtin_aiev2_upd_ACC1024_ACC512(a, b, 1); +//! @name Insert 512-bit in 1024-bit register +inline INTRINSIC(v32accfloat) insert(v32accfloat a, int idx, v16accfloat b) { + return insert_512_1024(a, idx, b); } -// Set 512-bit portion of 1024-bit register -INTRINSIC(v32accfloat) set_v32accfloat(int idx, v16accfloat b) { - if (idx == 0) - return __builtin_aiev2_set_ACC1024_ACC512(b, 0); - else - return __builtin_aiev2_set_ACC1024_ACC512(b, 1); +//! @name Set 512-bit portion of 1024-bit register +inline INTRINSIC(v32accfloat) set_v32accfloat(int idx, v16accfloat b) { + return set_512_1024(idx, b); } INTRINSIC(v32accfloat) concat(v16accfloat a0, v16accfloat a1) { - return __builtin_aiev2_concat_ACC1024_ACC512(a0, a1); + return concat_512_1024(a0, a1); } -// Extract 256-bit portion from 1024-bit register -INTRINSIC(v8float) extract_v8float(v32float a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I256_I1024(a, 0); - else if (idx == 1) - return __builtin_aiev2_ext_I256_I1024(a, 1); - else if (idx == 2) - return __builtin_aiev2_ext_I256_I1024(a, 2); - else - return __builtin_aiev2_ext_I256_I1024(a, 3); +// v32float + +//! @name Extract 256-bit portion from 1024-bit register +inline INTRINSIC(v8float) extract_v8float(v32float a, int idx) { + return extract_256_1024(a, idx); } -// Insert 256-bit in 1024-bit register -INTRINSIC(v32float) insert(v32float a, int idx, v8float b) { - if (idx == 0) - return __builtin_aiev2_upd_I1024_I256(a, b, 0); - else if (idx == 1) - return __builtin_aiev2_upd_I1024_I256(a, b, 1); - else if (idx == 2) - return __builtin_aiev2_upd_I1024_I256(a, b, 2); - else - return __builtin_aiev2_upd_I1024_I256(a, b, 3); +//! @name Insert 256-bit in 1024-bit register +inline INTRINSIC(v32float) insert(v32float a, int idx, v8float b) { + return insert_256_1024(a, idx, b); } -// Set 256-bit portion of 1024-bit register -INTRINSIC(v32float) set_v32float(int idx, v8float b) { - if (idx == 0) - return __builtin_aiev2_set_I1024_I256(b, 0); - else if (idx == 1) - return __builtin_aiev2_set_I1024_I256(b, 1); - else if (idx == 2) - return __builtin_aiev2_set_I1024_I256(b, 2); - else - return __builtin_aiev2_set_I1024_I256(b, 3); +//! 
@name Set 256-bit portion of 1024-bit register +inline INTRINSIC(v32float) set_v32float(int idx, v8float b) { + return set_256_1024(idx, b); } INTRINSIC(v32float) concat(v8float a0, v8float a1, v8float a2, v8float a3) { - return __builtin_aiev2_concat_I1024_I256(a0, a1, a2, a3); + return concat_256_1024(a0, a1, a2, a3); } -// Extract 512-bit portion from 1024-bit register -INTRINSIC(v16float) extract_v16float(v32float a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_I512_I1024(a, 0); - else - return __builtin_aiev2_ext_I512_I1024(a, 1); +// v32float + +//! @name Extract 512-bit portion from 1024-bit register +inline INTRINSIC(v16float) extract_v16float(v32float a, int idx) { + return extract_512_1024(a, idx); } -// Insert 512-bit in 1024-bit register -INTRINSIC(v32float) insert(v32float a, int idx, v16float b) { - if (idx == 0) - return __builtin_aiev2_upd_I1024_I512(a, b, 0); - else - return __builtin_aiev2_upd_I1024_I512(a, b, 1); +//! @name Insert 512-bit in 1024-bit register +inline INTRINSIC(v32float) insert(v32float a, int idx, v16float b) { + return insert_512_1024(a, idx, b); } -// Set 512-bit portion of 1024-bit register -INTRINSIC(v32float) set_v32float(int idx, v16float b) { - if (idx == 0) - return __builtin_aiev2_set_I1024_I512(b, 0); - else - return __builtin_aiev2_set_I1024_I512(b, 1); +//! @name Set 512-bit portion of 1024-bit register +inline INTRINSIC(v32float) set_v32float(int idx, v16float b) { + return set_512_1024(idx, b); } INTRINSIC(v32float) concat(v16float a0, v16float a1) { - return __builtin_aiev2_concat_I1024_I512(a0, a1); + return concat_512_1024(a0, a1); } -// Extract 256-bit portion from 1024-bit register -INTRINSIC(v8acc32) extract_v8acc32(v32acc32 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_ACC256_ACC1024(a, 0); - else if (idx == 1) - return __builtin_aiev2_ext_ACC256_ACC1024(a, 1); - else if (idx == 2) - return __builtin_aiev2_ext_ACC256_ACC1024(a, 2); - else - return __builtin_aiev2_ext_ACC256_ACC1024(a, 3); +// v32acc32 + +//! @name Extract 256-bit portion from 1024-bit register +inline INTRINSIC(v8acc32) extract_v8acc32(v32acc32 a, int idx) { + return extract_256_1024(a, idx); } -// Insert 256-bit in 1024-bit register -INTRINSIC(v32acc32) insert(v32acc32 a, int idx, v8acc32 b) { - if (idx == 0) - return __builtin_aiev2_upd_ACC1024_ACC256(a, b, 0); - else if (idx == 1) - return __builtin_aiev2_upd_ACC1024_ACC256(a, b, 1); - else if (idx == 2) - return __builtin_aiev2_upd_ACC1024_ACC256(a, b, 2); - else - return __builtin_aiev2_upd_ACC1024_ACC256(a, b, 3); +//! @name Insert 256-bit in 1024-bit register +inline INTRINSIC(v32acc32) insert(v32acc32 a, int idx, v8acc32 b) { + return insert_256_1024(a, idx, b); } -// Set 256-bit portion of 1024-bit register -INTRINSIC(v32acc32) set_v32acc32(int idx, v8acc32 b) { - if (idx == 0) - return __builtin_aiev2_set_ACC1024_ACC256(b, 0); - else if (idx == 1) - return __builtin_aiev2_set_ACC1024_ACC256(b, 1); - else if (idx == 2) - return __builtin_aiev2_set_ACC1024_ACC256(b, 2); - else - return __builtin_aiev2_set_ACC1024_ACC256(b, 3); +//! 
@name Set 256-bit portion of 1024-bit register +inline INTRINSIC(v32acc32) set_v32acc32(int idx, v8acc32 b) { + return set_256_1024(idx, b); } INTRINSIC(v32acc32) concat(v8acc32 a0, v8acc32 a1, v8acc32 a2, v8acc32 a3) { - return __builtin_aiev2_concat_ACC1024_ACC256(a0, a1, a2, a3); + return concat_256_1024(a0, a1, a2, a3); } -// Extract 512-bit portion from 1024-bit register -INTRINSIC(v16acc32) extract_v16acc32(v32acc32 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_ACC512_ACC1024(a, 0); - else - return __builtin_aiev2_ext_ACC512_ACC1024(a, 1); +// v32acc32 + +//! @name Extract 512-bit portion from 1024-bit register +inline INTRINSIC(v16acc32) extract_v16acc32(v32acc32 a, int idx) { + return extract_512_1024(a, idx); } -// Insert 512-bit in 1024-bit register -INTRINSIC(v32acc32) insert(v32acc32 a, int idx, v16acc32 b) { - if (idx == 0) - return __builtin_aiev2_upd_ACC1024_ACC512(a, b, 0); - else - return __builtin_aiev2_upd_ACC1024_ACC512(a, b, 1); +//! @name Insert 512-bit in 1024-bit register +inline INTRINSIC(v32acc32) insert(v32acc32 a, int idx, v16acc32 b) { + return insert_512_1024(a, idx, b); } -// Set 512-bit portion of 1024-bit register -INTRINSIC(v32acc32) set_v32acc32(int idx, v16acc32 b) { - if (idx == 0) - return __builtin_aiev2_set_ACC1024_ACC512(b, 0); - else - return __builtin_aiev2_set_ACC1024_ACC512(b, 1); +//! @name Set 512-bit portion of 1024-bit register +inline INTRINSIC(v32acc32) set_v32acc32(int idx, v16acc32 b) { + return set_512_1024(idx, b); } INTRINSIC(v32acc32) concat(v16acc32 a0, v16acc32 a1) { - return __builtin_aiev2_concat_ACC1024_ACC512(a0, a1); + return concat_512_1024(a0, a1); } -// Extract 256-bit portion from 1024-bit register -INTRINSIC(v4acc64) extract_v4acc64(v16acc64 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_ACC256_ACC1024(a, 0); - else if (idx == 1) - return __builtin_aiev2_ext_ACC256_ACC1024(a, 1); - else if (idx == 2) - return __builtin_aiev2_ext_ACC256_ACC1024(a, 2); - else - return __builtin_aiev2_ext_ACC256_ACC1024(a, 3); +// v16acc64 + +//! @name Extract 256-bit portion from 1024-bit register +inline INTRINSIC(v4acc64) extract_v4acc64(v16acc64 a, int idx) { + return extract_256_1024(a, idx); } -// Insert 256-bit in 1024-bit register -INTRINSIC(v16acc64) insert(v16acc64 a, int idx, v4acc64 b) { - if (idx == 0) - return __builtin_aiev2_upd_ACC1024_ACC256(a, b, 0); - else if (idx == 1) - return __builtin_aiev2_upd_ACC1024_ACC256(a, b, 1); - else if (idx == 2) - return __builtin_aiev2_upd_ACC1024_ACC256(a, b, 2); - else - return __builtin_aiev2_upd_ACC1024_ACC256(a, b, 3); +//! @name Insert 256-bit in 1024-bit register +inline INTRINSIC(v16acc64) insert(v16acc64 a, int idx, v4acc64 b) { + return insert_256_1024(a, idx, b); } -// Set 256-bit portion of 1024-bit register -INTRINSIC(v16acc64) set_v16acc64(int idx, v4acc64 b) { - if (idx == 0) - return __builtin_aiev2_set_ACC1024_ACC256(b, 0); - else if (idx == 1) - return __builtin_aiev2_set_ACC1024_ACC256(b, 1); - else if (idx == 2) - return __builtin_aiev2_set_ACC1024_ACC256(b, 2); - else - return __builtin_aiev2_set_ACC1024_ACC256(b, 3); +//! 
@name Set 256-bit portion of 1024-bit register +inline INTRINSIC(v16acc64) set_v16acc64(int idx, v4acc64 b) { + return set_256_1024(idx, b); } INTRINSIC(v16acc64) concat(v4acc64 a0, v4acc64 a1, v4acc64 a2, v4acc64 a3) { - return __builtin_aiev2_concat_ACC1024_ACC256(a0, a1, a2, a3); + return concat_256_1024(a0, a1, a2, a3); } -// Extract 512-bit portion from 1024-bit register -INTRINSIC(v8acc64) extract_v8acc64(v16acc64 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_ACC512_ACC1024(a, 0); - else - return __builtin_aiev2_ext_ACC512_ACC1024(a, 1); +// v16acc64 + +//! @name Extract 512-bit portion from 1024-bit register +inline INTRINSIC(v8acc64) extract_v8acc64(v16acc64 a, int idx) { + return extract_512_1024(a, idx); } -// Insert 512-bit in 1024-bit register -INTRINSIC(v16acc64) insert(v16acc64 a, int idx, v8acc64 b) { - if (idx == 0) - return __builtin_aiev2_upd_ACC1024_ACC512(a, b, 0); - else - return __builtin_aiev2_upd_ACC1024_ACC512(a, b, 1); +//! @name Insert 512-bit in 1024-bit register +inline INTRINSIC(v16acc64) insert(v16acc64 a, int idx, v8acc64 b) { + return insert_512_1024(a, idx, b); } -// Set 512-bit portion of 1024-bit register -INTRINSIC(v16acc64) set_v16acc64(int idx, v8acc64 b) { - if (idx == 0) - return __builtin_aiev2_set_ACC1024_ACC512(b, 0); - else - return __builtin_aiev2_set_ACC1024_ACC512(b, 1); +//! @name Set 512-bit portion of 1024-bit register +inline INTRINSIC(v16acc64) set_v16acc64(int idx, v8acc64 b) { + return set_512_1024(idx, b); } INTRINSIC(v16acc64) concat(v8acc64 a0, v8acc64 a1) { - return __builtin_aiev2_concat_ACC1024_ACC512(a0, a1); + return concat_512_1024(a0, a1); } +// v16cfloat + #if 0 -// Extract 256-bit portion from 1024-bit register -INTRINSIC(v2cacc64) extract_v2cacc64(v8cacc64 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_c_256_1024(a, 0); - else if (idx == 1) - return __builtin_aiev2_ext_c_256_1024(a, 1); - else if (idx == 2) - return __builtin_aiev2_ext_c_256_1024(a, 2); - else - return __builtin_aiev2_ext_c_256_1024(a, 3); + +//! @name Extract 256-bit portion from 1024-bit register +inline INTRINSIC(v4cfloat) extract_v4cfloat (v16cfloat a, int idx) +{ + return extract_256_1024(a, idx); } -// Insert 256-bit in 1024-bit register -INTRINSIC(v8cacc64) insert(v8cacc64 a, int idx, v2cacc64 b) { - if (idx == 0) - return __builtin_aiev2_upd_c_1024_256(a, b, 0); - else if (idx == 1) - return __builtin_aiev2_upd_c_1024_256(a, b, 1); - else if (idx == 2) - return __builtin_aiev2_upd_c_1024_256(a, b, 2); - else - return __builtin_aiev2_upd_c_1024_256(a, b, 3); +//! @name Insert 256-bit in 1024-bit register +inline INTRINSIC(v16cfloat) insert (v16cfloat a, int idx, v4cfloat b) +{ + return insert_256_1024(a, idx, b); } -// Set 256-bit portion of 1024-bit register -INTRINSIC(v8cacc64) set_v8cacc64(int idx, v2cacc64 b) { - if (idx == 0) - return __builtin_aiev2_set_c_1024_256(b, 0); - else if (idx == 1) - return __builtin_aiev2_set_c_1024_256(b, 1); - else if (idx == 2) - return __builtin_aiev2_set_c_1024_256(b, 2); - else - return __builtin_aiev2_set_c_1024_256(b, 3); +//! 
@name Set 256-bit portion of 1024-bit register +inline INTRINSIC(v16cfloat) set_v16cfloat (int idx, v4cfloat b) +{ + return set_256_1024(idx, b); } -INTRINSIC(v8cacc64) concat(v2cacc64 a0, v2cacc64 a1, v2cacc64 a2, v2cacc64 a3) { - return __builtin_aiev2_concat_cm_am(a0, a1, a2, a3); +INTRINSIC(v16cfloat) concat (v4cfloat a0, v4cfloat a1, v4cfloat a2, v4cfloat a3) +{ + return concat_256_1024(a0, a1, a2, a3); } -// Extract 512-bit portion from 1024-bit register -INTRINSIC(v4cacc64) extract_v4cacc64(v8cacc64 a, int idx) { - if (idx == 0) - return __builtin_aiev2_ext_c_512_1024(a, 0); - else - return __builtin_aiev2_ext_c_512_1024(a, 1); +#endif + +// v16cfloat + +#if 0 + +//! @name Extract 512-bit portion from 1024-bit register +inline INTRINSIC(v8cfloat) extract_v8cfloat (v16cfloat a, int idx) +{ + return extract_512_1024(a, idx); } -// Insert 512-bit in 1024-bit register -INTRINSIC(v8cacc64) insert(v8cacc64 a, int idx, v4cacc64 b) { - if (idx == 0) - return __builtin_aiev2_upd_c_1024_512(a, b, 0); - else - return __builtin_aiev2_upd_c_1024_512(a, b, 1); +//! @name Insert 512-bit in 1024-bit register +inline INTRINSIC(v16cfloat) insert (v16cfloat a, int idx, v8cfloat b) +{ + return insert_512_1024(a, idx, b); } -// Set 512-bit portion of 1024-bit register -INTRINSIC(v8cacc64) set_v8cacc64(int idx, v4cacc64 b) { - if (idx == 0) - return __builtin_aiev2_set_c_1024_512(b, 0); - else - return __builtin_aiev2_set_c_1024_512(b, 1); +//! @name Set 512-bit portion of 1024-bit register +inline INTRINSIC(v16cfloat) set_v16cfloat (int idx, v8cfloat b) +{ + return set_512_1024(idx, b); } -INTRINSIC(v8cacc64) concat(v4cacc64 a0, v4cacc64 a1) { - return __builtin_aiev2_concat_cm_bm(a0, a1); +INTRINSIC(v16cfloat) concat (v8cfloat a0, v8cfloat a1) +{ + return concat_512_1024(a0, a1); } + #endif +// Generic 128-bit extract primitives + +inline v4int32 extract_128_256(v8int32 a, int idx) { + if (idx % 2 == 0) { + return __builtin_shufflevector(a, a, 0, 1, 2, 3); + } else { + return __builtin_shufflevector(a, a, 4, 5, 6, 7); + } +} + +inline v4int32 extract_128_512(v16int32 a, int idx) { + if (idx % 4 == 0) { + return __builtin_shufflevector(a, a, 0, 1, 2, 3); + } + if (idx % 4 == 1) { + return __builtin_shufflevector(a, a, 4, 5, 6, 7); + } + if (idx % 4 == 2) { + return __builtin_shufflevector(a, a, 8, 9, 10, 11); + } else { + return __builtin_shufflevector(a, a, 12, 13, 14, 15); + } +} + +// Generic 128-bit insert primitives + +inline v8int32 insert_128_256(v8int32 a, int idx, v4int32 b) { + v4int32 undef_128; + + v8int32 tmp_256; + + tmp_256 = __builtin_shufflevector(b, undef_128, 0, 1, 2, 3, 4, 5, 6, 7); + + if (idx % 2 == 0) { + return __builtin_shufflevector(tmp_256, a, 0, 1, 2, 3, 12, 13, 14, 15); + } else { + return __builtin_shufflevector(tmp_256, a, 8, 9, 10, 11, 0, 1, 2, 3); + } +} + +inline v16int32 insert_128_512(v16int32 a, int idx, v4int32 b) { + v4int32 undef_128; + v8int32 undef_256; + + v8int32 tmp_256; + v16int32 tmp_512; + + tmp_256 = __builtin_shufflevector(b, undef_128, 0, 1, 2, 3, 4, 5, 6, 7); + tmp_512 = __builtin_shufflevector(tmp_256, undef_256, 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + + if (idx % 4 == 0) { + return __builtin_shufflevector(tmp_512, a, 0, 1, 2, 3, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31); + } + if (idx % 4 == 1) { + return __builtin_shufflevector(tmp_512, a, 16, 17, 18, 19, 0, 1, 2, 3, 24, + 25, 26, 27, 28, 29, 30, 31); + } + if (idx % 4 == 2) { + return __builtin_shufflevector(tmp_512, a, 16, 17, 18, 19, 20, 21, 22, 23, + 0, 1, 2, 3, 28, 
29, 30, 31); + } else { + return __builtin_shufflevector(tmp_512, a, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 0, 1, 2, 3); + } +} + +// Generic 128-bit set primitives + +inline v8int32 set_128_256(int idx, v4int32 b) { + v4int32 tmp0; + if (idx % 2 == 0) { + return __builtin_shufflevector(b, tmp0, 0, 1, 2, 3, 4, 5, 6, 7); + } else { + + return __builtin_shufflevector(tmp0, b, 0, 1, 2, 3, 4, 5, 6, 7); + } +} + +inline v16int32 set_128_512(int idx, v4int32 b) { + v4int32 tmp0; + v8int32 tmp1; + v8int32 undef1; + if (idx % 4 == 0) { + tmp1 = __builtin_shufflevector(b, tmp0, 0, 1, 2, 3, 4, 5, 6, 7); + return __builtin_shufflevector(tmp1, undef1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15); + } + if (idx % 4 == 1) { + tmp1 = __builtin_shufflevector(tmp0, b, 0, 1, 2, 3, 4, 5, 6, 7); + return __builtin_shufflevector(tmp1, undef1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15); + } + if (idx % 4 == 2) { + tmp1 = __builtin_shufflevector(b, tmp0, 0, 1, 2, 3, 4, 5, 6, 7); + return __builtin_shufflevector(undef1, tmp1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15); + } else { + + tmp1 = __builtin_shufflevector(tmp0, b, 0, 1, 2, 3, 4, 5, 6, 7); + return __builtin_shufflevector(undef1, tmp1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15); + } +} + +// Generic 128-bit concat primitives + +inline v8int32 concat_128_256(v4int32 a0, v4int32 a1) { + + return __builtin_shufflevector(a0, a1, 0, 1, 2, 3, 4, 5, 6, 7); +} + +inline v16int32 concat_128_512(v4int32 a0, v4int32 a1, v4int32 a2, v4int32 a3) { + v8int32 tmp0; + v8int32 tmp1; + + tmp0 = __builtin_shufflevector(a0, a1, 0, 1, 2, 3, 4, 5, 6, 7); + tmp1 = __builtin_shufflevector(a2, a3, 0, 1, 2, 3, 4, 5, 6, 7); + + return __builtin_shufflevector(tmp0, tmp1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15); +} + #define DIAGNOSE_EXTRACT_IDX(MAX) \ __attribute__((diagnose_if(idx < 0 || idx > MAX, \ "index out of range [0," #MAX "]", "error"))) -// Extract 128-bit portion from 512-bit register +//! @name Extract 128-bit portion from 256-bit register INTRINSIC(v32uint4) extract_v32uint4(v128uint4 a, int idx) DIAGNOSE_EXTRACT_IDX(3) { - v128uint4 dest = - __builtin_aiev2_vshift_I512_I512(a, undef_v128uint4(), 0x0, 16 * idx); - // extract 128-bit from 512-bit dest. - return __builtin_aiev2_extract_I128_I512(dest); + return extract_128_512(a, idx); } +//! @name Extract 128-bit portion from 256-bit register INTRINSIC(v32int4) extract_v32int4(v128int4 a, int idx) DIAGNOSE_EXTRACT_IDX(3) { - v128int4 dest = - __builtin_aiev2_vshift_I512_I512(a, undef_v128int4(), 0x0, 16 * idx); - // extract 128-bit from 512-bit dest. - return __builtin_aiev2_extract_I128_I512(dest); + return extract_128_512(a, idx); } +//! @name Extract 128-bit portion from 256-bit register INTRINSIC(v16uint8) extract_v16uint8(v64uint8 a, int idx) DIAGNOSE_EXTRACT_IDX(3) { - v64uint8 dest = - __builtin_aiev2_vshift_I512_I512(a, undef_v64uint8(), 0x0, 16 * idx); - // extract 128-bit from 512-bit dest. - return __builtin_aiev2_extract_I128_I512(dest); + return extract_128_512(a, idx); } +//! @name Extract 128-bit portion from 256-bit register INTRINSIC(v16int8) extract_v16int8(v64int8 a, int idx) DIAGNOSE_EXTRACT_IDX(3) { - v64int8 dest = - __builtin_aiev2_vshift_I512_I512(a, undef_v64int8(), 0x0, 16 * idx); - // extract 128-bit from 512-bit dest. - return __builtin_aiev2_extract_I128_I512(dest); + return extract_128_512(a, idx); } +//! 
@name Extract 128-bit portion from 256-bit register INTRINSIC(v8uint16) extract_v8uint16(v32uint16 a, int idx) DIAGNOSE_EXTRACT_IDX(3) { - v32uint16 dest = - __builtin_aiev2_vshift_I512_I512(a, undef_v32uint16(), 0x0, 16 * idx); - // extract 128-bit from 512-bit dest. - return __builtin_aiev2_extract_I128_I512(dest); + return extract_128_512(a, idx); } +//! @name Extract 128-bit portion from 256-bit register INTRINSIC(v8int16) extract_v8int16(v32int16 a, int idx) DIAGNOSE_EXTRACT_IDX(3) { - v32int16 dest = - __builtin_aiev2_vshift_I512_I512(a, undef_v32int16(), 0x0, 16 * idx); - // extract 128-bit from 512-bit dest. - return __builtin_aiev2_extract_I128_I512(dest); + return extract_128_512(a, idx); } -#if 0 -INTRINSIC(v4cint16) extract_v4cint16(v16cint16 a, int idx) DIAGNOSE_EXTRACT_IDX(3) { - v16cint16 dest = __builtin_aiev2_vshift_I512_I512(a, undef_v16cint16(), 0x0, 16 * idx); - // extract 128-bit from 512-bit dest. - return __builtin_aiev2_extract_I128_I512(dest); -} -#endif - +//! @name Extract 128-bit portion from 256-bit register INTRINSIC(v4uint32) extract_v4uint32(v16uint32 a, int idx) DIAGNOSE_EXTRACT_IDX(3) { - v16uint32 dest = - __builtin_aiev2_vshift_I512_I512(a, undef_v16uint32(), 0x0, 16 * idx); - // extract 128-bit from 512-bit dest. - return __builtin_aiev2_extract_I128_I512(dest); + return extract_128_512(a, idx); } +//! @name Extract 128-bit portion from 256-bit register INTRINSIC(v4int32) extract_v4int32(v16int32 a, int idx) DIAGNOSE_EXTRACT_IDX(3) { - v16int32 dest = - __builtin_aiev2_vshift_I512_I512(a, undef_v16int32(), 0x0, 16 * idx); - // extract 128-bit from 512-bit dest. - return __builtin_aiev2_extract_I128_I512(dest); + return extract_128_512(a, idx); } -#if 0 -INTRINSIC(v2cint32) extract_v2cint32(v8cint32 a, int idx) DIAGNOSE_EXTRACT_IDX(3) { - v8cint32 dest = __builtin_aiev2_vshift_I512_I512(a, undef_v8cint32(), - 0x0, 16 * idx); - // extract 128-bit from 512-bit dest. - return __builtin_aiev2_extract_I128_I512(dest); +//! @name Extract 128-bit portion from 256-bit register +INTRINSIC(v8bfloat16) +extract_v8bfloat16(v32bfloat16 a, int idx) DIAGNOSE_EXTRACT_IDX(3) { + return extract_128_512(a, idx); } -#endif +//! @name Extract 128-bit portion from 256-bit register INTRINSIC(v4float) extract_v4float(v16float a, int idx) DIAGNOSE_EXTRACT_IDX(3) { - v16float dest = - __builtin_aiev2_vshift_I512_I512(a, undef_v16float(), 0x0, 16 * idx); - // extract 128-bit from 512-bit dest. - return __builtin_aiev2_extract_I128_I512(dest); -} - -INTRINSIC(v8bfloat16) -extract_v8bfloat16(v32bfloat16 a, int idx) DIAGNOSE_EXTRACT_IDX(3) { - v32bfloat16 dest = - __builtin_aiev2_vshift_I512_I512(a, undef_v32bfloat16(), 0x0, 16 * idx); - // extract 128-bit from 512-bit dest. - return __builtin_aiev2_extract_I128_I512(dest); + return extract_128_512(a, idx); } -// Set 128-bit portion of 256-bit register +//! @name Set 128-bit portion of 256-bit register +//! @name Set 128-bit portion of 256-bit register INTRINSIC(v128uint4) set_v128uint4(int idx, v32uint4 a) DIAGNOSE_EXTRACT_IDX(3) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_set_I512_I128(a); - else { - // set 512-bit source from 128-bit src - v128uint4 src = __builtin_aiev2_set_I512_I128(a); - return __builtin_aiev2_vshift_I512_I512(undef_v128uint4(), src, 0x0, - (4 - idx) * 16); - } + return set_128_512(idx, a); } +//! 
@name Set 128-bit portion of 256-bit register INTRINSIC(v128int4) set_v128int4(int idx, v32int4 a) DIAGNOSE_EXTRACT_IDX(3) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_set_I512_I128(a); - else { - // set 512-bit source from 128-bit src - v128int4 src = __builtin_aiev2_set_I512_I128(a); - return __builtin_aiev2_vshift_I512_I512(undef_v128int4(), src, 0x0, - (4 - idx) * 16); - } + return set_128_512(idx, a); } +//! @name Set 128-bit portion of 256-bit register INTRINSIC(v64uint8) set_v64uint8(int idx, v16uint8 a) DIAGNOSE_EXTRACT_IDX(3) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_set_I512_I128(a); - else { - // set 512-bit source from 128-bit src - v64uint8 src = __builtin_aiev2_set_I512_I128(a); - return __builtin_aiev2_vshift_I512_I512(undef_v64uint8(), src, 0x0, - (4 - idx) * 16); - } + return set_128_512(idx, a); } +//! @name Set 128-bit portion of 256-bit register INTRINSIC(v64int8) set_v64int8(int idx, v16int8 a) DIAGNOSE_EXTRACT_IDX(3) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_set_I512_I128(a); - else { - // set 512-bit source from 128-bit src - v64int8 src = __builtin_aiev2_set_I512_I128(a); - return __builtin_aiev2_vshift_I512_I512(undef_v64int8(), src, 0x0, - (4 - idx) * 16); - } + return set_128_512(idx, a); } +//! @name Set 128-bit portion of 256-bit register INTRINSIC(v32uint16) set_v32uint16(int idx, v8uint16 a) DIAGNOSE_EXTRACT_IDX(3) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_set_I512_I128(a); - else { - // set 512-bit source from 128-bit src - v32uint16 src = __builtin_aiev2_set_I512_I128(a); - return __builtin_aiev2_vshift_I512_I512(undef_v32uint16(), src, 0x0, - (4 - idx) * 16); - } + return set_128_512(idx, a); } +//! @name Set 128-bit portion of 256-bit register INTRINSIC(v32int16) set_v32int16(int idx, v8int16 a) DIAGNOSE_EXTRACT_IDX(3) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_set_I512_I128(a); - else { - // set 512-bit source from 128-bit src - v32int16 src = __builtin_aiev2_set_I512_I128(a); - return __builtin_aiev2_vshift_I512_I512(undef_v32int16(), src, 0x0, - (4 - idx) * 16); - } + return set_128_512(idx, a); } -#if 0 -INTRINSIC(v16cint16) set_v16cint16(int idx, v4cint16 a) DIAGNOSE_EXTRACT_IDX(3) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_set_I512_I128(a); - else { - // set 512-bit source from 128-bit src - v16cint16 src = __builtin_aiev2_set_I512_I128(a); - return __builtin_aiev2_vshift_I512_I512(undef_v16cint16(), src, 0x0, (4 - idx) * 16); - } -} -#endif - +//! @name Set 128-bit portion of 256-bit register INTRINSIC(v16uint32) set_v16uint32(int idx, v4uint32 a) DIAGNOSE_EXTRACT_IDX(3) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_set_I512_I128(a); - else { - // set 512-bit source from 128-bit src - v16uint32 src = __builtin_aiev2_set_I512_I128(a); - return __builtin_aiev2_vshift_I512_I512(undef_v16uint32(), src, 0x0, - (4 - idx) * 16); - } + return set_128_512(idx, a); } +//! @name Set 128-bit portion of 256-bit register INTRINSIC(v16int32) set_v16int32(int idx, v4int32 a) DIAGNOSE_EXTRACT_IDX(3) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_set_I512_I128(a); - else { - // set 512-bit source from 128-bit src - v16int32 src = __builtin_aiev2_set_I512_I128(a); - return __builtin_aiev2_vshift_I512_I512(undef_v16int32(), src, 0x0, - (4 - idx) * 16); - } + return set_128_512(idx, a); } +//! 
@name Set 128-bit portion of 256-bit register INTRINSIC(v32bfloat16) set_v32bfloat16(int idx, v8bfloat16 a) DIAGNOSE_EXTRACT_IDX(3) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_set_I512_I128(a); - else { - // set 512-bit source from 128-bit src - v32bfloat16 src = __builtin_aiev2_set_I512_I128(a); - return __builtin_aiev2_vshift_I512_I512(undef_v32bfloat16(), src, 0x0, - (4 - idx) * 16); - } -} - -#if 0 -INTRINSIC(v8cint32) set_v8cint32(int idx, v2cint32 a) DIAGNOSE_EXTRACT_IDX(3) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_set_I512_I128(a); - else { - // set 512-bit source from 128-bit src - v8cint32 src = __builtin_aiev2_set_I512_I128(a); - return __builtin_aiev2_vshift_I512_I512(undef_v8cint32(), src, 0x0, - (4 - idx) * 16); - } + return set_128_512(idx, a); } -#endif +//! @name Set 128-bit portion of 256-bit register INTRINSIC(v16float) set_v16float(int idx, v4float a) DIAGNOSE_EXTRACT_IDX(3) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_set_I512_I128(a); - else { - // set 512-bit source from 128-bit src - v16float src = __builtin_aiev2_set_I512_I128(a); - return __builtin_aiev2_vshift_I512_I512(undef_v16float(), src, 0x0, - (4 - idx) * 16); - } + return set_128_512(idx, a); } -// Extract 128-bit portion from 512-bit register +//! @name Extract 128-bit portion from 256-bit register INTRINSIC(v32uint4) extract_v32uint4(v64uint4 a, int idx) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v32uint4(set_v128uint4(0, a), idx); + return extract_128_256(a, idx); } -// Extract 128-bit portion from 512-bit register +//! @name Extract 128-bit portion from 256-bit register INTRINSIC(v32int4) extract_v32int4(v64int4 a, int idx) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v32int4(set_v128int4(0, a), idx); + return extract_128_256(a, idx); } -// Extract 128-bit portion from 512-bit register +//! @name Extract 128-bit portion from 256-bit register INTRINSIC(v16uint8) extract_v16uint8(v32uint8 a, int idx) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v16uint8(set_v64uint8(0, a), idx); + return extract_128_256(a, idx); } -// Extract 128-bit portion from 512-bit register +//! @name Extract 128-bit portion from 256-bit register INTRINSIC(v16int8) extract_v16int8(v32int8 a, int idx) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v16int8(set_v64int8(0, a), idx); + return extract_128_256(a, idx); } -// Extract 128-bit portion from 512-bit register +//! @name Extract 128-bit portion from 256-bit register INTRINSIC(v8uint16) extract_v8uint16(v16uint16 a, int idx) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v8uint16(set_v32uint16(0, a), idx); + return extract_128_256(a, idx); } -// Extract 128-bit portion from 512-bit register +//! @name Extract 128-bit portion from 256-bit register INTRINSIC(v8int16) extract_v8int16(v16int16 a, int idx) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v8int16(set_v32int16(0, a), idx); + return extract_128_256(a, idx); } -#if 0 -// Extract 128-bit portion from 512-bit register -INTRINSIC(v4cint16) extract_v4cint16(v8cint16 a, int idx) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v4cint16(set_v16cint16(0, a), idx); -} -#endif - -// Extract 128-bit portion from 512-bit register +//! @name Extract 128-bit portion from 256-bit register INTRINSIC(v4uint32) extract_v4uint32(v8uint32 a, int idx) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v4uint32(set_v16uint32(0, a), idx); + return extract_128_256(a, idx); } -// Extract 128-bit portion from 512-bit register +//! 
@name Extract 128-bit portion from 256-bit register INTRINSIC(v4int32) extract_v4int32(v8int32 a, int idx) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v4int32(set_v16int32(0, a), idx); + return extract_128_256(a, idx); } -// Extract 128-bit portion from 512-bit register +//! @name Extract 128-bit portion from 256-bit register INTRINSIC(v8bfloat16) extract_v8bfloat16(v16bfloat16 a, int idx) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v8bfloat16(set_v32bfloat16(0, a), idx); + return extract_128_256(a, idx); } -#if 0 -// Extract 128-bit portion from 512-bit register -INTRINSIC(v2cint32) extract_v2cint32(v4cint32 a, int idx) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v2cint32(set_v8cint32(0, a), idx); -} -#endif - -// Extract 128-bit portion from 512-bit register +//! @name Extract 128-bit portion from 256-bit register INTRINSIC(v4float) extract_v4float(v8float a, int idx) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v4float(set_v16float(0, a), idx); + return extract_128_256(a, idx); } -// Set 128-bit portion of 512-bit register +//! @name Set 128-bit portion of 256-bit register INTRINSIC(v64uint4) set_v64uint4(int idx, v32uint4 a) DIAGNOSE_EXTRACT_IDX(1) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_get_I256_I128(a); - else - return extract_v64uint4(set_v128uint4(idx, a), 0); + return set_128_256(idx, a); } -// Set 128-bit portion of 512-bit register +//! @name Set 128-bit portion of 256-bit register INTRINSIC(v64int4) set_v64int4(int idx, v32int4 a) DIAGNOSE_EXTRACT_IDX(1) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_get_I256_I128(a); - else - return extract_v64int4(set_v128int4(idx, a), 0); + return set_128_256(idx, a); } -// Set 128-bit portion of 512-bit register +//! @name Set 128-bit portion of 256-bit register INTRINSIC(v32uint8) set_v32uint8(int idx, v16uint8 a) DIAGNOSE_EXTRACT_IDX(1) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_get_I256_I128(a); - else - return extract_v32uint8(set_v64uint8(idx, a), 0); + return set_128_256(idx, a); } -// Set 128-bit portion of 512-bit register +//! @name Set 128-bit portion of 256-bit register INTRINSIC(v32int8) set_v32int8(int idx, v16int8 a) DIAGNOSE_EXTRACT_IDX(1) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_get_I256_I128(a); - else - return extract_v32int8(set_v64int8(idx, a), 0); + return set_128_256(idx, a); } -// Set 128-bit portion of 512-bit register +//! @name Set 128-bit portion of 256-bit register INTRINSIC(v16uint16) set_v16uint16(int idx, v8uint16 a) DIAGNOSE_EXTRACT_IDX(1) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_get_I256_I128(a); - else - return extract_v16uint16(set_v32uint16(idx, a), 0); + return set_128_256(idx, a); } -// Set 128-bit portion of 512-bit register +//! @name Set 128-bit portion of 256-bit register INTRINSIC(v16int16) set_v16int16(int idx, v8int16 a) DIAGNOSE_EXTRACT_IDX(1) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_get_I256_I128(a); - else - return extract_v16int16(set_v32int16(idx, a), 0); + return set_128_256(idx, a); } -#if 0 -// Set 128-bit portion of 512-bit register -INTRINSIC(v8cint16) set_v8cint16(int idx, v4cint16 a) DIAGNOSE_EXTRACT_IDX(1) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_get_I256_I128(a); - else - return extract_v8cint16(set_v16cint16(idx, a), 0); -} -#endif - -// Set 128-bit portion of 512-bit register +//! 
@name Set 128-bit portion of 256-bit register INTRINSIC(v8uint32) set_v8uint32(int idx, v4uint32 a) DIAGNOSE_EXTRACT_IDX(1) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_get_I256_I128(a); - else - return extract_v8uint32(set_v16uint32(idx, a), 0); + return set_128_256(idx, a); } -// Set 128-bit portion of 512-bit register +//! @name Set 128-bit portion of 256-bit register INTRINSIC(v8int32) set_v8int32(int idx, v4int32 a) DIAGNOSE_EXTRACT_IDX(1) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_get_I256_I128(a); - else - return extract_v8int32(set_v16int32(idx, a), 0); + return set_128_256(idx, a); } -// Set 128-bit portion of 512-bit register -INTRINSIC(v16bfloat16) set_v16bfloat16(int idx, v8bfloat16 a) { - return set_v16int16(idx, a); +//! @name Set 128-bit portion of 256-bit register +INTRINSIC(v16bfloat16) +set_v16bfloat16(int idx, v8bfloat16 a) DIAGNOSE_EXTRACT_IDX(1) { + return set_128_256(idx, a); } -#if 0 -// Set 128-bit portion of 512-bit register -INTRINSIC(v4cint32) set_v4cint32(int idx, v2cint32 a) DIAGNOSE_EXTRACT_IDX(1) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_get_I256_I128(a); - else - return extract_v4cint32(set_v8cint32(idx, a), 0); -} -#endif - -// Set 128-bit portion of 512-bit register +//! @name Set 128-bit portion of 256-bit register INTRINSIC(v8float) set_v8float(int idx, v4float a) DIAGNOSE_EXTRACT_IDX(1) { - if (__builtin_constant_p(idx) && (idx == 0)) - return __builtin_aiev2_get_I256_I128(a); - else - return extract_v8float(set_v16float(idx, a), 0); + return set_128_256(idx, a); } -// Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register INTRINSIC(v128uint4) insert(v128uint4 v, int idx, v32uint4 b) DIAGNOSE_EXTRACT_IDX(3) { - v16int32 tmp = (v16int32)set_v128uint4(idx, b); - - const unsigned mask_elems = 4; - const unsigned mask_base = (1u << mask_elems) - 1u; - const unsigned mask = mask_base << (mask_elems * idx); - - return (v128uint4)__builtin_aiev2_vsel32(v, tmp, mask); + return insert_128_512(v, idx, b); } -// Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register INTRINSIC(v128int4) insert(v128int4 v, int idx, v32int4 b) DIAGNOSE_EXTRACT_IDX(3) { - v16int32 tmp = (v16int32)set_v128int4(idx, b); - - const unsigned mask_elems = 4; - const unsigned mask_base = (1u << mask_elems) - 1u; - const unsigned mask = mask_base << (mask_elems * idx); - - return (v128int4)__builtin_aiev2_vsel32(v, tmp, mask); + return insert_128_512(v, idx, b); } -// Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register INTRINSIC(v64uint8) insert(v64uint8 v, int idx, v16uint8 b) DIAGNOSE_EXTRACT_IDX(3) { - v16int32 tmp = (v16int32)set_v64uint8(idx, b); - - const unsigned mask_elems = 4; - const unsigned mask_base = (1u << mask_elems) - 1u; - const unsigned mask = mask_base << (mask_elems * idx); - - return (v64uint8)__builtin_aiev2_vsel32(v, tmp, mask); + return insert_128_512(v, idx, b); } -// Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register +//! 
@name Insert 128-bit in 512-bit register INTRINSIC(v64int8) insert(v64int8 v, int idx, v16int8 b) DIAGNOSE_EXTRACT_IDX(3) { - v16int32 tmp = (v16int32)set_v64int8(idx, b); - - const unsigned mask_elems = 4; - const unsigned mask_base = (1u << mask_elems) - 1u; - const unsigned mask = mask_base << (mask_elems * idx); - - return (v64int8)__builtin_aiev2_vsel32(v, tmp, mask); + return insert_128_512(v, idx, b); } -// Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register INTRINSIC(v32uint16) insert(v32uint16 v, int idx, v8uint16 b) DIAGNOSE_EXTRACT_IDX(3) { - v16int32 tmp = (v16int32)set_v32uint16(idx, b); - - const unsigned mask_elems = 4; - const unsigned mask_base = (1u << mask_elems) - 1u; - const unsigned mask = mask_base << (mask_elems * idx); - - return (v32uint16)__builtin_aiev2_vsel32(v, tmp, mask); + return insert_128_512(v, idx, b); } -// Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register INTRINSIC(v32int16) insert(v32int16 v, int idx, v8int16 b) DIAGNOSE_EXTRACT_IDX(3) { - v16int32 tmp = (v16int32)set_v32int16(idx, b); - - const unsigned mask_elems = 4; - const unsigned mask_base = (1u << mask_elems) - 1u; - const unsigned mask = mask_base << (mask_elems * idx); - - return (v32int16)__builtin_aiev2_vsel32(v, tmp, mask); + return insert_128_512(v, idx, b); } -#if 0 -// Insert 128-bit in 512-bit register -INTRINSIC(v16cint16) insert(v16cint16 v, int idx, v4cint16 b) DIAGNOSE_EXTRACT_IDX(3) { - v16int32 tmp = (v16int32)set_v16cint16(idx, b); - - const unsigned mask_elems = 4; - const unsigned mask_base = (1u << mask_elems) - 1u; - const unsigned mask = mask_base << (mask_elems * idx); - - return (v16cint16)__builtin_aiev2_vsel32(v, tmp, mask); -} -#endif - -// Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register INTRINSIC(v16uint32) insert(v16uint32 v, int idx, v4uint32 b) DIAGNOSE_EXTRACT_IDX(3) { - v16int32 tmp = (v16int32)set_v16uint32(idx, b); - - const unsigned mask_elems = 4; - const unsigned mask_base = (1u << mask_elems) - 1u; - const unsigned mask = mask_base << (mask_elems * idx); - - return (v16uint32)__builtin_aiev2_vsel32(v, tmp, mask); + return insert_128_512(v, idx, b); } -// Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register INTRINSIC(v16int32) insert(v16int32 v, int idx, v4int32 b) DIAGNOSE_EXTRACT_IDX(3) { - v16int32 tmp = (v16int32)set_v16int32(idx, b); - - const unsigned mask_elems = 4; - const unsigned mask_base = (1u << mask_elems) - 1u; - const unsigned mask = mask_base << (mask_elems * idx); - - return (v16int32)__builtin_aiev2_vsel32(v, tmp, mask); + return insert_128_512(v, idx, b); } -// Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register +//! 
@name Insert 128-bit in 512-bit register INTRINSIC(v32bfloat16) insert(v32bfloat16 v, int idx, v8bfloat16 b) DIAGNOSE_EXTRACT_IDX(3) { - v16int32 tmp = (v16int32)set_v32bfloat16(idx, b); - - const unsigned mask_elems = 4; - const unsigned mask_base = (1u << mask_elems) - 1u; - const unsigned mask = mask_base << (mask_elems * idx); - - return (v32bfloat16)__builtin_aiev2_vsel32(v, tmp, mask); -} - -#if 0 -// Insert 128-bit in 512-bit register -INTRINSIC(v8cint32) insert(v8cint32 v, int idx, v2cint32 b) DIAGNOSE_EXTRACT_IDX(3) { - v16int32 tmp = (v16int32)set_v8cint32(idx, b); - - const unsigned mask_elems = 4; - const unsigned mask_base = (1u << mask_elems) - 1u; - const unsigned mask = mask_base << (mask_elems * idx); - - return (v8cint32)__builtin_aiev2_vsel32(v, tmp, mask); + return insert_128_512(v, idx, b); } -#endif - -// Insert 128-bit in 512-bit register -INTRINSIC(v16float) insert(v16float v, int idx, v4float b) DIAGNOSE_EXTRACT_IDX(3) { - v16int32 tmp = (v16int32)set_v16float(idx, b); - - const unsigned mask_elems = 4; - const unsigned mask_base = (1u << mask_elems) - 1u; - const unsigned mask = mask_base << (mask_elems * idx); - return (v16float)__builtin_aiev2_vsel32(v, tmp, mask); +//! @name Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register +INTRINSIC(v16float) +insert(v16float v, int idx, v4float b) DIAGNOSE_EXTRACT_IDX(3) { + return insert_128_512(v, idx, b); } -// Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register INTRINSIC(v64uint4) insert(v64uint4 a, int idx, v32uint4 b) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v64uint4(insert(set_v128uint4(0, a), idx, b), 0); + return insert_128_256(a, idx, b); } -// Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register INTRINSIC(v64int4) insert(v64int4 a, int idx, v32int4 b) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v64int4(insert(set_v128int4(0, a), idx, b), 0); + return insert_128_256(a, idx, b); } -// Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register INTRINSIC(v32uint8) insert(v32uint8 a, int idx, v16uint8 b) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v32uint8(insert(set_v64uint8(0, a), idx, b), 0); + return insert_128_256(a, idx, b); } -// Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register INTRINSIC(v32int8) insert(v32int8 a, int idx, v16int8 b) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v32int8(insert(set_v64int8(0, a), idx, b), 0); + return insert_128_256(a, idx, b); } -// Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register INTRINSIC(v16uint16) insert(v16uint16 a, int idx, v8uint16 b) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v16uint16(insert(set_v32uint16(0, a), idx, b), 0); + return insert_128_256(a, idx, b); } -// Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register INTRINSIC(v16int16) insert(v16int16 a, int idx, v8int16 b) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v16int16(insert(set_v32int16(0, a), idx, b), 0); + return insert_128_256(a, idx, b); } -#if 0 -// Insert 128-bit in 512-bit register -INTRINSIC(v8cint16) insert(v8cint16 a, int idx, v4cint16 b) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v8cint16(insert(set_v16cint16(0, a), idx, b), 0); -} -#endif - -// Insert 128-bit in 512-bit register +//! 
@name Insert 128-bit in 512-bit register INTRINSIC(v8uint32) insert(v8uint32 a, int idx, v4uint32 b) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v8uint32(insert(set_v16uint32(0, a), idx, b), 0); + return insert_128_256(a, idx, b); } -// Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register INTRINSIC(v8int32) insert(v8int32 a, int idx, v4int32 b) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v8int32(insert(set_v16int32(0, a), idx, b), 0); + return insert_128_256(a, idx, b); } -// Insert 128-bit in 512-bit register +//! @name Insert 128-bit in 512-bit register INTRINSIC(v16bfloat16) insert(v16bfloat16 a, int idx, v8bfloat16 b) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v16bfloat16(insert(set_v32bfloat16(0, a), idx, b), 0); -} - -#if 0 -// Insert 128-bit in 512-bit register -INTRINSIC(v4cint32) insert(v4cint32 a, int idx, v2cint32 b) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v4cint32(insert(set_v8cint32(0, a), idx, b), 0); + return insert_128_256(a, idx, b); } -#endif -// Insert 128-bit in 512-bit register -INTRINSIC(v8float) insert(v8float a, int idx, v4float b) DIAGNOSE_EXTRACT_IDX(1) { - return extract_v8float(insert(set_v16float(0, a), idx, b), 0); +//! @name Insert 128-bit in 512-bit register +INTRINSIC(v8float) +insert(v8float a, int idx, v4float b) DIAGNOSE_EXTRACT_IDX(1) { + return insert_128_256(a, idx, b); } INTRINSIC(v128uint4) concat(v32uint4 v0, v32uint4 v1, v32uint4 v2, v32uint4 v3) { - v128uint4 r = set_v128uint4(1, v1); - r = insert(r, 2, v2); - r = insert(r, 3, v3); - r = insert(r, 0, v0); - return r; + return concat_128_512(v0, v1, v2, v3); } + INTRINSIC(v128int4) concat(v32int4 v0, v32int4 v1, v32int4 v2, v32int4 v3) { - v128int4 r = set_v128int4(1, v1); - r = insert(r, 2, v2); - r = insert(r, 3, v3); - r = insert(r, 0, v0); - return r; + return concat_128_512(v0, v1, v2, v3); } + INTRINSIC(v64uint8) concat(v16uint8 v0, v16uint8 v1, v16uint8 v2, v16uint8 v3) { - v64uint8 r = set_v64uint8(1, v1); - r = insert(r, 2, v2); - r = insert(r, 3, v3); - r = insert(r, 0, v0); - return r; + return concat_128_512(v0, v1, v2, v3); } + INTRINSIC(v64int8) concat(v16int8 v0, v16int8 v1, v16int8 v2, v16int8 v3) { - v64int8 r = set_v64int8(1, v1); - r = insert(r, 2, v2); - r = insert(r, 3, v3); - r = insert(r, 0, v0); - return r; + return concat_128_512(v0, v1, v2, v3); } + INTRINSIC(v32uint16) concat(v8uint16 v0, v8uint16 v1, v8uint16 v2, v8uint16 v3) { - v32uint16 r = set_v32uint16(1, v1); - r = insert(r, 2, v2); - r = insert(r, 3, v3); - r = insert(r, 0, v0); - return r; -} -INTRINSIC(v32int16) concat(v8int16 v0, v8int16 v1, v8int16 v2, v8int16 v3) { - v32int16 r = set_v32int16(1, v1); - r = insert(r, 2, v2); - r = insert(r, 3, v3); - r = insert(r, 0, v0); - return r; + return concat_128_512(v0, v1, v2, v3); } -#if 0 -INTRINSIC(v16cint16) -concat(v4cint16 v0, v4cint16 v1, v4cint16 v2, v4cint16 v3) { - v16cint16 r = set_v16cint16(1, v1); - r = insert(r, 2, v2); - r = insert(r, 3, v3); - r = insert(r, 0, v0); - return r; +INTRINSIC(v32int16) concat(v8int16 v0, v8int16 v1, v8int16 v2, v8int16 v3) { + return concat_128_512(v0, v1, v2, v3); } -#endif INTRINSIC(v16uint32) concat(v4uint32 v0, v4uint32 v1, v4uint32 v2, v4uint32 v3) { - v16uint32 r = set_v16uint32(1, v1); - r = insert(r, 2, v2); - r = insert(r, 3, v3); - r = insert(r, 0, v0); - return r; + return concat_128_512(v0, v1, v2, v3); } + INTRINSIC(v16int32) concat(v4int32 v0, v4int32 v1, v4int32 v2, v4int32 v3) { - v16int32 r = set_v16int32(1, v1); - r = insert(r, 2, v2); - r = insert(r, 3, v3); - r = insert(r, 
0, v0); - return r; + return concat_128_512(v0, v1, v2, v3); } + INTRINSIC(v32bfloat16) concat(v8bfloat16 v0, v8bfloat16 v1, v8bfloat16 v2, v8bfloat16 v3) { - v32bfloat16 r = set_v32bfloat16(1, v1); - r = insert(r, 2, v2); - r = insert(r, 3, v3); - r = insert(r, 0, v0); - return r; + return concat_128_512(v0, v1, v2, v3); } -#if 0 -INTRINSIC(v8cint32) concat(v2cint32 v0, v2cint32 v1, v2cint32 v2, v2cint32 v3) { - v8cint32 r = set_v8cint32(1, v1); - r = insert(r, 2, v2); - r = insert(r, 3, v3); - r = insert(r, 0, v0); - return r; -} -#endif - INTRINSIC(v16float) concat(v4float v0, v4float v1, v4float v2, v4float v3) { - v16float r = set_v16float(1, v1); - r = insert(r, 2, v2); - r = insert(r, 3, v3); - r = insert(r, 0, v0); - return r; + return concat_128_512(v0, v1, v2, v3); } INTRINSIC(v64uint4) concat(v32uint4 v0, v32uint4 v1) { - v64uint4 r = set_v64uint4(1, v1); - r = insert(r, 0, v0); - return r; + return concat_128_256(v0, v1); } + INTRINSIC(v64int4) concat(v32int4 v0, v32int4 v1) { - v64int4 r = set_v64int4(1, v1); - r = insert(r, 0, v0); - return r; + return concat_128_256(v0, v1); } + INTRINSIC(v32uint8) concat(v16uint8 v0, v16uint8 v1) { - v32uint8 r = set_v32uint8(1, v1); - r = insert(r, 0, v0); - return r; + return concat_128_256(v0, v1); } + INTRINSIC(v32int8) concat(v16int8 v0, v16int8 v1) { - v32int8 r = set_v32int8(1, v1); - r = insert(r, 0, v0); - return r; + return concat_128_256(v0, v1); } + INTRINSIC(v16uint16) concat(v8uint16 v0, v8uint16 v1) { - v16uint16 r = set_v16uint16(1, v1); - r = insert(r, 0, v0); - return r; + return concat_128_256(v0, v1); } + INTRINSIC(v16int16) concat(v8int16 v0, v8int16 v1) { - v16int16 r = set_v16int16(1, v1); - r = insert(r, 0, v0); - return r; + return concat_128_256(v0, v1); } -#if 0 -INTRINSIC(v8cint16) concat(v4cint16 v0, v4cint16 v1) { - v8cint16 r = set_v8cint16(1, v1); - r = insert(r, 0, v0); - return r; -} -#endif - INTRINSIC(v8uint32) concat(v4uint32 v0, v4uint32 v1) { - v8uint32 r = set_v8uint32(1, v1); - r = insert(r, 0, v0); - return r; + return concat_128_256(v0, v1); } + INTRINSIC(v8int32) concat(v4int32 v0, v4int32 v1) { - v8int32 r = set_v8int32(1, v1); - r = insert(r, 0, v0); - return r; + return concat_128_256(v0, v1); } + INTRINSIC(v16bfloat16) concat(v8bfloat16 v0, v8bfloat16 v1) { - v16bfloat16 r = set_v16bfloat16(1, v1); - r = insert(r, 0, v0); - return r; + return concat_128_256(v0, v1); } INTRINSIC(v8float) concat(v4float v0, v4float v1) { - v8float r = set_v8float(1, v1); - r = insert(r, 0, v0); - return r; -} - -#if 0 -INTRINSIC(v4cint32) concat(v2cint32 v0, v2cint32 v1) { - v4cint32 r = set_v4cint32(1, v1); - r = insert(r, 0, v0); - return r; -} - -// TODO/FIXME : support sparse data type -// ext_sparse -INTRINSIC(v128uint4) extract_v128uint4(v256uint4_sparse v) { - return __builtin_aiev2_ext_qx(v); -} -INTRINSIC(v64uint8) extract_v64uint8(v128uint8_sparse v) { - return __builtin_aiev2_ext_qx(v); -} -INTRINSIC(v32uint16) extract_v32uint16(v64uint16_sparse v) { - return __builtin_aiev2_ext_qx(v); -} -INTRINSIC(v128int4) extract_v128int4(v256int4_sparse v) { - return __builtin_aiev2_ext_qx(v); -} -INTRINSIC(v64int8) extract_v64int8(v128int8_sparse v) { - return __builtin_aiev2_ext_qx(v); -} -INTRINSIC(v32int16) extract_v32int16(v64int16_sparse v) { - return __builtin_aiev2_ext_qx(v); -} -INTRINSIC(v32bfloat16) extract_v32bfloat16(v64bfloat16_sparse v) { - return __builtin_aiev2_ext_qx(v); -} - -INTRINSIC(v128uint4) extract_sparse_data(v256uint4_sparse v) { - return __builtin_aiev2_ext_qx(v); -} 
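// ---------------------------------------------------------------------------
// Illustrative usage sketch (editorial; the helper name lane_roundtrip_sketch
// is for illustration only and is not part of the generated header). It
// assumes the AIE2 vector types and the INTRINSIC machinery declared earlier
// in this header. The concat/insert/extract overloads above now route through
// the generic shuffle-based helpers (concat_128_*, insert_128_*,
// extract_128_*) rather than target builtins, but their call-side behaviour is
// unchanged. A typical 128-bit-lane round trip looks like this:
static inline v4int32 lane_roundtrip_sketch(v4int32 q0, v4int32 q1, v4int32 q2,
                                            v4int32 q3) {
  v16int32 w = concat(q0, q1, q2, q3); // 4 x 128-bit -> 512-bit (concat_128_512)
  w = insert(w, 1, q3);                // overwrite 128-bit lane 1
  return extract_v4int32(w, 1);        // read it back; idx range is [0,3]
}
// ---------------------------------------------------------------------------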
-INTRINSIC(v64uint8) extract_sparse_data(v128uint8_sparse v) { - return __builtin_aiev2_ext_qx(v); -} -INTRINSIC(v32uint16) extract_sparse_data(v64uint16_sparse v) { - return __builtin_aiev2_ext_qx(v); -} -INTRINSIC(v128int4) extract_sparse_data(v256int4_sparse v) { - return __builtin_aiev2_ext_qx(v); -} -INTRINSIC(v64int8) extract_sparse_data(v128int8_sparse v) { - return __builtin_aiev2_ext_qx(v); -} -INTRINSIC(v32int16) extract_sparse_data(v64int16_sparse v) { - return __builtin_aiev2_ext_qx(v); -} -INTRINSIC(v32bfloat16) extract_sparse_data(v64bfloat16_sparse v) { - return __builtin_aiev2_ext_qx(v); -} - -INTRINSIC(sparsity_t) extract_sparsity(v256uint4_sparse v) { - return __builtin_aiev2_ext_qxm(v); -} -INTRINSIC(sparsity_t) extract_sparsity(v128uint8_sparse v) { - return __builtin_aiev2_ext_qxm(v); -} -INTRINSIC(sparsity_t) extract_sparsity(v64uint16_sparse v) { - return __builtin_aiev2_ext_qxm(v); -} -INTRINSIC(sparsity_t) extract_sparsity(v256int4_sparse v) { - return __builtin_aiev2_ext_qxm(v); -} -INTRINSIC(sparsity_t) extract_sparsity(v128int8_sparse v) { - return __builtin_aiev2_ext_qxm(v); -} -INTRINSIC(sparsity_t) extract_sparsity(v64int16_sparse v) { - return __builtin_aiev2_ext_qxm(v); -} -INTRINSIC(sparsity_t) extract_sparsity(v64bfloat16_sparse v) { - return __builtin_aiev2_ext_qxm(v); -} - -// ext_update_sparse - -INTRINSIC(v256uint4_sparse) update(v256uint4_sparse m, v128uint4 v) { - return __builtin_aiev2_upd_qx(m, v); -} -INTRINSIC(v128uint8_sparse) update(v128uint8_sparse m, v64uint8 v) { - return __builtin_aiev2_upd_qx(m, v); -} -INTRINSIC(v64uint16_sparse) update(v64uint16_sparse m, v32uint16 v) { - return __builtin_aiev2_upd_qx(m, v); -} -INTRINSIC(v256int4_sparse) update(v256int4_sparse m, v128int4 v) { - return __builtin_aiev2_upd_qx(m, v); -} -INTRINSIC(v128int8_sparse) update(v128int8_sparse m, v64int8 v) { - return __builtin_aiev2_upd_qx(m, v); -} -INTRINSIC(v64int16_sparse) update(v64int16_sparse m, v32int16 v) { - return __builtin_aiev2_upd_qx(m, v); -} -INTRINSIC(v64bfloat16_sparse) update(v64bfloat16_sparse m, v32bfloat16 v) { - return __builtin_aiev2_upd_qx(m, v); -} - -INTRINSIC(v256uint4_sparse) update(v256uint4_sparse m, sparsity_t v) { - return __builtin_aiev2_upd_qxm(m, v); -} -INTRINSIC(v128uint8_sparse) update(v128uint8_sparse m, sparsity_t v) { - return __builtin_aiev2_upd_qxm(m, v); -} -INTRINSIC(v64uint16_sparse) update(v64uint16_sparse m, sparsity_t v) { - return __builtin_aiev2_upd_qxm(m, v); -} -INTRINSIC(v256int4_sparse) update(v256int4_sparse m, sparsity_t v) { - return __builtin_aiev2_upd_qxm(m, v); -} -INTRINSIC(v128int8_sparse) update(v128int8_sparse m, sparsity_t v) { - return __builtin_aiev2_upd_qxm(m, v); -} -INTRINSIC(v64int16_sparse) update(v64int16_sparse m, sparsity_t v) { - return __builtin_aiev2_upd_qxm(m, v); -} -INTRINSIC(v64bfloat16_sparse) update(v64bfloat16_sparse m, sparsity_t v) { - return __builtin_aiev2_upd_qxm(m, v); -} -#endif - -// ext_elem -INTRINSIC(int) extract_elem(v2int4 v, int idx) { return ext_v2w4(v, idx); } -INTRINSIC(int) extract_elem(v4int4 v, int idx) { - return ext_v4w4((short)v, idx); -} -INTRINSIC(int) extract_elem(v8int4 v, int idx) { return ext_v8w4((int)v, idx); } -INTRINSIC(int) extract_elem(v16int4 v, int idx) { return ext_v16w4(v, idx); } -INTRINSIC(int) extract_elem(v2int8 v, int idx) { - return ext_v2w8((short)v, idx); -} -INTRINSIC(int) extract_elem(v4int8 v, int idx) { return ext_v4w8((int)v, idx); } -INTRINSIC(int) extract_elem(v8int8 v, int idx) { return ext_v8w8(v, idx); } 
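For reference, the tests added further below exercise the public insert / extract_* / concat entry points that remain after this cleanup; here is a small, hypothetical usage sketch (the function name and the half-swapping logic are illustrative only, and it uses only overloads that appear in this header).

// Illustrative sketch only -- not part of the patch. Swaps the two 256-bit
// halves of a 512-bit value using the public insert/extract/concat API.
static v16int32 swap_halves(v8int32 lo, v8int32 hi) {
  v16int32 acc = concat(lo, hi);             // 512-bit value laid out as [lo | hi]
  v8int32 first = extract_v8int32(acc, 0);   // 256-bit slice at index 0 -> lo
  v8int32 second = extract_v8int32(acc, 1);  // 256-bit slice at index 1 -> hi
  return insert(insert(acc, 0, second), 1, first); // result is [hi | lo]
}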
-INTRINSIC(int) extract_elem(v2int16 v, int idx) { - return ext_v2w16((int)v, idx); -} -INTRINSIC(int) extract_elem(v2int32 v, int idx) { return ext_v2w32(v, idx); } -INTRINSIC(int) extract_elem(v4int16 v, int idx) { return ext_v4w16(v, idx); } -INTRINSIC(unsigned int) extract_elem(v2uint4 v, int idx) { - return ext_v2u4(v, idx); -} -INTRINSIC(unsigned int) extract_elem(v4uint4 v, int idx) { - return ext_v4u4((unsigned short)v, idx); -} -INTRINSIC(unsigned int) extract_elem(v8uint4 v, int idx) { - return ext_v8u4((unsigned int)v, idx); -} -INTRINSIC(unsigned int) extract_elem(v16uint4 v, int idx) { - return ext_v16u4(v, idx); -} -INTRINSIC(unsigned int) extract_elem(v2uint8 v, int idx) { - return ext_v2u8((unsigned short)v, idx); -} -INTRINSIC(unsigned int) extract_elem(v4uint8 v, int idx) { - return ext_v4u8((unsigned int)v, idx); -} -INTRINSIC(unsigned int) extract_elem(v8uint8 v, int idx) { - return ext_v8u8(v, idx); -} -INTRINSIC(unsigned int) extract_elem(v2uint16 v, int idx) { - return ext_v2u16((unsigned int)v, idx); -} -INTRINSIC(unsigned int) extract_elem(v4uint16 v, int idx) { - return ext_v4u16(v, idx); -} -INTRINSIC(unsigned int) extract_elem(v2uint32 v, int idx) { - return ext_v2u32(v, idx); -} - -// insert_elem -INTRINSIC(v2int4) insert(v2int4 v, int idx, int val) { - return upd_v2w4(v, idx, val); -} -INTRINSIC(v4int4) insert(v4int4 v, int idx, int val) { - return (v4int4)(short)upd_v4w4((short)v, idx, val); -} -INTRINSIC(v8int4) insert(v8int4 v, int idx, int val) { - return (v8int4)upd_v8w4((int)v, idx, val); -} -INTRINSIC(v16int4) insert(v16int4 v, int idx, int val) { - return upd_v16w4(v, idx, val); -} -INTRINSIC(v2int8) insert(v2int8 v, int idx, int val) { - return (v2int8)(short)upd_v2w8((short)v, idx, val); -} -INTRINSIC(v4int8) insert(v4int8 v, int idx, int val) { - return (v4int8)upd_v4w8((int)v, idx, val); -} -INTRINSIC(v8int8) insert(v8int8 v, int idx, int val) { - return upd_v8w8(v, idx, val); -} -INTRINSIC(v2int16) insert(v2int16 v, int idx, int val) { - return (v2int16)upd_v2w16((int)v, idx, val); -} -INTRINSIC(v4int16) insert(v4int16 v, int idx, int val) { - return upd_v4w16(v, idx, val); -} -INTRINSIC(v2int32) insert(v2int32 v, int idx, int val) { - return upd_v2w32(v, idx, val); -} -INTRINSIC(v2uint4) insert(v2uint4 v, int idx, unsigned int val) { - return upd_v2w4(v, idx, val); -} -INTRINSIC(v4uint4) insert(v4uint4 v, int idx, unsigned int val) { - return (v4uint4)(short)upd_v4w4((unsigned short)v, idx, val); -} -INTRINSIC(v8uint4) insert(v8uint4 v, int idx, unsigned int val) { - return (v8uint4)upd_v8w4((unsigned int)v, idx, val); -} -INTRINSIC(v16uint4) insert(v16uint4 v, int idx, unsigned int val) { - return upd_v16w4(v, idx, val); -} -INTRINSIC(v2uint8) insert(v2uint8 v, int idx, unsigned int val) { - return (v2uint8)(unsigned short)upd_v2w8((unsigned short)v, idx, val); -} -INTRINSIC(v4uint8) insert(v4uint8 v, int idx, unsigned int val) { - return (v4uint8)upd_v4w8((unsigned int)v, idx, val); -} -INTRINSIC(v8uint8) insert(v8uint8 v, int idx, unsigned int val) { - return upd_v8w8(v, idx, val); -} -INTRINSIC(v2uint16) insert(v2uint16 v, int idx, unsigned int val) { - return (v2uint16)upd_v2w16((unsigned int)v, idx, val); -} -INTRINSIC(v4uint16) insert(v4uint16 v, int idx, unsigned int val) { - return upd_v4w16(v, idx, val); -} -INTRINSIC(v2uint32) insert(v2uint32 v, int idx, unsigned int val) { - return upd_v2w32(v, idx, val); -} - -// set_elem -INTRINSIC(v2int4) set_v2int4(int idx, int val) { return set_v2w4(idx, val); } -INTRINSIC(v4int4) set_v4int4(int 
idx, int val) { - return (v4int4)(short)set_v4w4(idx, val); -} -INTRINSIC(v8int4) set_v8int4(int idx, int val) { - return (v8int4)set_v8w4(idx, val); -} -INTRINSIC(v16int4) set_v16int4(int idx, int val) { return set_v16w4(idx, val); } - -INTRINSIC(v2int8) set_v2int8(int idx, int val) { - return (v2int8)(short)set_v2w8(idx, val); -} -INTRINSIC(v4int8) set_v4int8(int idx, int val) { - return (v4int8)set_v4w8(idx, val); -} -INTRINSIC(v8int8) set_v8int8(int idx, int val) { return set_v8w8(idx, val); } - -INTRINSIC(v2int16) set_v2int16(int idx, int val) { - return (v2int16)set_v2w16(idx, val); -} -INTRINSIC(v4int16) set_v4int16(int idx, int val) { return set_v4w16(idx, val); } - -INTRINSIC(v2int32) set_v2int32(int idx, int val) { return set_v2w32(idx, val); } - -INTRINSIC(v2uint4) set_v2uint4(int idx, unsigned int val) { - return set_v2w4(idx, val); -} -INTRINSIC(v4uint4) set_v4uint4(int idx, unsigned int val) { - return (v4uint4)(short)set_v4w4(idx, val); -} -INTRINSIC(v8uint4) set_v8uint4(int idx, unsigned int val) { - return (v8uint4)set_v8w4(idx, val); -} -INTRINSIC(v16uint4) set_v16uint4(int idx, unsigned int val) { - return set_v16w4(idx, val); -} - -INTRINSIC(v2uint8) set_v2uint8(int idx, unsigned int val) { - return (v2uint8)(short)set_v2w8(idx, val); -} -INTRINSIC(v4uint8) set_v4uint8(int idx, unsigned int val) { - return (v4uint8)set_v4w8(idx, val); -} -INTRINSIC(v8uint8) set_v8uint8(int idx, unsigned int val) { - return set_v8w8(idx, val); -} - -INTRINSIC(v2uint16) set_v2uint16(int idx, unsigned int val) { - return (v2uint16)set_v2w16(idx, val); -} -INTRINSIC(v4uint16) set_v4uint16(int idx, unsigned int val) { - return set_v4w16(idx, val); -} - -INTRINSIC(v2uint32) set_v2uint32(int idx, unsigned int val) { - return set_v2w32(idx, val); + return concat_128_256(v0, v1); } -#endif // __AIEV2_UPD_EXT_H__ +#endif // __AIEV2_UPD_EXT_H__ \ No newline at end of file diff --git a/cross-project-tests/intrinsic-header-tests/aiev2-upd-ext-intrinsic.cpp b/cross-project-tests/intrinsic-header-tests/aiev2-upd-ext-intrinsic.cpp new file mode 100644 index 000000000000..07eafe43fb3f --- /dev/null +++ b/cross-project-tests/intrinsic-header-tests/aiev2-upd-ext-intrinsic.cpp @@ -0,0 +1,3630 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +//===- aie2-upd-ext-intrinsic.cpp -------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// + +// RUN: %clang -O2 %s --target=aie2 -S -emit-llvm -o - | FileCheck %s +// Conversions + +// v128uint4 + +//! 
@name Extract 256-bit portion from 512-bit register +// CHECK-LABEL: define dso_local noundef <32 x i8> @_Z21test_extract_v64uint4Dv64_DU8_i( +// CHECK-SAME: <64 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i8> [[A]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V64UINT4DV64_DU8_I_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V64UINT4DV64_DU8_I_EXIT]] +// CHECK: _ZL16extract_v64uint4Dv64_DU8_i.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[RETVAL_0_I_I]] to <32 x i8> +// CHECK-NEXT: ret <32 x i8> [[TMP2]] +// +v64uint4 test_extract_v64uint4(v128uint4 a, int idx) { + return extract_v64uint4(a, idx); +} + +//! @name Insert 256-bit in 512-bit register +// CHECK-LABEL: define dso_local noundef <64 x i8> @_Z11test_insertDv64_DU8_iDv32_S_( +// CHECK-SAME: <64 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <32 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i8> [[A]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i8> [[B]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[TMP0]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_DU8_IDV32_S__EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[TMP0]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_DU8_IDV32_S__EXIT]] +// CHECK: _ZL6insertDv64_DU8_iDv32_S_.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <64 x i8> +// CHECK-NEXT: ret <64 x i8> [[TMP3]] +// +v128uint4 test_insert(v128uint4 a, int idx, v64uint4 b) { + return insert(a, idx, b); +} + +//! 
@name Set 256-bit portion of 512-bit register +// CHECK-LABEL: define dso_local noundef <64 x i8> @_Z18test_set_v128uint4iDv32_DU8_( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <32 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i8> [[B]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V128UINT4IDV32_DU8__EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V128UINT4IDV32_DU8__EXIT]] +// CHECK: _ZL13set_v128uint4iDv32_DU8_.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <64 x i8> +// CHECK-NEXT: ret <64 x i8> [[TMP2]] +// +v128uint4 test_set_v128uint4(int idx, v64uint4 b) { + return set_v128uint4(idx, b); +} + +// CHECK-LABEL: define dso_local noundef <64 x i8> @_Z11test_concatDv32_DU8_S0_( +// CHECK-SAME: <32 x i8> noundef [[A0:%.*]], <32 x i8> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i8> [[A0]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i8> [[A1]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[TMP1]], <16 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[SHUFFLE_I_I]] to <64 x i8> +// CHECK-NEXT: ret <64 x i8> [[TMP2]] +// +v128uint4 test_concat(v64uint4 a0, v64uint4 a1) { return concat(a0, a1); } + +// v128int4 + +//! @name Extract 256-bit portion from 512-bit register +// CHECK-LABEL: define dso_local noundef <32 x i8> @_Z20test_extract_v64int4Dv64_DB8_i( +// CHECK-SAME: <64 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i8> [[A]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V64INT4DV64_DB8_I_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V64INT4DV64_DB8_I_EXIT]] +// CHECK: _ZL15extract_v64int4Dv64_DB8_i.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[RETVAL_0_I_I]] to <32 x i8> +// CHECK-NEXT: ret <32 x i8> [[TMP2]] +// +v64int4 test_extract_v64int4(v128int4 a, int idx) { + return extract_v64int4(a, idx); +} + +//! 
@name Insert 256-bit in 512-bit register +// CHECK-LABEL: define dso_local noundef <64 x i8> @_Z11test_insertDv64_DB8_iDv32_S_( +// CHECK-SAME: <64 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <32 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i8> [[A]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i8> [[B]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[TMP0]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_DB8_IDV32_S__EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[TMP0]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_DB8_IDV32_S__EXIT]] +// CHECK: _ZL6insertDv64_DB8_iDv32_S_.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <64 x i8> +// CHECK-NEXT: ret <64 x i8> [[TMP3]] +// +v128int4 test_insert(v128int4 a, int idx, v64int4 b) { + return insert(a, idx, b); +} + +//! @name Set 256-bit portion of 512-bit register +// CHECK-LABEL: define dso_local noundef <64 x i8> @_Z17test_set_v128int4iDv32_DB8_( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <32 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i8> [[B]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V128INT4IDV32_DB8__EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V128INT4IDV32_DB8__EXIT]] +// CHECK: _ZL12set_v128int4iDv32_DB8_.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <64 x i8> +// CHECK-NEXT: ret <64 x i8> [[TMP2]] +// +v128int4 test_set_v128int4(int idx, v64int4 b) { return set_v128int4(idx, b); } + +// CHECK-LABEL: define dso_local noundef <64 x i8> @_Z11test_concatDv32_DB8_S0_( +// CHECK-SAME: <32 x i8> noundef [[A0:%.*]], <32 x i8> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i8> [[A0]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i8> [[A1]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[TMP1]], <16 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[SHUFFLE_I_I]] to <64 x i8> +// CHECK-NEXT: ret <64 x i8> [[TMP2]] +// +v128int4 test_concat(v64int4 a0, v64int4 a1) { return concat(a0, a1); } + +// v64uint8 + +//! 
@name Extract 256-bit portion from 512-bit register +// CHECK-LABEL: define dso_local noundef <32 x i8> @_Z21test_extract_v32uint8Dv64_hi( +// CHECK-SAME: <64 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i8> [[A]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V32UINT8DV64_HI_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V32UINT8DV64_HI_EXIT]] +// CHECK: _ZL16extract_v32uint8Dv64_hi.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[RETVAL_0_I_I]] to <32 x i8> +// CHECK-NEXT: ret <32 x i8> [[TMP2]] +// +v32uint8 test_extract_v32uint8(v64uint8 a, int idx) { + return extract_v32uint8(a, idx); +} + +//! @name Insert 256-bit in 512-bit register +// CHECK-LABEL: define dso_local noundef <64 x i8> @_Z11test_insertDv64_hiDv32_h( +// CHECK-SAME: <64 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <32 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i8> [[A]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i8> [[B]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[TMP0]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_HIDV32_H_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[TMP0]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_HIDV32_H_EXIT]] +// CHECK: _ZL6insertDv64_hiDv32_h.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <64 x i8> +// CHECK-NEXT: ret <64 x i8> [[TMP3]] +// +v64uint8 test_insert(v64uint8 a, int idx, v32uint8 b) { + return insert(a, idx, b); +} + +//! 
@name Set 256-bit portion of 512-bit register +// CHECK-LABEL: define dso_local noundef <64 x i8> @_Z17test_set_v64uint8iDv32_h( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <32 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i8> [[B]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V64UINT8IDV32_H_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V64UINT8IDV32_H_EXIT]] +// CHECK: _ZL12set_v64uint8iDv32_h.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <64 x i8> +// CHECK-NEXT: ret <64 x i8> [[TMP2]] +// +v64uint8 test_set_v64uint8(int idx, v32uint8 b) { return set_v64uint8(idx, b); } + +// CHECK-LABEL: define dso_local noundef <64 x i8> @_Z11test_concatDv32_hS_( +// CHECK-SAME: <32 x i8> noundef [[A0:%.*]], <32 x i8> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i8> [[A0]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i8> [[A1]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[TMP1]], <16 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[SHUFFLE_I_I]] to <64 x i8> +// CHECK-NEXT: ret <64 x i8> [[TMP2]] +// +v64uint8 test_concat(v32uint8 a0, v32uint8 a1) { return concat(a0, a1); } + +// v64int8 + +//! @name Extract 256-bit portion from 512-bit register +// CHECK-LABEL: define dso_local noundef <32 x i8> @_Z20test_extract_v32int8Dv64_ai( +// CHECK-SAME: <64 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i8> [[A]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V32INT8DV64_AI_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V32INT8DV64_AI_EXIT]] +// CHECK: _ZL15extract_v32int8Dv64_ai.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[RETVAL_0_I_I]] to <32 x i8> +// CHECK-NEXT: ret <32 x i8> [[TMP2]] +// +v32int8 test_extract_v32int8(v64int8 a, int idx) { + return extract_v32int8(a, idx); +} + +//! 
@name Insert 256-bit in 512-bit register +// CHECK-LABEL: define dso_local noundef <64 x i8> @_Z11test_insertDv64_aiDv32_a( +// CHECK-SAME: <64 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <32 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i8> [[A]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i8> [[B]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[TMP0]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_AIDV32_A_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[TMP0]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_AIDV32_A_EXIT]] +// CHECK: _ZL6insertDv64_aiDv32_a.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <64 x i8> +// CHECK-NEXT: ret <64 x i8> [[TMP3]] +// +v64int8 test_insert(v64int8 a, int idx, v32int8 b) { return insert(a, idx, b); } + +//! @name Set 256-bit portion of 512-bit register +// CHECK-LABEL: define dso_local noundef <64 x i8> @_Z16test_set_v64int8iDv32_a( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <32 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i8> [[B]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL11SET_V64INT8IDV32_A_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL11SET_V64INT8IDV32_A_EXIT]] +// CHECK: _ZL11set_v64int8iDv32_a.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <64 x i8> +// CHECK-NEXT: ret <64 x i8> [[TMP2]] +// +v64int8 test_set_v64int8(int idx, v32int8 b) { return set_v64int8(idx, b); } + +// CHECK-LABEL: define dso_local noundef <64 x i8> @_Z11test_concatDv32_aS_( +// CHECK-SAME: <32 x i8> noundef [[A0:%.*]], <32 x i8> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i8> [[A0]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i8> [[A1]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[TMP1]], <16 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[SHUFFLE_I_I]] to <64 x i8> +// CHECK-NEXT: ret <64 x i8> [[TMP2]] +// +v64int8 test_concat(v32int8 a0, v32int8 a1) { return concat(a0, a1); } + +// v32uint16 + +//! 
@name Extract 256-bit portion from 512-bit register +// CHECK-LABEL: define dso_local noundef <16 x i16> @_Z22test_extract_v16uint16Dv32_ti( +// CHECK-SAME: <32 x i16> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i16> [[A]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL17EXTRACT_V16UINT16DV32_TI_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL17EXTRACT_V16UINT16DV32_TI_EXIT]] +// CHECK: _ZL17extract_v16uint16Dv32_ti.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[RETVAL_0_I_I]] to <16 x i16> +// CHECK-NEXT: ret <16 x i16> [[TMP2]] +// +v16uint16 test_extract_v16uint16(v32uint16 a, int idx) { + return extract_v16uint16(a, idx); +} + +//! @name Insert 256-bit in 512-bit register +// CHECK-LABEL: define dso_local noundef <32 x i16> @_Z11test_insertDv32_tiDv16_t( +// CHECK-SAME: <32 x i16> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <16 x i16> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i16> [[A]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i16> [[B]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[TMP0]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_TIDV16_T_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[TMP0]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_TIDV16_T_EXIT]] +// CHECK: _ZL6insertDv32_tiDv16_t.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <32 x i16> +// CHECK-NEXT: ret <32 x i16> [[TMP3]] +// +v32uint16 test_insert(v32uint16 a, int idx, v16uint16 b) { + return insert(a, idx, b); +} + +//! 
@name Set 256-bit portion of 512-bit register +// CHECK-LABEL: define dso_local noundef <32 x i16> @_Z18test_set_v32uint16iDv16_t( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <16 x i16> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i16> [[B]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V32UINT16IDV16_T_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V32UINT16IDV16_T_EXIT]] +// CHECK: _ZL13set_v32uint16iDv16_t.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <32 x i16> +// CHECK-NEXT: ret <32 x i16> [[TMP2]] +// +v32uint16 test_set_v32uint16(int idx, v16uint16 b) { + return set_v32uint16(idx, b); +} + +// CHECK-LABEL: define dso_local noundef <32 x i16> @_Z11test_concatDv16_tS_( +// CHECK-SAME: <16 x i16> noundef [[A0:%.*]], <16 x i16> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i16> [[A0]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i16> [[A1]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[TMP1]], <16 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[SHUFFLE_I_I]] to <32 x i16> +// CHECK-NEXT: ret <32 x i16> [[TMP2]] +// +v32uint16 test_concat(v16uint16 a0, v16uint16 a1) { return concat(a0, a1); } + +// v32int16 + +//! @name Extract 256-bit portion from 512-bit register +// CHECK-LABEL: define dso_local noundef <16 x i16> @_Z21test_extract_v16int16Dv32_si( +// CHECK-SAME: <32 x i16> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i16> [[A]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V16INT16DV32_SI_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V16INT16DV32_SI_EXIT]] +// CHECK: _ZL16extract_v16int16Dv32_si.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[RETVAL_0_I_I]] to <16 x i16> +// CHECK-NEXT: ret <16 x i16> [[TMP2]] +// +v16int16 test_extract_v16int16(v32int16 a, int idx) { + return extract_v16int16(a, idx); +} + +//! 
@name Insert 256-bit in 512-bit register +// CHECK-LABEL: define dso_local noundef <32 x i16> @_Z11test_insertDv32_siDv16_s( +// CHECK-SAME: <32 x i16> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <16 x i16> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i16> [[A]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i16> [[B]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[TMP0]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_SIDV16_S_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[TMP0]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_SIDV16_S_EXIT]] +// CHECK: _ZL6insertDv32_siDv16_s.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <32 x i16> +// CHECK-NEXT: ret <32 x i16> [[TMP3]] +// +v32int16 test_insert(v32int16 a, int idx, v16int16 b) { + return insert(a, idx, b); +} + +//! @name Set 256-bit portion of 512-bit register +// CHECK-LABEL: define dso_local noundef <32 x i16> @_Z17test_set_v32int16iDv16_s( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <16 x i16> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i16> [[B]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V32INT16IDV16_S_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V32INT16IDV16_S_EXIT]] +// CHECK: _ZL12set_v32int16iDv16_s.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <32 x i16> +// CHECK-NEXT: ret <32 x i16> [[TMP2]] +// +v32int16 test_set_v32int16(int idx, v16int16 b) { return set_v32int16(idx, b); } + +// CHECK-LABEL: define dso_local noundef <32 x i16> @_Z11test_concatDv16_sS_( +// CHECK-SAME: <16 x i16> noundef [[A0:%.*]], <16 x i16> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i16> [[A0]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i16> [[A1]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[TMP1]], <16 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[SHUFFLE_I_I]] to <32 x i16> +// CHECK-NEXT: ret <32 x i16> [[TMP2]] +// +v32int16 test_concat(v16int16 a0, v16int16 a1) { return concat(a0, a1); } + +// v16uint32 + +//! 
@name Extract 256-bit portion from 512-bit register +// CHECK-LABEL: define dso_local noundef <8 x i32> @_Z21test_extract_v8uint32Dv16_ji( +// CHECK-SAME: <16 x i32> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V8UINT32DV16_JI_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V8UINT32DV16_JI_EXIT]] +// CHECK: _ZL16extract_v8uint32Dv16_ji.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: ret <8 x i32> [[RETVAL_0_I_I]] +// +v8uint32 test_extract_v8uint32(v16uint32 a, int idx) { + return extract_v8uint32(a, idx); +} + +//! @name Insert 256-bit in 512-bit register +// CHECK-LABEL: define dso_local noundef <16 x i32> @_Z11test_insertDv16_jiDv8_j( +// CHECK-SAME: <16 x i32> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <8 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[A]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV16_JIDV8_J_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[A]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV16_JIDV8_J_EXIT]] +// CHECK: _ZL6insertDv16_jiDv8_j.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: ret <16 x i32> [[RETVAL_0_I_I]] +// +v16uint32 test_insert(v16uint32 a, int idx, v8uint32 b) { + return insert(a, idx, b); +} + +//! 
@name Set 256-bit portion of 512-bit register +// CHECK-LABEL: define dso_local noundef <16 x i32> @_Z18test_set_v16uint32iDv8_j( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <8 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V16UINT32IDV8_J_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V16UINT32IDV8_J_EXIT]] +// CHECK: _ZL13set_v16uint32iDv8_j.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: ret <16 x i32> [[RETVAL_0_I_I]] +// +v16uint32 test_set_v16uint32(int idx, v8uint32 b) { + return set_v16uint32(idx, b); +} + +// CHECK-LABEL: define dso_local noundef <16 x i32> @_Z11test_concatDv8_jS_( +// CHECK-SAME: <8 x i32> noundef [[A0:%.*]], <8 x i32> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[A0]], <8 x i32> [[A1]], <16 x i32> +// CHECK-NEXT: ret <16 x i32> [[SHUFFLE_I_I]] +// +v16uint32 test_concat(v8uint32 a0, v8uint32 a1) { return concat(a0, a1); } + +// v16int32 + +//! @name Extract 256-bit portion from 512-bit register +// CHECK-LABEL: define dso_local noundef <8 x i32> @_Z20test_extract_v8int32Dv16_ii( +// CHECK-SAME: <16 x i32> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V8INT32DV16_II_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V8INT32DV16_II_EXIT]] +// CHECK: _ZL15extract_v8int32Dv16_ii.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: ret <8 x i32> [[RETVAL_0_I_I]] +// +v8int32 test_extract_v8int32(v16int32 a, int idx) { + return extract_v8int32(a, idx); +} + +//! 
@name Insert 256-bit in 512-bit register +// CHECK-LABEL: define dso_local noundef <16 x i32> @_Z11test_insertDv16_iiDv8_i( +// CHECK-SAME: <16 x i32> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <8 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[A]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV16_IIDV8_I_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[A]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV16_IIDV8_I_EXIT]] +// CHECK: _ZL6insertDv16_iiDv8_i.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: ret <16 x i32> [[RETVAL_0_I_I]] +// +v16int32 test_insert(v16int32 a, int idx, v8int32 b) { + return insert(a, idx, b); +} + +//! @name Set 256-bit portion of 512-bit register +// CHECK-LABEL: define dso_local noundef <16 x i32> @_Z17test_set_v16int32iDv8_i( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <8 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V16INT32IDV8_I_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V16INT32IDV8_I_EXIT]] +// CHECK: _ZL12set_v16int32iDv8_i.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: ret <16 x i32> [[RETVAL_0_I_I]] +// +v16int32 test_set_v16int32(int idx, v8int32 b) { return set_v16int32(idx, b); } + +// CHECK-LABEL: define dso_local noundef <16 x i32> @_Z11test_concatDv8_iS_( +// CHECK-SAME: <8 x i32> noundef [[A0:%.*]], <8 x i32> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[A0]], <8 x i32> [[A1]], <16 x i32> +// CHECK-NEXT: ret <16 x i32> [[SHUFFLE_I_I]] +// +v16int32 test_concat(v8int32 a0, v8int32 a1) { return concat(a0, a1); } + +// v32bfloat16 + +//! 
@name Extract 256-bit portion from 512-bit register +// CHECK-LABEL: define dso_local noundef <16 x bfloat> @_Z24test_extract_v16bfloat16Dv32_u6__bf16i( +// CHECK-SAME: <32 x bfloat> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x bfloat> [[A]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL19EXTRACT_V16BFLOAT16DV32_U6__BF16I_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL19EXTRACT_V16BFLOAT16DV32_U6__BF16I_EXIT]] +// CHECK: _ZL19extract_v16bfloat16Dv32_u6__bf16i.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[RETVAL_0_I_I]] to <16 x bfloat> +// CHECK-NEXT: ret <16 x bfloat> [[TMP2]] +// +v16bfloat16 test_extract_v16bfloat16(v32bfloat16 a, int idx) { + return extract_v16bfloat16(a, idx); +} + +//! @name Insert 256-bit in 512-bit register +// CHECK-LABEL: define dso_local noundef <32 x bfloat> @_Z11test_insertDv32_u6__bf16iDv16_u6__bf16( +// CHECK-SAME: <32 x bfloat> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <16 x bfloat> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x bfloat> [[A]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x bfloat> [[B]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[TMP0]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_U6__BF16IDV16_U6__BF16_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[TMP0]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_U6__BF16IDV16_U6__BF16_EXIT]] +// CHECK: _ZL6insertDv32_u6__bf16iDv16_u6__bf16.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <32 x bfloat> +// CHECK-NEXT: ret <32 x bfloat> [[TMP3]] +// +v32bfloat16 test_insert(v32bfloat16 a, int idx, v16bfloat16 b) { + return insert(a, idx, b); +} + +//! 
@name Set 256-bit portion of 512-bit register +// CHECK-LABEL: define dso_local noundef <32 x bfloat> @_Z20test_set_v32bfloat16iDv16_u6__bf16( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <16 x bfloat> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x bfloat> [[B]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL15SET_V32BFLOAT16IDV16_U6__BF16_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL15SET_V32BFLOAT16IDV16_U6__BF16_EXIT]] +// CHECK: _ZL15set_v32bfloat16iDv16_u6__bf16.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <32 x bfloat> +// CHECK-NEXT: ret <32 x bfloat> [[TMP2]] +// +v32bfloat16 test_set_v32bfloat16(int idx, v16bfloat16 b) { + return set_v32bfloat16(idx, b); +} + +// CHECK-LABEL: define dso_local noundef <32 x bfloat> @_Z11test_concatDv16_u6__bf16S_( +// CHECK-SAME: <16 x bfloat> noundef [[A0:%.*]], <16 x bfloat> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x bfloat> [[A0]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x bfloat> [[A1]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[TMP1]], <16 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[SHUFFLE_I_I]] to <32 x bfloat> +// CHECK-NEXT: ret <32 x bfloat> [[TMP2]] +// +v32bfloat16 test_concat(v16bfloat16 a0, v16bfloat16 a1) { + return concat(a0, a1); +} + +// v16float + +//! @name Extract 256-bit portion from 512-bit register +// CHECK-LABEL: define dso_local noundef <8 x float> @_Z20test_extract_v8floatDv16_fi( +// CHECK-SAME: <16 x float> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x float> [[A]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V8FLOATDV16_FI_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V8FLOATDV16_FI_EXIT]] +// CHECK: _ZL15extract_v8floatDv16_fi.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[RETVAL_0_I_I]] to <8 x float> +// CHECK-NEXT: ret <8 x float> [[TMP2]] +// +v8float test_extract_v8float(v16float a, int idx) { + return extract_v8float(a, idx); +} + +//! 
@name Insert 256-bit in 512-bit register +// CHECK-LABEL: define dso_local noundef <16 x float> @_Z11test_insertDv16_fiDv8_f( +// CHECK-SAME: <16 x float> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <8 x float> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x float> [[A]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[B]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[TMP0]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV16_FIDV8_F_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[TMP0]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV16_FIDV8_F_EXIT]] +// CHECK: _ZL6insertDv16_fiDv8_f.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <16 x float> +// CHECK-NEXT: ret <16 x float> [[TMP3]] +// +v16float test_insert(v16float a, int idx, v8float b) { + return insert(a, idx, b); +} + +//! @name Set 256-bit portion of 512-bit register +// CHECK-LABEL: define dso_local noundef <16 x float> @_Z17test_set_v16floatiDv8_f( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <8 x float> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x float> [[B]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V16FLOATIDV8_F_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V16FLOATIDV8_F_EXIT]] +// CHECK: _ZL12set_v16floatiDv8_f.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <16 x float> +// CHECK-NEXT: ret <16 x float> [[TMP2]] +// +v16float test_set_v16float(int idx, v8float b) { return set_v16float(idx, b); } + +// CHECK-LABEL: define dso_local noundef <16 x float> @_Z11test_concatDv8_fS_( +// CHECK-SAME: <8 x float> noundef [[A0:%.*]], <8 x float> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SHUFFLE_I_UNCASTED_I:%.*]] = shufflevector <8 x float> [[A0]], <8 x float> [[A1]], <16 x i32> +// CHECK-NEXT: ret <16 x float> [[SHUFFLE_I_UNCASTED_I]] +// +v16float test_concat(v8float a0, v8float a1) { return concat(a0, a1); } + +// v256uint4 + +//! 
@name Extract 256-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <32 x i8> @_Z21test_extract_v64uint4Dv128_DU8_i( +// CHECK-SAME: <128 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <128 x i8> [[A]] to <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN3_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN8_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V64UINT4DV128_DU8_I_EXIT:%.*]] +// CHECK: if.then3.i.i: +// CHECK-NEXT: [[SHUFFLE4_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V64UINT4DV128_DU8_I_EXIT]] +// CHECK: if.then8.i.i: +// CHECK-NEXT: [[SHUFFLE9_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V64UINT4DV128_DU8_I_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE10_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V64UINT4DV128_DU8_I_EXIT]] +// CHECK: _ZL16extract_v64uint4Dv128_DU8_i.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE4_I_I]], [[IF_THEN3_I_I]] ], [ [[SHUFFLE9_I_I]], [[IF_THEN8_I_I]] ], [ [[SHUFFLE10_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[RETVAL_0_I_I]] to <32 x i8> +// CHECK-NEXT: ret <32 x i8> [[TMP1]] +// +v64uint4 test_extract_v64uint4(v256uint4 a, int idx) { + return extract_v64uint4(a, idx); +} + +//! 
@name Insert 256-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z11test_insertDv128_DU8_iDv32_S_( +// CHECK-SAME: <128 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <32 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <128 x i8> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i8> [[B]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN5_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_DU8_IDV32_S__EXIT:%.*]] +// CHECK: if.then5.i.i: +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_DU8_IDV32_S__EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_DU8_IDV32_S__EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_DU8_IDV32_S__EXIT]] +// CHECK: _ZL6insertDv128_DU8_iDv32_S_.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE2_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN5_I_I]] ], [ [[SHUFFLE11_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP2]] +// +v256uint4 test_insert(v256uint4 a, int idx, v64uint4 b) { + return insert(a, idx, b); +} + +//! 
@name Set 256-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z18test_set_v256uint4iDv32_DU8_( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <32 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i8> [[B]] to <8 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN4_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V256UINT4IDV32_DU8__EXIT:%.*]] +// CHECK: if.then4.i.i: +// CHECK-NEXT: [[SHUFFLE5_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE5_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V256UINT4IDV32_DU8__EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE11_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V256UINT4IDV32_DU8__EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE13_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE14_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE13_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V256UINT4IDV32_DU8__EXIT]] +// CHECK: _ZL13set_v256uint4iDv32_DU8_.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN4_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE14_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP1]] +// +v256uint4 test_set_v256uint4(int idx, v64uint4 b) { + return set_v256uint4(idx, b); +} + +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z11test_concatDv32_DU8_S0_S0_S0_( +// CHECK-SAME: <32 x i8> noundef [[A0:%.*]], <32 x i8> noundef [[A1:%.*]], <32 x i8> noundef [[A2:%.*]], <32 x i8> noundef [[A3:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i8> [[A0]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i8> [[A1]] to <8 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i8> [[A2]] to <8 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i8> [[A3]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[TMP1]], <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <16 x i32> +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[SHUFFLE1_I_I]], <32 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i32> [[SHUFFLE2_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP4]] +// +v256uint4 test_concat(v64uint4 a0, v64uint4 a1, v64uint4 a2, v64uint4 a3) { + return concat(a0, a1, a2, a3); +} + +// v256uint4 + +//! 
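A usage sketch for the 1024-bit v256uint4 group tested above, under the same header assumption; the helper name split_and_rejoin is illustrative only. The four 256-bit quarters can be pulled out with extract_v64uint4 and reassembled with the four-operand concat:

v256uint4 split_and_rejoin(v256uint4 a) {
  v64uint4 q0 = extract_v64uint4(a, 0); // quarters are addressed by idx 0..3
  v64uint4 q1 = extract_v64uint4(a, 1); // (the checks show idx reduced with srem ..., 4)
  v64uint4 q2 = extract_v64uint4(a, 2);
  v64uint4 q3 = extract_v64uint4(a, 3);
  return concat(q0, q1, q2, q3);        // assumed to reproduce a when quarters stay in order
}

Alternatively, set_v256uint4(0, q0) followed by insert calls for idx 1..3 is expected to build the same value incrementally.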
@name Extract 512-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <64 x i8> @_Z22test_extract_v128uint4Dv128_DU8_i( +// CHECK-SAME: <128 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <128 x i8> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL17EXTRACT_V128UINT4DV128_DU8_I_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL17EXTRACT_V128UINT4DV128_DU8_I_EXIT]] +// CHECK: _ZL17extract_v128uint4Dv128_DU8_i.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <64 x i8> +// CHECK-NEXT: ret <64 x i8> [[TMP2]] +// +v128uint4 test_extract_v128uint4(v256uint4 a, int idx) { + return extract_v128uint4(a, idx); +} + +//! @name Insert 512-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z11test_insertDv128_DU8_iDv64_S_( +// CHECK-SAME: <128 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <64 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <128 x i8> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <64 x i8> [[B]] to <16 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_DU8_IDV64_S__EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_DU8_IDV64_S__EXIT]] +// CHECK: _ZL6insertDv128_DU8_iDv64_S_.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP3]] +// +v256uint4 test_insert(v256uint4 a, int idx, v128uint4 b) { + return insert(a, idx, b); +} + +//! 
@name Set 512-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z18test_set_v256uint4iDv64_DU8_( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <64 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i8> [[B]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V256UINT4IDV64_DU8__EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V256UINT4IDV64_DU8__EXIT]] +// CHECK: _ZL13set_v256uint4iDv64_DU8_.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP2]] +// +v256uint4 test_set_v256uint4(int idx, v128uint4 b) { + return set_v256uint4(idx, b); +} + +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z11test_concatDv64_DU8_S0_( +// CHECK-SAME: <64 x i8> noundef [[A0:%.*]], <64 x i8> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i8> [[A0]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <64 x i8> [[A1]] to <16 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> [[TMP1]], <32 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[SHUFFLE_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP2]] +// +v256uint4 test_concat(v128uint4 a0, v128uint4 a1) { return concat(a0, a1); } + +// v256int4 + +//! 
@name Extract 256-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <32 x i8> @_Z20test_extract_v64int4Dv128_DB8_i( +// CHECK-SAME: <128 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <128 x i8> [[A]] to <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN3_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN8_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V64INT4DV128_DB8_I_EXIT:%.*]] +// CHECK: if.then3.i.i: +// CHECK-NEXT: [[SHUFFLE4_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V64INT4DV128_DB8_I_EXIT]] +// CHECK: if.then8.i.i: +// CHECK-NEXT: [[SHUFFLE9_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V64INT4DV128_DB8_I_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE10_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V64INT4DV128_DB8_I_EXIT]] +// CHECK: _ZL15extract_v64int4Dv128_DB8_i.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE4_I_I]], [[IF_THEN3_I_I]] ], [ [[SHUFFLE9_I_I]], [[IF_THEN8_I_I]] ], [ [[SHUFFLE10_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[RETVAL_0_I_I]] to <32 x i8> +// CHECK-NEXT: ret <32 x i8> [[TMP1]] +// +v64int4 test_extract_v64int4(v256int4 a, int idx) { + return extract_v64int4(a, idx); +} + +//! 
@name Insert 256-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z11test_insertDv128_DB8_iDv32_S_( +// CHECK-SAME: <128 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <32 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <128 x i8> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i8> [[B]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN5_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_DB8_IDV32_S__EXIT:%.*]] +// CHECK: if.then5.i.i: +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_DB8_IDV32_S__EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_DB8_IDV32_S__EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_DB8_IDV32_S__EXIT]] +// CHECK: _ZL6insertDv128_DB8_iDv32_S_.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE2_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN5_I_I]] ], [ [[SHUFFLE11_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP2]] +// +v256int4 test_insert(v256int4 a, int idx, v64int4 b) { + return insert(a, idx, b); +} + +//! 
@name Set 256-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z17test_set_v256int4iDv32_DB8_( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <32 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i8> [[B]] to <8 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN4_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V256INT4IDV32_DB8__EXIT:%.*]] +// CHECK: if.then4.i.i: +// CHECK-NEXT: [[SHUFFLE5_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE5_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V256INT4IDV32_DB8__EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE11_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V256INT4IDV32_DB8__EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE13_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE14_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE13_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V256INT4IDV32_DB8__EXIT]] +// CHECK: _ZL12set_v256int4iDv32_DB8_.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN4_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE14_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP1]] +// +v256int4 test_set_v256int4(int idx, v64int4 b) { return set_v256int4(idx, b); } + +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z11test_concatDv32_DB8_S0_S0_S0_( +// CHECK-SAME: <32 x i8> noundef [[A0:%.*]], <32 x i8> noundef [[A1:%.*]], <32 x i8> noundef [[A2:%.*]], <32 x i8> noundef [[A3:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i8> [[A0]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i8> [[A1]] to <8 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i8> [[A2]] to <8 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i8> [[A3]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[TMP1]], <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <16 x i32> +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[SHUFFLE1_I_I]], <32 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i32> [[SHUFFLE2_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP4]] +// +v256int4 test_concat(v64int4 a0, v64int4 a1, v64int4 a2, v64int4 a3) { + return concat(a0, a1, a2, a3); +} + +// v256int4 + +//! 
@name Extract 512-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <64 x i8> @_Z21test_extract_v128int4Dv128_DB8_i( +// CHECK-SAME: <128 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <128 x i8> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V128INT4DV128_DB8_I_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V128INT4DV128_DB8_I_EXIT]] +// CHECK: _ZL16extract_v128int4Dv128_DB8_i.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <64 x i8> +// CHECK-NEXT: ret <64 x i8> [[TMP2]] +// +v128int4 test_extract_v128int4(v256int4 a, int idx) { + return extract_v128int4(a, idx); +} + +//! @name Insert 512-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z11test_insertDv128_DB8_iDv64_S_( +// CHECK-SAME: <128 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <64 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <128 x i8> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <64 x i8> [[B]] to <16 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_DB8_IDV64_S__EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_DB8_IDV64_S__EXIT]] +// CHECK: _ZL6insertDv128_DB8_iDv64_S_.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP3]] +// +v256int4 test_insert(v256int4 a, int idx, v128int4 b) { + return insert(a, idx, b); +} + +//! 
@name Set 512-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z17test_set_v256int4iDv64_DB8_( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <64 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i8> [[B]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V256INT4IDV64_DB8__EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V256INT4IDV64_DB8__EXIT]] +// CHECK: _ZL12set_v256int4iDv64_DB8_.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP2]] +// +v256int4 test_set_v256int4(int idx, v128int4 b) { return set_v256int4(idx, b); } + +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z11test_concatDv64_DB8_S0_( +// CHECK-SAME: <64 x i8> noundef [[A0:%.*]], <64 x i8> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i8> [[A0]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <64 x i8> [[A1]] to <16 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> [[TMP1]], <32 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[SHUFFLE_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP2]] +// +v256int4 test_concat(v128int4 a0, v128int4 a1) { return concat(a0, a1); } + +// v128uint8 + +//! 
@name Extract 256-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <32 x i8> @_Z21test_extract_v32uint8Dv128_hi( +// CHECK-SAME: <128 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <128 x i8> [[A]] to <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN3_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN8_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V32UINT8DV128_HI_EXIT:%.*]] +// CHECK: if.then3.i.i: +// CHECK-NEXT: [[SHUFFLE4_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V32UINT8DV128_HI_EXIT]] +// CHECK: if.then8.i.i: +// CHECK-NEXT: [[SHUFFLE9_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V32UINT8DV128_HI_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE10_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V32UINT8DV128_HI_EXIT]] +// CHECK: _ZL16extract_v32uint8Dv128_hi.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE4_I_I]], [[IF_THEN3_I_I]] ], [ [[SHUFFLE9_I_I]], [[IF_THEN8_I_I]] ], [ [[SHUFFLE10_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[RETVAL_0_I_I]] to <32 x i8> +// CHECK-NEXT: ret <32 x i8> [[TMP1]] +// +v32uint8 test_extract_v32uint8(v128uint8 a, int idx) { + return extract_v32uint8(a, idx); +} + +//! 
@name Insert 256-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z11test_insertDv128_hiDv32_h( +// CHECK-SAME: <128 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <32 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <128 x i8> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i8> [[B]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN5_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_HIDV32_H_EXIT:%.*]] +// CHECK: if.then5.i.i: +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_HIDV32_H_EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_HIDV32_H_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_HIDV32_H_EXIT]] +// CHECK: _ZL6insertDv128_hiDv32_h.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE2_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN5_I_I]] ], [ [[SHUFFLE11_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP2]] +// +v128uint8 test_insert(v128uint8 a, int idx, v32uint8 b) { + return insert(a, idx, b); +} + +//! 
@name Set 256-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z18test_set_v128uint8iDv32_h( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <32 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i8> [[B]] to <8 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN4_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V128UINT8IDV32_H_EXIT:%.*]] +// CHECK: if.then4.i.i: +// CHECK-NEXT: [[SHUFFLE5_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE5_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V128UINT8IDV32_H_EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE11_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V128UINT8IDV32_H_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE13_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE14_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE13_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V128UINT8IDV32_H_EXIT]] +// CHECK: _ZL13set_v128uint8iDv32_h.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN4_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE14_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP1]] +// +v128uint8 test_set_v128uint8(int idx, v32uint8 b) { + return set_v128uint8(idx, b); +} + +// +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z11test_concatDv32_hS_S_S_( +// CHECK-SAME: <32 x i8> noundef [[A0:%.*]], <32 x i8> noundef [[A1:%.*]], <32 x i8> noundef [[A2:%.*]], <32 x i8> noundef [[A3:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i8> [[A0]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i8> [[A1]] to <8 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i8> [[A2]] to <8 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i8> [[A3]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[TMP1]], <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <16 x i32> +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[SHUFFLE1_I_I]], <32 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i32> [[SHUFFLE2_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP4]] +// +v128uint8 test_concat(v32uint8 a0, v32uint8 a1, v32uint8 a2, v32uint8 a3) { + return concat(a0, a1, a2, a3); +} + +// v128uint8 + +//! 
@name Extract 512-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <64 x i8> @_Z21test_extract_v64uint8Dv128_hi( +// CHECK-SAME: <128 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <128 x i8> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V64UINT8DV128_HI_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V64UINT8DV128_HI_EXIT]] +// CHECK: _ZL16extract_v64uint8Dv128_hi.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <64 x i8> +// CHECK-NEXT: ret <64 x i8> [[TMP2]] +// +v64uint8 test_extract_v64uint8(v128uint8 a, int idx) { + return extract_v64uint8(a, idx); +} + +//! @name Insert 512-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z11test_insertDv128_hiDv64_h( +// CHECK-SAME: <128 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <64 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <128 x i8> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <64 x i8> [[B]] to <16 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_HIDV64_H_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_HIDV64_H_EXIT]] +// CHECK: _ZL6insertDv128_hiDv64_h.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP3]] +// +v128uint8 test_insert(v128uint8 a, int idx, v64uint8 b) { + return insert(a, idx, b); +} + +//! 
@name Set 512-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z18test_set_v128uint8iDv64_h( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <64 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i8> [[B]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V128UINT8IDV64_H_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V128UINT8IDV64_H_EXIT]] +// CHECK: _ZL13set_v128uint8iDv64_h.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP2]] +// +v128uint8 test_set_v128uint8(int idx, v64uint8 b) { + return set_v128uint8(idx, b); +} + +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z11test_concatDv64_hS_( +// CHECK-SAME: <64 x i8> noundef [[A0:%.*]], <64 x i8> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i8> [[A0]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <64 x i8> [[A1]] to <16 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> [[TMP1]], <32 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[SHUFFLE_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP2]] +// +v128uint8 test_concat(v64uint8 a0, v64uint8 a1) { return concat(a0, a1); } + +// v128int8 + +//! 
@name Extract 256-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <32 x i8> @_Z20test_extract_v32int8Dv128_ai( +// CHECK-SAME: <128 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <128 x i8> [[A]] to <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN3_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN8_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V32INT8DV128_AI_EXIT:%.*]] +// CHECK: if.then3.i.i: +// CHECK-NEXT: [[SHUFFLE4_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V32INT8DV128_AI_EXIT]] +// CHECK: if.then8.i.i: +// CHECK-NEXT: [[SHUFFLE9_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V32INT8DV128_AI_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE10_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V32INT8DV128_AI_EXIT]] +// CHECK: _ZL15extract_v32int8Dv128_ai.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE4_I_I]], [[IF_THEN3_I_I]] ], [ [[SHUFFLE9_I_I]], [[IF_THEN8_I_I]] ], [ [[SHUFFLE10_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[RETVAL_0_I_I]] to <32 x i8> +// CHECK-NEXT: ret <32 x i8> [[TMP1]] +// +v32int8 test_extract_v32int8(v128int8 a, int idx) { + return extract_v32int8(a, idx); +} + +//! 
@name Insert 256-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z11test_insertDv128_aiDv32_a( +// CHECK-SAME: <128 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <32 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <128 x i8> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i8> [[B]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN5_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_AIDV32_A_EXIT:%.*]] +// CHECK: if.then5.i.i: +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_AIDV32_A_EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_AIDV32_A_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_AIDV32_A_EXIT]] +// CHECK: _ZL6insertDv128_aiDv32_a.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE2_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN5_I_I]] ], [ [[SHUFFLE11_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP2]] +// +v128int8 test_insert(v128int8 a, int idx, v32int8 b) { + return insert(a, idx, b); +} + +//! 
@name Set 256-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z17test_set_v128int8iDv32_a( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <32 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i8> [[B]] to <8 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN4_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V128INT8IDV32_A_EXIT:%.*]] +// CHECK: if.then4.i.i: +// CHECK-NEXT: [[SHUFFLE5_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE5_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V128INT8IDV32_A_EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE11_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V128INT8IDV32_A_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE13_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE14_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE13_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V128INT8IDV32_A_EXIT]] +// CHECK: _ZL12set_v128int8iDv32_a.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN4_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE14_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP1]] +// +v128int8 test_set_v128int8(int idx, v32int8 b) { return set_v128int8(idx, b); } + +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z11test_concatDv32_aS_S_S_( +// CHECK-SAME: <32 x i8> noundef [[A0:%.*]], <32 x i8> noundef [[A1:%.*]], <32 x i8> noundef [[A2:%.*]], <32 x i8> noundef [[A3:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i8> [[A0]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i8> [[A1]] to <8 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i8> [[A2]] to <8 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i8> [[A3]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[TMP1]], <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <16 x i32> +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[SHUFFLE1_I_I]], <32 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i32> [[SHUFFLE2_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP4]] +// +v128int8 test_concat(v32int8 a0, v32int8 a1, v32int8 a2, v32int8 a3) { + return concat(a0, a1, a2, a3); +} + +// v128int8 + +//! 
@name Extract 512-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <64 x i8> @_Z20test_extract_v64int8Dv128_ai( +// CHECK-SAME: <128 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <128 x i8> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V64INT8DV128_AI_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V64INT8DV128_AI_EXIT]] +// CHECK: _ZL15extract_v64int8Dv128_ai.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <64 x i8> +// CHECK-NEXT: ret <64 x i8> [[TMP2]] +// +v64int8 test_extract_v64int8(v128int8 a, int idx) { + return extract_v64int8(a, idx); +} + +//! @name Insert 512-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z11test_insertDv128_aiDv64_a( +// CHECK-SAME: <128 x i8> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <64 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <128 x i8> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <64 x i8> [[B]] to <16 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_AIDV64_A_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV128_AIDV64_A_EXIT]] +// CHECK: _ZL6insertDv128_aiDv64_a.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP3]] +// +v128int8 test_insert(v128int8 a, int idx, v64int8 b) { + return insert(a, idx, b); +} + +//! 
@name Set 512-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z17test_set_v128int8iDv64_a( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <64 x i8> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i8> [[B]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V128INT8IDV64_A_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V128INT8IDV64_A_EXIT]] +// CHECK: _ZL12set_v128int8iDv64_a.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP2]] +// +v128int8 test_set_v128int8(int idx, v64int8 b) { return set_v128int8(idx, b); } + +// CHECK-LABEL: define dso_local noundef <128 x i8> @_Z11test_concatDv64_aS_( +// CHECK-SAME: <64 x i8> noundef [[A0:%.*]], <64 x i8> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i8> [[A0]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <64 x i8> [[A1]] to <16 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> [[TMP1]], <32 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[SHUFFLE_I_I]] to <128 x i8> +// CHECK-NEXT: ret <128 x i8> [[TMP2]] +// +v128int8 test_concat(v64int8 a0, v64int8 a1) { return concat(a0, a1); } + +// v64uint16 + +//! 
@name Extract 256-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <16 x i16> @_Z22test_extract_v16uint16Dv64_ti( +// CHECK-SAME: <64 x i16> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i16> [[A]] to <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN3_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN8_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL17EXTRACT_V16UINT16DV64_TI_EXIT:%.*]] +// CHECK: if.then3.i.i: +// CHECK-NEXT: [[SHUFFLE4_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL17EXTRACT_V16UINT16DV64_TI_EXIT]] +// CHECK: if.then8.i.i: +// CHECK-NEXT: [[SHUFFLE9_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL17EXTRACT_V16UINT16DV64_TI_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE10_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL17EXTRACT_V16UINT16DV64_TI_EXIT]] +// CHECK: _ZL17extract_v16uint16Dv64_ti.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE4_I_I]], [[IF_THEN3_I_I]] ], [ [[SHUFFLE9_I_I]], [[IF_THEN8_I_I]] ], [ [[SHUFFLE10_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[RETVAL_0_I_I]] to <16 x i16> +// CHECK-NEXT: ret <16 x i16> [[TMP1]] +// +v16uint16 test_extract_v16uint16(v64uint16 a, int idx) { + return extract_v16uint16(a, idx); +} + +//! 
@name Insert 256-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <64 x i16> @_Z11test_insertDv64_tiDv16_t( +// CHECK-SAME: <64 x i16> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <16 x i16> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i16> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i16> [[B]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN5_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_TIDV16_T_EXIT:%.*]] +// CHECK: if.then5.i.i: +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_TIDV16_T_EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_TIDV16_T_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_TIDV16_T_EXIT]] +// CHECK: _ZL6insertDv64_tiDv16_t.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE2_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN5_I_I]] ], [ [[SHUFFLE11_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <64 x i16> +// CHECK-NEXT: ret <64 x i16> [[TMP2]] +// +v64uint16 test_insert(v64uint16 a, int idx, v16uint16 b) { + return insert(a, idx, b); +} + +//! 
@name Set 256-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <64 x i16> @_Z18test_set_v64uint16iDv16_t( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <16 x i16> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i16> [[B]] to <8 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN4_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V64UINT16IDV16_T_EXIT:%.*]] +// CHECK: if.then4.i.i: +// CHECK-NEXT: [[SHUFFLE5_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE5_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V64UINT16IDV16_T_EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE11_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V64UINT16IDV16_T_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE13_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE14_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE13_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V64UINT16IDV16_T_EXIT]] +// CHECK: _ZL13set_v64uint16iDv16_t.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN4_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE14_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <64 x i16> +// CHECK-NEXT: ret <64 x i16> [[TMP1]] +// +v64uint16 test_set_v64uint16(int idx, v16uint16 b) { + return set_v64uint16(idx, b); +} + +// CHECK-LABEL: define dso_local noundef <64 x i16> @_Z11test_concatDv16_tS_S_S_( +// CHECK-SAME: <16 x i16> noundef [[A0:%.*]], <16 x i16> noundef [[A1:%.*]], <16 x i16> noundef [[A2:%.*]], <16 x i16> noundef [[A3:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i16> [[A0]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i16> [[A1]] to <8 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i16> [[A2]] to <8 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i16> [[A3]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[TMP1]], <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <16 x i32> +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[SHUFFLE1_I_I]], <32 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i32> [[SHUFFLE2_I_I]] to <64 x i16> +// CHECK-NEXT: ret <64 x i16> [[TMP4]] +// +v64uint16 test_concat(v16uint16 a0, v16uint16 a1, v16uint16 a2, v16uint16 a3) { + return concat(a0, a1, a2, a3); +} + +// v64uint16 + +//! 
@name Extract 512-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <32 x i16> @_Z22test_extract_v32uint16Dv64_ti( +// CHECK-SAME: <64 x i16> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i16> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL17EXTRACT_V32UINT16DV64_TI_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL17EXTRACT_V32UINT16DV64_TI_EXIT]] +// CHECK: _ZL17extract_v32uint16Dv64_ti.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <32 x i16> +// CHECK-NEXT: ret <32 x i16> [[TMP2]] +// +v32uint16 test_extract_v32uint16(v64uint16 a, int idx) { + return extract_v32uint16(a, idx); +} + +//! @name Insert 512-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <64 x i16> @_Z11test_insertDv64_tiDv32_t( +// CHECK-SAME: <64 x i16> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <32 x i16> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i16> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i16> [[B]] to <16 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_TIDV32_T_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_TIDV32_T_EXIT]] +// CHECK: _ZL6insertDv64_tiDv32_t.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <64 x i16> +// CHECK-NEXT: ret <64 x i16> [[TMP3]] +// +v64uint16 test_insert(v64uint16 a, int idx, v32uint16 b) { + return insert(a, idx, b); +} + +//! 
@name Set 512-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <64 x i16> @_Z18test_set_v64uint16iDv32_t( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <32 x i16> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i16> [[B]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V64UINT16IDV32_T_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V64UINT16IDV32_T_EXIT]] +// CHECK: _ZL13set_v64uint16iDv32_t.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <64 x i16> +// CHECK-NEXT: ret <64 x i16> [[TMP2]] +// +v64uint16 test_set_v64uint16(int idx, v32uint16 b) { + return set_v64uint16(idx, b); +} + +// CHECK-LABEL: define dso_local noundef <64 x i16> @_Z11test_concatDv32_tS_( +// CHECK-SAME: <32 x i16> noundef [[A0:%.*]], <32 x i16> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i16> [[A0]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i16> [[A1]] to <16 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> [[TMP1]], <32 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[SHUFFLE_I_I]] to <64 x i16> +// CHECK-NEXT: ret <64 x i16> [[TMP2]] +// +v64uint16 test_concat(v32uint16 a0, v32uint16 a1) { return concat(a0, a1); } + +// v64int16 + +//! 
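Aside: a minimal usage sketch, illustrative only and not part of the generated checks, showing how the v64uint16 accessors exercised above compose. It assumes the slot convention these tests encode (an even idx addresses the low 512 bits, an odd idx the high 512 bits); swap_halves is a hypothetical helper name.

  v64uint16 swap_halves(v64uint16 a) {
    v32uint16 lo = extract_v32uint16(a, 0); // low 512-bit half
    v32uint16 hi = extract_v32uint16(a, 1); // high 512-bit half
    return concat(hi, lo);                  // reassemble with the halves swapped
  }

The same round trip can be spelled insert(insert(a, 0, hi), 1, lo); both forms use only intrinsics covered by the checks in this file.
+//!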
@name Extract 256-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <16 x i16> @_Z21test_extract_v16int16Dv64_si( +// CHECK-SAME: <64 x i16> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i16> [[A]] to <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN3_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN8_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V16INT16DV64_SI_EXIT:%.*]] +// CHECK: if.then3.i.i: +// CHECK-NEXT: [[SHUFFLE4_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V16INT16DV64_SI_EXIT]] +// CHECK: if.then8.i.i: +// CHECK-NEXT: [[SHUFFLE9_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V16INT16DV64_SI_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE10_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V16INT16DV64_SI_EXIT]] +// CHECK: _ZL16extract_v16int16Dv64_si.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE4_I_I]], [[IF_THEN3_I_I]] ], [ [[SHUFFLE9_I_I]], [[IF_THEN8_I_I]] ], [ [[SHUFFLE10_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[RETVAL_0_I_I]] to <16 x i16> +// CHECK-NEXT: ret <16 x i16> [[TMP1]] +// +v16int16 test_extract_v16int16(v64int16 a, int idx) { + return extract_v16int16(a, idx); +} + +//! 
@name Insert 256-bit in 1024-bit register +// +// CHECK-LABEL: define dso_local noundef <64 x i16> @_Z11test_insertDv64_siDv16_s( +// CHECK-SAME: <64 x i16> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <16 x i16> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i16> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i16> [[B]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN5_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_SIDV16_S_EXIT:%.*]] +// CHECK: if.then5.i.i: +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_SIDV16_S_EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_SIDV16_S_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_SIDV16_S_EXIT]] +// CHECK: _ZL6insertDv64_siDv16_s.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE2_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN5_I_I]] ], [ [[SHUFFLE11_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <64 x i16> +// CHECK-NEXT: ret <64 x i16> [[TMP2]] +// +v64int16 test_insert(v64int16 a, int idx, v16int16 b) { + return insert(a, idx, b); +} + +//! 
@name Set 256-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <64 x i16> @_Z17test_set_v64int16iDv16_s( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <16 x i16> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i16> [[B]] to <8 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN4_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V64INT16IDV16_S_EXIT:%.*]] +// CHECK: if.then4.i.i: +// CHECK-NEXT: [[SHUFFLE5_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE5_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V64INT16IDV16_S_EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE11_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V64INT16IDV16_S_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE13_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE14_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE13_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V64INT16IDV16_S_EXIT]] +// CHECK: _ZL12set_v64int16iDv16_s.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN4_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE14_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <64 x i16> +// CHECK-NEXT: ret <64 x i16> [[TMP1]] +// +v64int16 test_set_v64int16(int idx, v16int16 b) { return set_v64int16(idx, b); } + +// CHECK-LABEL: define dso_local noundef <64 x i16> @_Z11test_concatDv16_sS_S_S_( +// CHECK-SAME: <16 x i16> noundef [[A0:%.*]], <16 x i16> noundef [[A1:%.*]], <16 x i16> noundef [[A2:%.*]], <16 x i16> noundef [[A3:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i16> [[A0]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i16> [[A1]] to <8 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i16> [[A2]] to <8 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i16> [[A3]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[TMP1]], <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <16 x i32> +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[SHUFFLE1_I_I]], <32 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i32> [[SHUFFLE2_I_I]] to <64 x i16> +// CHECK-NEXT: ret <64 x i16> [[TMP4]] +// +v64int16 test_concat(v16int16 a0, v16int16 a1, v16int16 a2, v16int16 a3) { + return concat(a0, a1, a2, a3); +} + +// v64int16 + +//! 
@name Extract 512-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <32 x i16> @_Z21test_extract_v32int16Dv64_si( +// CHECK-SAME: <64 x i16> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i16> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V32INT16DV64_SI_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V32INT16DV64_SI_EXIT]] +// CHECK: _ZL16extract_v32int16Dv64_si.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <32 x i16> +// CHECK-NEXT: ret <32 x i16> [[TMP2]] +// +v32int16 test_extract_v32int16(v64int16 a, int idx) { + return extract_v32int16(a, idx); +} + +//! @name Insert 512-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <64 x i16> @_Z11test_insertDv64_siDv32_s( +// CHECK-SAME: <64 x i16> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <32 x i16> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x i16> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i16> [[B]] to <16 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_SIDV32_S_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_SIDV32_S_EXIT]] +// CHECK: _ZL6insertDv64_siDv32_s.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <64 x i16> +// CHECK-NEXT: ret <64 x i16> [[TMP3]] +// +v64int16 test_insert(v64int16 a, int idx, v32int16 b) { + return insert(a, idx, b); +} + +//! 
@name Set 512-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <64 x i16> @_Z17test_set_v64int16iDv32_s( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <32 x i16> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i16> [[B]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V64INT16IDV32_S_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V64INT16IDV32_S_EXIT]] +// CHECK: _ZL12set_v64int16iDv32_s.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <64 x i16> +// CHECK-NEXT: ret <64 x i16> [[TMP2]] +// +v64int16 test_set_v64int16(int idx, v32int16 b) { return set_v64int16(idx, b); } + +// CHECK-LABEL: define dso_local noundef <64 x i16> @_Z11test_concatDv32_sS_( +// CHECK-SAME: <32 x i16> noundef [[A0:%.*]], <32 x i16> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x i16> [[A0]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i16> [[A1]] to <16 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> [[TMP1]], <32 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[SHUFFLE_I_I]] to <64 x i16> +// CHECK-NEXT: ret <64 x i16> [[TMP2]] +// +v64int16 test_concat(v32int16 a0, v32int16 a1) { return concat(a0, a1); } + +// v32uint32 + +//! 
@name Extract 256-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <8 x i32> @_Z21test_extract_v8uint32Dv32_ji( +// CHECK-SAME: <32 x i32> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN3_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN8_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[A]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V8UINT32DV32_JI_EXIT:%.*]] +// CHECK: if.then3.i.i: +// CHECK-NEXT: [[SHUFFLE4_I_I:%.*]] = shufflevector <32 x i32> [[A]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V8UINT32DV32_JI_EXIT]] +// CHECK: if.then8.i.i: +// CHECK-NEXT: [[SHUFFLE9_I_I:%.*]] = shufflevector <32 x i32> [[A]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V8UINT32DV32_JI_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE10_I_I:%.*]] = shufflevector <32 x i32> [[A]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V8UINT32DV32_JI_EXIT]] +// CHECK: _ZL16extract_v8uint32Dv32_ji.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE4_I_I]], [[IF_THEN3_I_I]] ], [ [[SHUFFLE9_I_I]], [[IF_THEN8_I_I]] ], [ [[SHUFFLE10_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: ret <8 x i32> [[RETVAL_0_I_I]] +// +v8uint32 test_extract_v8uint32(v32uint32 a, int idx) { + return extract_v8uint32(a, idx); +} + +//! @name Insert 256-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <32 x i32> @_Z11test_insertDv32_jiDv8_j( +// CHECK-SAME: <32 x i32> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <8 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN5_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[A]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_JIDV8_J_EXIT:%.*]] +// CHECK: if.then5.i.i: +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[A]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_JIDV8_J_EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[A]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_JIDV8_J_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[A]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_JIDV8_J_EXIT]] +// CHECK: _ZL6insertDv32_jiDv8_j.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE2_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN5_I_I]] ], [ [[SHUFFLE11_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: ret <32 x 
i32> [[RETVAL_0_I_I]] +// +v32uint32 test_insert(v32uint32 a, int idx, v8uint32 b) { + return insert(a, idx, b); +} + +//! @name Set 256-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <32 x i32> @_Z18test_set_v32uint32iDv8_j( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <8 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN4_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V32UINT32IDV8_J_EXIT:%.*]] +// CHECK: if.then4.i.i: +// CHECK-NEXT: [[SHUFFLE5_I_I:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE5_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V32UINT32IDV8_J_EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE11_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V32UINT32IDV8_J_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE13_I_I:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE14_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE13_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V32UINT32IDV8_J_EXIT]] +// CHECK: _ZL13set_v32uint32iDv8_j.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN4_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE14_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: ret <32 x i32> [[RETVAL_0_I_I]] +// +v32uint32 test_set_v32uint32(int idx, v8uint32 b) { + return set_v32uint32(idx, b); +} + +// CHECK-LABEL: define dso_local noundef <32 x i32> @_Z11test_concatDv8_jS_S_S_( +// CHECK-SAME: <8 x i32> noundef [[A0:%.*]], <8 x i32> noundef [[A1:%.*]], <8 x i32> noundef [[A2:%.*]], <8 x i32> noundef [[A3:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[A0]], <8 x i32> [[A1]], <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <8 x i32> [[A2]], <8 x i32> [[A3]], <16 x i32> +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[SHUFFLE1_I_I]], <32 x i32> +// CHECK-NEXT: ret <32 x i32> [[SHUFFLE2_I_I]] +// +v32uint32 test_concat(v8uint32 a0, v8uint32 a1, v8uint32 a2, v8uint32 a3) { + return concat(a0, a1, a2, a3); +} + +// v32uint32 + +//! 
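Aside: a minimal usage sketch, illustrative only and not part of the generated checks, for the 256-bit accessors on v32uint32 exercised above. The tests select one of four quarter slots via idx % 4 (the srem in the IR), with 0 naming the lowest 256 bits; broadcast_quarter is a hypothetical helper name.

  v32uint32 broadcast_quarter(v32uint32 a, int lane) {
    v8uint32 q = extract_v8uint32(a, lane); // read one 256-bit quarter
    return concat(q, q, q, q);              // replicate it across all four slots
  }

Building the result with set_v32uint32(lane, q) instead would populate a single quarter and leave the others unspecified, which matches the undef operands in the set_* checks above.
+//!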
@name Extract 512-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <16 x i32> @_Z22test_extract_v16uint32Dv32_ji( +// CHECK-SAME: <32 x i32> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[A]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL17EXTRACT_V16UINT32DV32_JI_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[A]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL17EXTRACT_V16UINT32DV32_JI_EXIT]] +// CHECK: _ZL17extract_v16uint32Dv32_ji.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: ret <16 x i32> [[RETVAL_0_I_I]] +// +v16uint32 test_extract_v16uint32(v32uint32 a, int idx) { + return extract_v16uint32(a, idx); +} + +//! @name Insert 512-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <32 x i32> @_Z11test_insertDv32_jiDv16_j( +// CHECK-SAME: <32 x i32> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <16 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[A]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_JIDV16_J_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[A]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_JIDV16_J_EXIT]] +// CHECK: _ZL6insertDv32_jiDv16_j.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: ret <32 x i32> [[RETVAL_0_I_I]] +// +v32uint32 test_insert(v32uint32 a, int idx, v16uint32 b) { + return insert(a, idx, b); +} + +//! 
@name Set 512-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <32 x i32> @_Z18test_set_v32uint32iDv16_j( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <16 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V32UINT32IDV16_J_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL13SET_V32UINT32IDV16_J_EXIT]] +// CHECK: _ZL13set_v32uint32iDv16_j.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: ret <32 x i32> [[RETVAL_0_I_I]] +// +v32uint32 test_set_v32uint32(int idx, v16uint32 b) { + return set_v32uint32(idx, b); +} + +// CHECK-LABEL: define dso_local noundef <32 x i32> @_Z11test_concatDv16_jS_( +// CHECK-SAME: <16 x i32> noundef [[A0:%.*]], <16 x i32> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[A0]], <16 x i32> [[A1]], <32 x i32> +// CHECK-NEXT: ret <32 x i32> [[SHUFFLE_I_I]] +// +v32uint32 test_concat(v16uint32 a0, v16uint32 a1) { return concat(a0, a1); } + +// v32int32 + +//! @name Extract 256-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <8 x i32> @_Z20test_extract_v8int32Dv32_ii( +// CHECK-SAME: <32 x i32> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN3_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN8_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[A]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V8INT32DV32_II_EXIT:%.*]] +// CHECK: if.then3.i.i: +// CHECK-NEXT: [[SHUFFLE4_I_I:%.*]] = shufflevector <32 x i32> [[A]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V8INT32DV32_II_EXIT]] +// CHECK: if.then8.i.i: +// CHECK-NEXT: [[SHUFFLE9_I_I:%.*]] = shufflevector <32 x i32> [[A]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V8INT32DV32_II_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE10_I_I:%.*]] = shufflevector <32 x i32> [[A]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V8INT32DV32_II_EXIT]] +// CHECK: _ZL15extract_v8int32Dv32_ii.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE4_I_I]], [[IF_THEN3_I_I]] ], [ [[SHUFFLE9_I_I]], [[IF_THEN8_I_I]] ], [ [[SHUFFLE10_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: ret <8 x i32> [[RETVAL_0_I_I]] +// +v8int32 test_extract_v8int32(v32int32 a, int idx) { + return extract_v8int32(a, idx); +} + +//! 
@name Insert 256-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <32 x i32> @_Z11test_insertDv32_iiDv8_i( +// CHECK-SAME: <32 x i32> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <8 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN5_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[A]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_IIDV8_I_EXIT:%.*]] +// CHECK: if.then5.i.i: +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[A]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_IIDV8_I_EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[A]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_IIDV8_I_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[A]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_IIDV8_I_EXIT]] +// CHECK: _ZL6insertDv32_iiDv8_i.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE2_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN5_I_I]] ], [ [[SHUFFLE11_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: ret <32 x i32> [[RETVAL_0_I_I]] +// +v32int32 test_insert(v32int32 a, int idx, v8int32 b) { + return insert(a, idx, b); +} + +//! 
@name Set 256-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <32 x i32> @_Z17test_set_v32int32iDv8_i( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <8 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN4_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V32INT32IDV8_I_EXIT:%.*]] +// CHECK: if.then4.i.i: +// CHECK-NEXT: [[SHUFFLE5_I_I:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE5_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V32INT32IDV8_I_EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE11_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V32INT32IDV8_I_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE13_I_I:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE14_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE13_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V32INT32IDV8_I_EXIT]] +// CHECK: _ZL12set_v32int32iDv8_i.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN4_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE14_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: ret <32 x i32> [[RETVAL_0_I_I]] +// +v32int32 test_set_v32int32(int idx, v8int32 b) { return set_v32int32(idx, b); } + +// CHECK-LABEL: define dso_local noundef <32 x i32> @_Z11test_concatDv8_iS_S_S_( +// CHECK-SAME: <8 x i32> noundef [[A0:%.*]], <8 x i32> noundef [[A1:%.*]], <8 x i32> noundef [[A2:%.*]], <8 x i32> noundef [[A3:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[A0]], <8 x i32> [[A1]], <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <8 x i32> [[A2]], <8 x i32> [[A3]], <16 x i32> +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[SHUFFLE1_I_I]], <32 x i32> +// CHECK-NEXT: ret <32 x i32> [[SHUFFLE2_I_I]] +// +v32int32 test_concat(v8int32 a0, v8int32 a1, v8int32 a2, v8int32 a3) { + return concat(a0, a1, a2, a3); +} + +// v32int32 + +//! 
@name Extract 512-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <16 x i32> @_Z21test_extract_v16int32Dv32_ii( +// CHECK-SAME: <32 x i32> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[A]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V16INT32DV32_II_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[A]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V16INT32DV32_II_EXIT]] +// CHECK: _ZL16extract_v16int32Dv32_ii.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: ret <16 x i32> [[RETVAL_0_I_I]] +// +v16int32 test_extract_v16int32(v32int32 a, int idx) { + return extract_v16int32(a, idx); +} + +//! @name Insert 512-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <32 x i32> @_Z11test_insertDv32_iiDv16_i( +// CHECK-SAME: <32 x i32> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <16 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[A]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_IIDV16_I_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[A]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_IIDV16_I_EXIT]] +// CHECK: _ZL6insertDv32_iiDv16_i.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: ret <32 x i32> [[RETVAL_0_I_I]] +// +v32int32 test_insert(v32int32 a, int idx, v16int32 b) { + return insert(a, idx, b); +} + +//! 
@name Set 512-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <32 x i32> @_Z17test_set_v32int32iDv16_i( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <16 x i32> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V32INT32IDV16_I_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[B]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V32INT32IDV16_I_EXIT]] +// CHECK: _ZL12set_v32int32iDv16_i.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: ret <32 x i32> [[RETVAL_0_I_I]] +// +v32int32 test_set_v32int32(int idx, v16int32 b) { return set_v32int32(idx, b); } + +// CHECK-LABEL: define dso_local noundef <32 x i32> @_Z11test_concatDv16_iS_( +// CHECK-SAME: <16 x i32> noundef [[A0:%.*]], <16 x i32> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[A0]], <16 x i32> [[A1]], <32 x i32> +// CHECK-NEXT: ret <32 x i32> [[SHUFFLE_I_I]] +// +v32int32 test_concat(v16int32 a0, v16int32 a1) { return concat(a0, a1); } + +// v64bfloat16 + +//! @name Extract 256-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <16 x bfloat> @_Z24test_extract_v16bfloat16Dv64_u6__bf16i( +// CHECK-SAME: <64 x bfloat> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x bfloat> [[A]] to <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN3_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN8_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL19EXTRACT_V16BFLOAT16DV64_U6__BF16I_EXIT:%.*]] +// CHECK: if.then3.i.i: +// CHECK-NEXT: [[SHUFFLE4_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL19EXTRACT_V16BFLOAT16DV64_U6__BF16I_EXIT]] +// CHECK: if.then8.i.i: +// CHECK-NEXT: [[SHUFFLE9_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL19EXTRACT_V16BFLOAT16DV64_U6__BF16I_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE10_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL19EXTRACT_V16BFLOAT16DV64_U6__BF16I_EXIT]] +// CHECK: _ZL19extract_v16bfloat16Dv64_u6__bf16i.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE4_I_I]], [[IF_THEN3_I_I]] ], [ [[SHUFFLE9_I_I]], [[IF_THEN8_I_I]] ], [ [[SHUFFLE10_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[RETVAL_0_I_I]] to <16 x bfloat> +// CHECK-NEXT: ret <16 x bfloat> [[TMP1]] +// +v16bfloat16 test_extract_v16bfloat16(v64bfloat16 a, int idx) { + return extract_v16bfloat16(a, idx); +} + +//! 
@name Insert 256-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <64 x bfloat> @_Z11test_insertDv64_u6__bf16iDv16_u6__bf16( +// CHECK-SAME: <64 x bfloat> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <16 x bfloat> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x bfloat> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x bfloat> [[B]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN5_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_U6__BF16IDV16_U6__BF16_EXIT:%.*]] +// CHECK: if.then5.i.i: +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_U6__BF16IDV16_U6__BF16_EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_U6__BF16IDV16_U6__BF16_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_U6__BF16IDV16_U6__BF16_EXIT]] +// CHECK: _ZL6insertDv64_u6__bf16iDv16_u6__bf16.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE2_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN5_I_I]] ], [ [[SHUFFLE11_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <64 x bfloat> +// CHECK-NEXT: ret <64 x bfloat> [[TMP2]] +// +v64bfloat16 test_insert(v64bfloat16 a, int idx, v16bfloat16 b) { + return insert(a, idx, b); +} + +//! 
@name Set 256-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <64 x bfloat> @_Z20test_set_v64bfloat16iDv16_u6__bf16( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <16 x bfloat> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x bfloat> [[B]] to <8 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN4_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL15SET_V64BFLOAT16IDV16_U6__BF16_EXIT:%.*]] +// CHECK: if.then4.i.i: +// CHECK-NEXT: [[SHUFFLE5_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE5_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL15SET_V64BFLOAT16IDV16_U6__BF16_EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE11_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL15SET_V64BFLOAT16IDV16_U6__BF16_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE13_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE14_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE13_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL15SET_V64BFLOAT16IDV16_U6__BF16_EXIT]] +// CHECK: _ZL15set_v64bfloat16iDv16_u6__bf16.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN4_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE14_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <64 x bfloat> +// CHECK-NEXT: ret <64 x bfloat> [[TMP1]] +// +v64bfloat16 test_set_v64bfloat16(int idx, v16bfloat16 b) { + return set_v64bfloat16(idx, b); +} + +// CHECK-LABEL: define dso_local noundef <64 x bfloat> @_Z11test_concatDv16_u6__bf16S_S_S_( +// CHECK-SAME: <16 x bfloat> noundef [[A0:%.*]], <16 x bfloat> noundef [[A1:%.*]], <16 x bfloat> noundef [[A2:%.*]], <16 x bfloat> noundef [[A3:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x bfloat> [[A0]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x bfloat> [[A1]] to <8 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x bfloat> [[A2]] to <8 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x bfloat> [[A3]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[TMP1]], <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <16 x i32> +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[SHUFFLE1_I_I]], <32 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i32> [[SHUFFLE2_I_I]] to <64 x bfloat> +// CHECK-NEXT: ret <64 x bfloat> [[TMP4]] +// +v64bfloat16 test_concat(v16bfloat16 a0, v16bfloat16 a1, v16bfloat16 a2, + v16bfloat16 a3) { + return concat(a0, a1, a2, a3); +} + +// 
v64bfloat16 + +//! @name Extract 512-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <32 x bfloat> @_Z24test_extract_v32bfloat16Dv64_u6__bf16i( +// CHECK-SAME: <64 x bfloat> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x bfloat> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL19EXTRACT_V32BFLOAT16DV64_U6__BF16I_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL19EXTRACT_V32BFLOAT16DV64_U6__BF16I_EXIT]] +// CHECK: _ZL19extract_v32bfloat16Dv64_u6__bf16i.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <32 x bfloat> +// CHECK-NEXT: ret <32 x bfloat> [[TMP2]] +// +v32bfloat16 test_extract_v32bfloat16(v64bfloat16 a, int idx) { + return extract_v32bfloat16(a, idx); +} + +//! @name Insert 512-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <64 x bfloat> @_Z11test_insertDv64_u6__bf16iDv32_u6__bf16( +// CHECK-SAME: <64 x bfloat> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <32 x bfloat> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <64 x bfloat> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x bfloat> [[B]] to <16 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_U6__BF16IDV32_U6__BF16_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV64_U6__BF16IDV32_U6__BF16_EXIT]] +// CHECK: _ZL6insertDv64_u6__bf16iDv32_u6__bf16.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <64 x bfloat> +// CHECK-NEXT: ret <64 x bfloat> [[TMP3]] +// +v64bfloat16 test_insert(v64bfloat16 a, int idx, v32bfloat16 b) { + return insert(a, idx, b); +} + +//! 
@name Set 512-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <64 x bfloat> @_Z20test_set_v64bfloat16iDv32_u6__bf16( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <32 x bfloat> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x bfloat> [[B]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL15SET_V64BFLOAT16IDV32_U6__BF16_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL15SET_V64BFLOAT16IDV32_U6__BF16_EXIT]] +// CHECK: _ZL15set_v64bfloat16iDv32_u6__bf16.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <64 x bfloat> +// CHECK-NEXT: ret <64 x bfloat> [[TMP2]] +// +v64bfloat16 test_set_v64bfloat16(int idx, v32bfloat16 b) { + return set_v64bfloat16(idx, b); +} + +// CHECK-LABEL: define dso_local noundef <64 x bfloat> @_Z11test_concatDv32_u6__bf16S_( +// CHECK-SAME: <32 x bfloat> noundef [[A0:%.*]], <32 x bfloat> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x bfloat> [[A0]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x bfloat> [[A1]] to <16 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> [[TMP1]], <32 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[SHUFFLE_I_I]] to <64 x bfloat> +// CHECK-NEXT: ret <64 x bfloat> [[TMP2]] +// +v64bfloat16 test_concat(v32bfloat16 a0, v32bfloat16 a1) { + return concat(a0, a1); +} + +// v32accfloat + +//! @name Extract 512-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <8 x i64> @_Z24test_extract_v16accfloatDv32_u10__accfloati( +// CHECK-SAME: <16 x i64> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i64> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL19EXTRACT_V16ACCFLOATDV32_U10__ACCFLOATI_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL19EXTRACT_V16ACCFLOATDV32_U10__ACCFLOATI_EXIT]] +// CHECK: _ZL19extract_v16accfloatDv32_u10__accfloati.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <8 x i64> +// CHECK-NEXT: ret <8 x i64> [[TMP2]] +// +v16accfloat test_extract_v16accfloat(v32accfloat a, int idx) { + return extract_v16accfloat(a, idx); +} + +//! 
@name Insert 512-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <16 x i64> @_Z11test_insertDv32_u10__accfloatiDv16_u10__accfloat( +// CHECK-SAME: <16 x i64> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <8 x i64> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i64> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i64> [[B]] to <16 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_U10__ACCFLOATIDV16_U10__ACCFLOAT_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_U10__ACCFLOATIDV16_U10__ACCFLOAT_EXIT]] +// CHECK: _ZL6insertDv32_u10__accfloatiDv16_u10__accfloat.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <16 x i64> +// CHECK-NEXT: ret <16 x i64> [[TMP3]] +// +v32accfloat test_insert(v32accfloat a, int idx, v16accfloat b) { + return insert(a, idx, b); +} + +//! @name Set 512-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <16 x i64> @_Z20test_set_v32accfloatiDv16_u10__accfloat( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <8 x i64> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i64> [[B]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL15SET_V32ACCFLOATIDV16_U10__ACCFLOAT_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL15SET_V32ACCFLOATIDV16_U10__ACCFLOAT_EXIT]] +// CHECK: _ZL15set_v32accfloatiDv16_u10__accfloat.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <16 x i64> +// CHECK-NEXT: ret <16 x i64> [[TMP2]] +// +v32accfloat test_set_v32accfloat(int idx, v16accfloat b) { + return set_v32accfloat(idx, b); +} + +// CHECK-LABEL: define dso_local noundef <16 x i64> @_Z11test_concatDv16_u10__accfloatS_( +// CHECK-SAME: <8 x i64> noundef [[A0:%.*]], <8 x i64> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i64> [[A0]], <8 x i64> [[A1]], <16 x i32> +// CHECK-NEXT: ret <16 x i64> [[TMP0]] +// +v32accfloat test_concat(v16accfloat a0, v16accfloat a1) { + return concat(a0, a1); +} + +// v32float + +//! 
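Aside: a minimal usage sketch, illustrative only and not part of the generated checks, for the 512-bit accumulator accessors above. It assumes the same even/odd half convention as the integer variants; replace_high_half and low_half are hypothetical helper names.

  v32accfloat replace_high_half(v32accfloat acc, v16accfloat hi) {
    return insert(acc, 1, hi);            // overwrite only the high 512 bits
  }

  v16accfloat low_half(v32accfloat acc) {
    return extract_v16accfloat(acc, 0);   // read back the low 512 bits
  }

Both helpers reduce to the single shufflevector selections shown in the checks above.
+//!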
@name Extract 256-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <8 x float> @_Z20test_extract_v8floatDv32_fi( +// CHECK-SAME: <32 x float> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x float> [[A]] to <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN3_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN8_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V8FLOATDV32_FI_EXIT:%.*]] +// CHECK: if.then3.i.i: +// CHECK-NEXT: [[SHUFFLE4_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V8FLOATDV32_FI_EXIT]] +// CHECK: if.then8.i.i: +// CHECK-NEXT: [[SHUFFLE9_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V8FLOATDV32_FI_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE10_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V8FLOATDV32_FI_EXIT]] +// CHECK: _ZL15extract_v8floatDv32_fi.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE4_I_I]], [[IF_THEN3_I_I]] ], [ [[SHUFFLE9_I_I]], [[IF_THEN8_I_I]] ], [ [[SHUFFLE10_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[RETVAL_0_I_I]] to <8 x float> +// CHECK-NEXT: ret <8 x float> [[TMP1]] +// +v8float test_extract_v8float(v32float a, int idx) { + return extract_v8float(a, idx); +} + +//! 
@name Insert 256-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <32 x float> @_Z11test_insertDv32_fiDv8_f( +// CHECK-SAME: <32 x float> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <8 x float> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x float> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[B]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN5_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_FIDV8_F_EXIT:%.*]] +// CHECK: if.then5.i.i: +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_FIDV8_F_EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_FIDV8_F_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_FIDV8_F_EXIT]] +// CHECK: _ZL6insertDv32_fiDv8_f.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE2_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN5_I_I]] ], [ [[SHUFFLE11_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <32 x float> +// CHECK-NEXT: ret <32 x float> [[TMP2]] +// +v32float test_insert(v32float a, int idx, v8float b) { + return insert(a, idx, b); +} + +//! 
@name Set 256-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <32 x float> @_Z17test_set_v32floatiDv8_f( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <8 x float> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x float> [[B]] to <8 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN4_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V32FLOATIDV8_F_EXIT:%.*]] +// CHECK: if.then4.i.i: +// CHECK-NEXT: [[SHUFFLE5_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE5_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V32FLOATIDV8_F_EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE11_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V32FLOATIDV8_F_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE13_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE14_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE13_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V32FLOATIDV8_F_EXIT]] +// CHECK: _ZL12set_v32floatiDv8_f.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN4_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE14_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <32 x float> +// CHECK-NEXT: ret <32 x float> [[TMP1]] +// +v32float test_set_v32float(int idx, v8float b) { return set_v32float(idx, b); } + +// CHECK-LABEL: define dso_local noundef <32 x float> @_Z11test_concatDv8_fS_S_S_( +// CHECK-SAME: <8 x float> noundef [[A0:%.*]], <8 x float> noundef [[A1:%.*]], <8 x float> noundef [[A2:%.*]], <8 x float> noundef [[A3:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SHUFFLE_I_UNCASTED_I:%.*]] = shufflevector <8 x float> [[A0]], <8 x float> [[A1]], <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_UNCASTED_I:%.*]] = shufflevector <8 x float> [[A2]], <8 x float> [[A3]], <16 x i32> +// CHECK-NEXT: [[SHUFFLE2_I_UNCASTED_I:%.*]] = shufflevector <16 x float> [[SHUFFLE_I_UNCASTED_I]], <16 x float> [[SHUFFLE1_I_UNCASTED_I]], <32 x i32> +// CHECK-NEXT: ret <32 x float> [[SHUFFLE2_I_UNCASTED_I]] +// +v32float test_concat(v8float a0, v8float a1, v8float a2, v8float a3) { + return concat(a0, a1, a2, a3); +} + +// v32float + +//! 
@name Extract 512-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <16 x float> @_Z21test_extract_v16floatDv32_fi( +// CHECK-SAME: <32 x float> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x float> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V16FLOATDV32_FI_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V16FLOATDV32_FI_EXIT]] +// CHECK: _ZL16extract_v16floatDv32_fi.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <16 x float> +// CHECK-NEXT: ret <16 x float> [[TMP2]] +// +v16float test_extract_v16float(v32float a, int idx) { + return extract_v16float(a, idx); +} + +//! @name Insert 512-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <32 x float> @_Z11test_insertDv32_fiDv16_f( +// CHECK-SAME: <32 x float> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <16 x float> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <32 x float> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x float> [[B]] to <16 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_FIDV16_F_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_FIDV16_F_EXIT]] +// CHECK: _ZL6insertDv32_fiDv16_f.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <32 x float> +// CHECK-NEXT: ret <32 x float> [[TMP3]] +// +v32float test_insert(v32float a, int idx, v16float b) { + return insert(a, idx, b); +} + +//! 
@name Set 512-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <32 x float> @_Z17test_set_v32floatiDv16_f( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <16 x float> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x float> [[B]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V32FLOATIDV16_F_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V32FLOATIDV16_F_EXIT]] +// CHECK: _ZL12set_v32floatiDv16_f.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <32 x float> +// CHECK-NEXT: ret <32 x float> [[TMP2]] +// +v32float test_set_v32float(int idx, v16float b) { return set_v32float(idx, b); } + +// CHECK-LABEL: define dso_local noundef <32 x float> @_Z11test_concatDv16_fS_( +// CHECK-SAME: <16 x float> noundef [[A0:%.*]], <16 x float> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SHUFFLE_I_UNCASTED_I:%.*]] = shufflevector <16 x float> [[A0]], <16 x float> [[A1]], <32 x i32> +// CHECK-NEXT: ret <32 x float> [[SHUFFLE_I_UNCASTED_I]] +// +v32float test_concat(v16float a0, v16float a1) { return concat(a0, a1); } + +// v32acc32 + +//! @name Extract 512-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <8 x i64> @_Z21test_extract_v16acc32Dv32_u7__acc32i( +// CHECK-SAME: <16 x i64> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i64> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V16ACC32DV32_U7__ACC32I_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL16EXTRACT_V16ACC32DV32_U7__ACC32I_EXIT]] +// CHECK: _ZL16extract_v16acc32Dv32_u7__acc32i.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <8 x i64> +// CHECK-NEXT: ret <8 x i64> [[TMP2]] +// +v16acc32 test_extract_v16acc32(v32acc32 a, int idx) { + return extract_v16acc32(a, idx); +} + +//! 
@name Insert 512-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <16 x i64> @_Z11test_insertDv32_u7__acc32iDv16_u7__acc32( +// CHECK-SAME: <16 x i64> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <8 x i64> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i64> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i64> [[B]] to <16 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_U7__ACC32IDV16_U7__ACC32_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_U7__ACC32IDV16_U7__ACC32_EXIT]] +// CHECK: _ZL6insertDv32_u7__acc32iDv16_u7__acc32.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <16 x i64> +// CHECK-NEXT: ret <16 x i64> [[TMP3]] +// +v32acc32 test_insert(v32acc32 a, int idx, v16acc32 b) { + return insert(a, idx, b); +} + +//! @name Set 512-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <16 x i64> @_Z17test_set_v32acc32iDv16_u7__acc32( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <8 x i64> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i64> [[B]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V32ACC32IDV16_U7__ACC32_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V32ACC32IDV16_U7__ACC32_EXIT]] +// CHECK: _ZL12set_v32acc32iDv16_u7__acc32.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <16 x i64> +// CHECK-NEXT: ret <16 x i64> [[TMP2]] +// +v32acc32 test_set_v32acc32(int idx, v16acc32 b) { return set_v32acc32(idx, b); } + +// CHECK-LABEL: define dso_local noundef <16 x i64> @_Z11test_concatDv16_u7__acc32S_( +// CHECK-SAME: <8 x i64> noundef [[A0:%.*]], <8 x i64> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i64> [[A0]], <8 x i64> [[A1]], <16 x i32> +// CHECK-NEXT: ret <16 x i64> [[TMP0]] +// +v32acc32 test_concat(v16acc32 a0, v16acc32 a1) { return concat(a0, a1); } + +// v16acc64 + +//! 
@name Extract 512-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <8 x i64> @_Z20test_extract_v8acc64Dv16_u7__acc64i( +// CHECK-SAME: <16 x i64> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i64> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V8ACC64DV16_U7__ACC64I_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <16 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V8ACC64DV16_U7__ACC64I_EXIT]] +// CHECK: _ZL15extract_v8acc64Dv16_u7__acc64i.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <8 x i64> +// CHECK-NEXT: ret <8 x i64> [[TMP2]] +// +v8acc64 test_extract_v8acc64(v16acc64 a, int idx) { + return extract_v8acc64(a, idx); +} + +//! @name Insert 512-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <16 x i64> @_Z11test_insertDv16_u7__acc64iDv8_u7__acc64( +// CHECK-SAME: <16 x i64> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <8 x i64> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i64> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i64> [[B]] to <16 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV16_U7__ACC64IDV8_U7__ACC64_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV16_U7__ACC64IDV8_U7__ACC64_EXIT]] +// CHECK: _ZL6insertDv16_u7__acc64iDv8_u7__acc64.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <16 x i64> +// CHECK-NEXT: ret <16 x i64> [[TMP3]] +// +v16acc64 test_insert(v16acc64 a, int idx, v8acc64 b) { + return insert(a, idx, b); +} + +//! 
@name Set 512-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <16 x i64> @_Z17test_set_v16acc64iDv8_u7__acc64( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <8 x i64> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i64> [[B]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V16ACC64IDV8_U7__ACC64_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V16ACC64IDV8_U7__ACC64_EXIT]] +// CHECK: _ZL12set_v16acc64iDv8_u7__acc64.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <16 x i64> +// CHECK-NEXT: ret <16 x i64> [[TMP2]] +// +v16acc64 test_set_v16acc64(int idx, v8acc64 b) { return set_v16acc64(idx, b); } + +// CHECK-LABEL: define dso_local noundef <16 x i64> @_Z11test_concatDv8_u7__acc64S_( +// CHECK-SAME: <8 x i64> noundef [[A0:%.*]], <8 x i64> noundef [[A1:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i64> [[A0]], <8 x i64> [[A1]], <16 x i32> +// CHECK-NEXT: ret <16 x i64> [[TMP0]] +// +v16acc64 test_concat(v8acc64 a0, v8acc64 a1) { return concat(a0, a1); } + +// Tests for 128-bit are skipped due to a lack of support for 128-bit vectors in +// if statements + +// Conversions +// v8accfloat +//! @name Extract 256-bit portion from 512-bit register +// CHECK-LABEL: define dso_local noundef <4 x i64> @_Z23test_extract_v8accfloatDv16_u10__accfloati( +// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i64> [[A]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL18EXTRACT_V8ACCFLOATDV16_U10__ACCFLOATI_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL18EXTRACT_V8ACCFLOATDV16_U10__ACCFLOATI_EXIT]] +// CHECK: _ZL18extract_v8accfloatDv16_u10__accfloati.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[RETVAL_0_I_I]] to <4 x i64> +// CHECK-NEXT: ret <4 x i64> [[TMP2]] +// +v8accfloat test_extract_v8accfloat(v16accfloat a, int idx) { + return extract_v8accfloat(a, idx); +} + +//! 
@name Insert 256-bit in 512-bit register +// CHECK-LABEL: define dso_local noundef <8 x i64> @_Z11test_insertDv16_u10__accfloatiDv8_u10__accfloat( +// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <4 x i64> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i64> [[A]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[B]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[TMP0]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV16_U10__ACCFLOATIDV8_U10__ACCFLOAT_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[TMP0]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV16_U10__ACCFLOATIDV8_U10__ACCFLOAT_EXIT]] +// CHECK: _ZL6insertDv16_u10__accfloatiDv8_u10__accfloat.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <8 x i64> +// CHECK-NEXT: ret <8 x i64> [[TMP3]] +// +v16accfloat test_insert(v16accfloat a, int idx, v8accfloat b) { + return insert(a, idx, b); +} + +//! @name Set 256-bit portion of 512-bit register +// CHECK-LABEL: define dso_local noundef <8 x i64> @_Z20test_set_v16accfloatiDv8_u10__accfloat( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <4 x i64> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i64> [[B]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL15SET_V16ACCFLOATIDV8_U10__ACCFLOAT_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL15SET_V16ACCFLOATIDV8_U10__ACCFLOAT_EXIT]] +// CHECK: _ZL15set_v16accfloatiDv8_u10__accfloat.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <8 x i64> +// CHECK-NEXT: ret <8 x i64> [[TMP2]] +// +v16accfloat test_set_v16accfloat(int idx, v8accfloat b) { + return set_v16accfloat(idx, b); +} + +// CHECK-LABEL: define dso_local noundef <8 x i64> @_Z11test_concatDv8_u10__accfloatS_( +// CHECK-SAME: <4 x i64> noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i64> [[TMP0]] +// +v16accfloat test_concat(v8accfloat a, v8accfloat b) { return concat(a, b); } +// v8acc32 +//! 
@name Extract 256-bit portion from 512-bit register +// CHECK-LABEL: define dso_local noundef <4 x i64> @_Z20test_extract_v8acc32Dv16_u7__acc32i( +// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i64> [[A]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V8ACC32DV16_U7__ACC32I_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V8ACC32DV16_U7__ACC32I_EXIT]] +// CHECK: _ZL15extract_v8acc32Dv16_u7__acc32i.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[RETVAL_0_I_I]] to <4 x i64> +// CHECK-NEXT: ret <4 x i64> [[TMP2]] +// +v8acc32 test_extract_v8acc32(v16acc32 a, int idx) { + return extract_v8acc32(a, idx); +} + +//! @name Insert 256-bit in 512-bit register +// CHECK-LABEL: define dso_local noundef <8 x i64> @_Z11test_insertDv16_u7__acc32iDv8_u7__acc32( +// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <4 x i64> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i64> [[A]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[B]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[TMP0]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV16_U7__ACC32IDV8_U7__ACC32_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[TMP0]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV16_U7__ACC32IDV8_U7__ACC32_EXIT]] +// CHECK: _ZL6insertDv16_u7__acc32iDv8_u7__acc32.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <8 x i64> +// CHECK-NEXT: ret <8 x i64> [[TMP3]] +// +v16acc32 test_insert(v16acc32 a, int idx, v8acc32 b) { + return insert(a, idx, b); +} + +//! 
@name Set 256-bit portion of 512-bit register +// CHECK-LABEL: define dso_local noundef <8 x i64> @_Z17test_set_v16acc32iDv8_u7__acc32( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <4 x i64> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i64> [[B]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V16ACC32IDV8_U7__ACC32_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V16ACC32IDV8_U7__ACC32_EXIT]] +// CHECK: _ZL12set_v16acc32iDv8_u7__acc32.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <8 x i64> +// CHECK-NEXT: ret <8 x i64> [[TMP2]] +// +v16acc32 test_set_v16acc32(int idx, v8acc32 b) { return set_v16acc32(idx, b); } + +// CHECK-LABEL: define dso_local noundef <8 x i64> @_Z11test_concatDv8_u7__acc32S_( +// CHECK-SAME: <4 x i64> noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i64> [[TMP0]] +// +v16acc32 test_concat(v8acc32 a, v8acc32 b) { return concat(a, b); } +// v4acc64 +//! @name Extract 256-bit portion from 512-bit register +// CHECK-LABEL: define dso_local noundef <4 x i64> @_Z20test_extract_v4acc64Dv8_u7__acc64i( +// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i64> [[A]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V4ACC64DV8_U7__ACC64I_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V4ACC64DV8_U7__ACC64I_EXIT]] +// CHECK: _ZL15extract_v4acc64Dv8_u7__acc64i.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[RETVAL_0_I_I]] to <4 x i64> +// CHECK-NEXT: ret <4 x i64> [[TMP2]] +// +v4acc64 test_extract_v4acc64(v8acc64 a, int idx) { + return extract_v4acc64(a, idx); +} + +//! 
@name Insert 256-bit in 512-bit register +// CHECK-LABEL: define dso_local noundef <8 x i64> @_Z11test_insertDv8_u7__acc64iDv4_u7__acc64( +// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <4 x i64> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i64> [[A]] to <16 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[B]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[TMP0]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV8_U7__ACC64IDV4_U7__ACC64_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> [[TMP0]], <16 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV8_U7__ACC64IDV4_U7__ACC64_EXIT]] +// CHECK: _ZL6insertDv8_u7__acc64iDv4_u7__acc64.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE2_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <8 x i64> +// CHECK-NEXT: ret <8 x i64> [[TMP3]] +// +v8acc64 test_insert(v8acc64 a, int idx, v4acc64 b) { return insert(a, idx, b); } + +//! @name Set 256-bit portion of 512-bit register +// CHECK-LABEL: define dso_local noundef <8 x i64> @_Z16test_set_v8acc64iDv4_u7__acc64( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <4 x i64> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i64> [[B]] to <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IDX]], 1 +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[TMP1]], 0 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_ELSE_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL11SET_V8ACC64IDV4_U7__ACC64_EXIT:%.*]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: br label [[_ZL11SET_V8ACC64IDV4_U7__ACC64_EXIT]] +// CHECK: _ZL11set_v8acc64iDv4_u7__acc64.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <16 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE1_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[RETVAL_0_I_I]] to <8 x i64> +// CHECK-NEXT: ret <8 x i64> [[TMP2]] +// +v8acc64 test_set_v8acc64(int idx, v4acc64 b) { return set_v8acc64(idx, b); } + +// CHECK-LABEL: define dso_local noundef <8 x i64> @_Z11test_concatDv4_u7__acc64S_( +// CHECK-SAME: <4 x i64> noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i64> [[TMP0]] +// +v8acc64 test_concat(v4acc64 a, v4acc64 b) { return concat(a, b); } +// v8accfloat +//! 
@name Extract 256-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <4 x i64> @_Z23test_extract_v8accfloatDv32_u10__accfloati( +// CHECK-SAME: <16 x i64> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i64> [[A]] to <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN3_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN8_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL18EXTRACT_V8ACCFLOATDV32_U10__ACCFLOATI_EXIT:%.*]] +// CHECK: if.then3.i.i: +// CHECK-NEXT: [[SHUFFLE4_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL18EXTRACT_V8ACCFLOATDV32_U10__ACCFLOATI_EXIT]] +// CHECK: if.then8.i.i: +// CHECK-NEXT: [[SHUFFLE9_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL18EXTRACT_V8ACCFLOATDV32_U10__ACCFLOATI_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE10_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL18EXTRACT_V8ACCFLOATDV32_U10__ACCFLOATI_EXIT]] +// CHECK: _ZL18extract_v8accfloatDv32_u10__accfloati.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE4_I_I]], [[IF_THEN3_I_I]] ], [ [[SHUFFLE9_I_I]], [[IF_THEN8_I_I]] ], [ [[SHUFFLE10_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[RETVAL_0_I_I]] to <4 x i64> +// CHECK-NEXT: ret <4 x i64> [[TMP1]] +// +v8accfloat test_extract_v8accfloat(v32accfloat a, int idx) { + return extract_v8accfloat(a, idx); +} + +//! 
@name Insert 256-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <16 x i64> @_Z11test_insertDv32_u10__accfloatiDv8_u10__accfloat( +// CHECK-SAME: <16 x i64> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <4 x i64> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i64> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[B]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN5_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_U10__ACCFLOATIDV8_U10__ACCFLOAT_EXIT:%.*]] +// CHECK: if.then5.i.i: +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_U10__ACCFLOATIDV8_U10__ACCFLOAT_EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_U10__ACCFLOATIDV8_U10__ACCFLOAT_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_U10__ACCFLOATIDV8_U10__ACCFLOAT_EXIT]] +// CHECK: _ZL6insertDv32_u10__accfloatiDv8_u10__accfloat.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE2_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN5_I_I]] ], [ [[SHUFFLE11_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <16 x i64> +// CHECK-NEXT: ret <16 x i64> [[TMP2]] +// +v32accfloat test_insert(v32accfloat a, int idx, v8accfloat b) { + return insert(a, idx, b); +} + +//! 
@name Set 256-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <16 x i64> @_Z20test_set_v32accfloatiDv8_u10__accfloat( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <4 x i64> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i64> [[B]] to <8 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN4_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL15SET_V32ACCFLOATIDV8_U10__ACCFLOAT_EXIT:%.*]] +// CHECK: if.then4.i.i: +// CHECK-NEXT: [[SHUFFLE5_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE5_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL15SET_V32ACCFLOATIDV8_U10__ACCFLOAT_EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE11_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL15SET_V32ACCFLOATIDV8_U10__ACCFLOAT_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE13_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE14_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE13_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL15SET_V32ACCFLOATIDV8_U10__ACCFLOAT_EXIT]] +// CHECK: _ZL15set_v32accfloatiDv8_u10__accfloat.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN4_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE14_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <16 x i64> +// CHECK-NEXT: ret <16 x i64> [[TMP1]] +// +v32accfloat test_set_v32accfloat(int idx, v8accfloat b) { + return set_v32accfloat(idx, b); +} + +// CHECK-LABEL: define dso_local noundef <16 x i64> @_Z11test_concatDv8_u10__accfloatS_S_S_( +// CHECK-SAME: <4 x i64> noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]], <4 x i64> noundef [[C:%.*]], <4 x i64> noundef [[D:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[C]], <4 x i64> [[D]], <8 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[TMP0]], <8 x i64> [[TMP1]], <16 x i32> +// CHECK-NEXT: ret <16 x i64> [[TMP2]] +// +v32accfloat test_concat(v8accfloat a, v8accfloat b, v8accfloat c, + v8accfloat d) { + return concat(a, b, c, d); +} +// v8acc32 +//! 
@name Extract 256-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <4 x i64> @_Z20test_extract_v8acc32Dv32_u7__acc32i( +// CHECK-SAME: <16 x i64> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i64> [[A]] to <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN3_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN8_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V8ACC32DV32_U7__ACC32I_EXIT:%.*]] +// CHECK: if.then3.i.i: +// CHECK-NEXT: [[SHUFFLE4_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V8ACC32DV32_U7__ACC32I_EXIT]] +// CHECK: if.then8.i.i: +// CHECK-NEXT: [[SHUFFLE9_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V8ACC32DV32_U7__ACC32I_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE10_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V8ACC32DV32_U7__ACC32I_EXIT]] +// CHECK: _ZL15extract_v8acc32Dv32_u7__acc32i.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE4_I_I]], [[IF_THEN3_I_I]] ], [ [[SHUFFLE9_I_I]], [[IF_THEN8_I_I]] ], [ [[SHUFFLE10_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[RETVAL_0_I_I]] to <4 x i64> +// CHECK-NEXT: ret <4 x i64> [[TMP1]] +// +v8acc32 test_extract_v8acc32(v32acc32 a, int idx) { + return extract_v8acc32(a, idx); +} + +//! 
@name Insert 256-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <16 x i64> @_Z11test_insertDv32_u7__acc32iDv8_u7__acc32( +// CHECK-SAME: <16 x i64> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <4 x i64> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i64> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[B]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN5_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_U7__ACC32IDV8_U7__ACC32_EXIT:%.*]] +// CHECK: if.then5.i.i: +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_U7__ACC32IDV8_U7__ACC32_EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_U7__ACC32IDV8_U7__ACC32_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV32_U7__ACC32IDV8_U7__ACC32_EXIT]] +// CHECK: _ZL6insertDv32_u7__acc32iDv8_u7__acc32.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE2_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN5_I_I]] ], [ [[SHUFFLE11_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <16 x i64> +// CHECK-NEXT: ret <16 x i64> [[TMP2]] +// +v32acc32 test_insert(v32acc32 a, int idx, v8acc32 b) { + return insert(a, idx, b); +} + +//! 
@name Set 256-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <16 x i64> @_Z17test_set_v32acc32iDv8_u7__acc32( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <4 x i64> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i64> [[B]] to <8 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN4_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V32ACC32IDV8_U7__ACC32_EXIT:%.*]] +// CHECK: if.then4.i.i: +// CHECK-NEXT: [[SHUFFLE5_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE5_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V32ACC32IDV8_U7__ACC32_EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE11_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V32ACC32IDV8_U7__ACC32_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE13_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE14_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE13_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V32ACC32IDV8_U7__ACC32_EXIT]] +// CHECK: _ZL12set_v32acc32iDv8_u7__acc32.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN4_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE14_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <16 x i64> +// CHECK-NEXT: ret <16 x i64> [[TMP1]] +// +v32acc32 test_set_v32acc32(int idx, v8acc32 b) { return set_v32acc32(idx, b); } + +// CHECK-LABEL: define dso_local noundef <16 x i64> @_Z11test_concatDv8_u7__acc32S_S_S_( +// CHECK-SAME: <4 x i64> noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]], <4 x i64> noundef [[C:%.*]], <4 x i64> noundef [[D:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[C]], <4 x i64> [[D]], <8 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[TMP0]], <8 x i64> [[TMP1]], <16 x i32> +// CHECK-NEXT: ret <16 x i64> [[TMP2]] +// +v32acc32 test_concat(v8acc32 a, v8acc32 b, v8acc32 c, v8acc32 d) { + return concat(a, b, c, d); +} +// v4acc64 +//! 
@name Extract 256-bit portion from 1024-bit register +// CHECK-LABEL: define dso_local noundef <4 x i64> @_Z20test_extract_v4acc64Dv16_u7__acc64i( +// CHECK-SAME: <16 x i64> noundef [[A:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i64> [[A]] to <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN3_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN8_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V4ACC64DV16_U7__ACC64I_EXIT:%.*]] +// CHECK: if.then3.i.i: +// CHECK-NEXT: [[SHUFFLE4_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V4ACC64DV16_U7__ACC64I_EXIT]] +// CHECK: if.then8.i.i: +// CHECK-NEXT: [[SHUFFLE9_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V4ACC64DV16_U7__ACC64I_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE10_I_I:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <8 x i32> +// CHECK-NEXT: br label [[_ZL15EXTRACT_V4ACC64DV16_U7__ACC64I_EXIT]] +// CHECK: _ZL15extract_v4acc64Dv16_u7__acc64i.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <8 x i32> [ [[SHUFFLE_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE4_I_I]], [[IF_THEN3_I_I]] ], [ [[SHUFFLE9_I_I]], [[IF_THEN8_I_I]] ], [ [[SHUFFLE10_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[RETVAL_0_I_I]] to <4 x i64> +// CHECK-NEXT: ret <4 x i64> [[TMP1]] +// +v4acc64 test_extract_v4acc64(v16acc64 a, int idx) { + return extract_v4acc64(a, idx); +} + +//! 
@name Insert 256-bit in 1024-bit register +// CHECK-LABEL: define dso_local noundef <16 x i64> @_Z11test_insertDv16_u7__acc64iDv4_u7__acc64( +// CHECK-SAME: <16 x i64> noundef [[A:%.*]], i32 noundef [[IDX:%.*]], <4 x i64> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i64> [[A]] to <32 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[B]] to <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN5_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE2_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV16_U7__ACC64IDV4_U7__ACC64_EXIT:%.*]] +// CHECK: if.then5.i.i: +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV16_U7__ACC64IDV4_U7__ACC64_EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV16_U7__ACC64IDV4_U7__ACC64_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <32 x i32> [[SHUFFLE1_I_I]], <32 x i32> [[TMP0]], <32 x i32> +// CHECK-NEXT: br label [[_ZL6INSERTDV16_U7__ACC64IDV4_U7__ACC64_EXIT]] +// CHECK: _ZL6insertDv16_u7__acc64iDv4_u7__acc64.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE2_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN5_I_I]] ], [ [[SHUFFLE11_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <16 x i64> +// CHECK-NEXT: ret <16 x i64> [[TMP2]] +// +v16acc64 test_insert(v16acc64 a, int idx, v4acc64 b) { + return insert(a, idx, b); +} + +//! 
@name Set 256-bit portion of 1024-bit register +// CHECK-LABEL: define dso_local noundef <16 x i64> @_Z17test_set_v16acc64iDv4_u7__acc64( +// CHECK-SAME: i32 noundef [[IDX:%.*]], <4 x i64> noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i64> [[B]] to <8 x i32> +// CHECK-NEXT: [[REM_I_I:%.*]] = srem i32 [[IDX]], 4 +// CHECK-NEXT: switch i32 [[REM_I_I]], label [[IF_ELSE_I_I:%.*]] [ +// CHECK-NEXT: i32 0, label [[IF_THEN_I_I:%.*]] +// CHECK-NEXT: i32 1, label [[IF_THEN4_I_I:%.*]] +// CHECK-NEXT: i32 2, label [[IF_THEN10_I_I:%.*]] +// CHECK-NEXT: ] +// CHECK: if.then.i.i: +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE1_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V16ACC64IDV4_U7__ACC64_EXIT:%.*]] +// CHECK: if.then4.i.i: +// CHECK-NEXT: [[SHUFFLE5_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE6_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE5_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V16ACC64IDV4_U7__ACC64_EXIT]] +// CHECK: if.then10.i.i: +// CHECK-NEXT: [[SHUFFLE11_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE12_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE11_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V16ACC64IDV4_U7__ACC64_EXIT]] +// CHECK: if.else.i.i: +// CHECK-NEXT: [[SHUFFLE13_I_I:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <16 x i32> +// CHECK-NEXT: [[SHUFFLE14_I_I:%.*]] = shufflevector <16 x i32> [[SHUFFLE13_I_I]], <16 x i32> undef, <32 x i32> +// CHECK-NEXT: br label [[_ZL12SET_V16ACC64IDV4_U7__ACC64_EXIT]] +// CHECK: _ZL12set_v16acc64iDv4_u7__acc64.exit: +// CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi <32 x i32> [ [[SHUFFLE1_I_I]], [[IF_THEN_I_I]] ], [ [[SHUFFLE6_I_I]], [[IF_THEN4_I_I]] ], [ [[SHUFFLE12_I_I]], [[IF_THEN10_I_I]] ], [ [[SHUFFLE14_I_I]], [[IF_ELSE_I_I]] ] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i32> [[RETVAL_0_I_I]] to <16 x i64> +// CHECK-NEXT: ret <16 x i64> [[TMP1]] +// +v16acc64 test_set_v16acc64(int idx, v4acc64 b) { return set_v16acc64(idx, b); } + +// CHECK-LABEL: define dso_local noundef <16 x i64> @_Z11test_concatDv4_u7__acc64S_S_S_( +// CHECK-SAME: <4 x i64> noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]], <4 x i64> noundef [[C:%.*]], <4 x i64> noundef [[D:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[C]], <4 x i64> [[D]], <8 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[TMP0]], <8 x i64> [[TMP1]], <16 x i32> +// CHECK-NEXT: ret <16 x i64> [[TMP2]] +// +v16acc64 test_concat(v4acc64 a, v4acc64 b, v4acc64 c, v4acc64 d) { + return concat(a, b, c, d); +} diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index b2132562ac3f..2a0168635100 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -27,6 +27,7 @@ #include "llvm/CodeGenTypes/LowLevelType.h" #include "llvm/IR/InstrTypes.h" #include +#include namespace llvm { @@ -256,8 +257,21 @@ class CombinerHelper { /// concat_vectors. /// /// \pre MI.getOpcode() == G_SHUFFLE_VECTOR. 
-  bool matchCombineShuffleVector(MachineInstr &MI,
-                                 SmallVectorImpl<Register> &Ops);
+  using GeneratorType = std::function<std::optional<int32_t>()>;
+
+  bool matchCombineShuffleVector(MachineInstr &MI, GeneratorType Generator,
+                                 const size_t TargetDstSize);
+
+  /// Create G_UNMERGE_VALUES instructions until the source has reached a
+  /// target vector size.
+  ///
+  /// Requires that the destination fits evenly into the source register.
+  /// \p DestinationIndex selects which of the destination-sized slices of
+  /// the source is produced.
+  Register createUnmergeValue(MachineInstr &MI, const Register SrcReg,
+                              const Register DstReg, uint8_t DestinationIndex,
+                              const uint32_t Start, const uint32_t End);
+
   /// Replace \p MI with a concat_vectors with \p Ops.
   void applyCombineShuffleVector(MachineInstr &MI,
                                  const ArrayRef<Register> Ops);
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index ec7ca5dc8e2b..101534105116 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -42,6 +42,8 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetMachine.h"
 #include
+#include
+#include
 #include
 #include
@@ -371,7 +373,6 @@ void CombinerHelper::applyCombineShuffleConcat(MachineInstr &MI,
                                                SmallVector<Register> &Ops) {
   LLT SrcTy = MRI.getType(Ops[0]);
   Register UndefReg = 0;
-
   for (unsigned i = 0; i < Ops.size(); i++) {
     if (Ops[i] == 0) {
       if (UndefReg == 0)
@@ -384,17 +385,265 @@ void CombinerHelper::applyCombineShuffleConcat(MachineInstr &MI,
   MI.eraseFromParent();
 }
 
+// Create a generator that yields From, From + StepSize, ..., To and then
+// signals exhaustion by returning an empty optional.
+CombinerHelper::GeneratorType
+adderGenerator(const int32_t From, const int32_t To, const int32_t StepSize) {
+  int32_t Counter = From;
+  return [Counter, To, StepSize]() mutable {
+    std::optional<int32_t> OldCount = std::optional(Counter);
+    Counter += StepSize;
+    if (OldCount == (To + StepSize))
+      OldCount = {};
+    return OldCount;
+  };
+}
+
+// Move to the next generator once the current one is exhausted, allowing
+// generators to be chained.
+std::function<std::optional<int32_t>()> concatGenerators(
+    SmallVector<std::function<std::optional<int32_t>()>> Generators) {
+  auto *GeneratorIterator = Generators.begin();
+
+  return [GeneratorIterator, Generators]() mutable {
+    std::optional<int32_t> GenValue = (*GeneratorIterator)();
+    if (!GenValue.has_value() && GeneratorIterator != Generators.end()) {
+      GeneratorIterator++;
+      GenValue = (*GeneratorIterator)();
+    }
+    return GenValue;
+  };
+}
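// ---- Editorial aside (not part of the patch) ------------------------------
// A minimal, self-contained sketch of the generator pattern above, assuming
// the stripped template arguments are std::optional<int32_t>. makeAdder and
// chainGens are illustrative stand-ins for adderGenerator/concatGenerators;
// the names and the exact control flow are not the patch's.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <optional>
#include <vector>

using Gen = std::function<std::optional<int32_t>()>;

// Yield From, From + Step, ..., To, then an empty optional forever after.
static Gen makeAdder(int32_t From, int32_t To, int32_t Step) {
  int32_t Counter = From;
  return [Counter, To, Step]() mutable -> std::optional<int32_t> {
    if (Counter > To)
      return std::nullopt;
    int32_t Value = Counter;
    Counter += Step;
    return Value;
  };
}

// Drain one generator completely, then move on to the next one.
static Gen chainGens(std::vector<Gen> Gens) {
  std::size_t Index = 0;
  return [Index, Gens]() mutable -> std::optional<int32_t> {
    while (Index < Gens.size()) {
      if (std::optional<int32_t> V = Gens[Index]())
        return V;
      ++Index;
    }
    return std::nullopt;
  };
}

int main() {
  // The kind of index stream handed to matchCombineShuffleVector below:
  // {0, 1, 2, 3, 8, 9, 10, 11}, i.e. the first half of operand A followed by
  // the first half of operand B for 8-element sources.
  Gen G = chainGens({makeAdder(0, 3, 1), makeAdder(8, 11, 1)});
  while (std::optional<int32_t> V = G())
    std::cout << *V << ' ';
  std::cout << '\n'; // prints: 0 1 2 3 8 9 10 11
}
// ---------------------------------------------------------------------------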
DstTy.getNumElements() : 1; + const uint32_t HalfWay = Start + ((End - Start) / 2); + const uint32_t Position = DestinationIndex * DstNumElements; + + uint32_t NextStart, NextEnd; + if (Position < HalfWay) { + Builder.buildInstr(TargetOpcode::G_UNMERGE_VALUES, {TargetReg, TmpReg}, + {SrcReg}); + NextStart = Start; + NextEnd = HalfWay; + } else { + Builder.buildInstr(TargetOpcode::G_UNMERGE_VALUES, {TmpReg, TargetReg}, + {SrcReg}); + NextStart = HalfWay; + NextEnd = End; + } + + if (HalfSizeTy.isVector() && DstTy != HalfSizeTy) + return createUnmergeValue(MI, TargetReg, DstReg, DestinationIndex, + NextStart, NextEnd); + + return DstReg; +} + bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) { + const Register DstReg = MI.getOperand(0).getReg(); + const Register SrcReg1 = MI.getOperand(1).getReg(); + const Register SrcReg2 = MI.getOperand(2).getReg(); + + const LLT DstTy = MRI.getType(DstReg); + const LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + + const unsigned DstNumElts = DstTy.isVector() ? DstTy.getNumElements() : 1; + const unsigned SrcNumElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1; + + // This test is a bit silly, but it is required because some tests rely on + // the legalizer changing the type of the shufflevector. + if (DstTy.getScalarSizeInBits() == 1) + return false; + + // {1, 2, ..., n} -> G_CONCAT_VECTORS + // Turns a shuffle vector whose mask only increments into a concat vectors + // instruction. + GeneratorType CountUp = adderGenerator(0, DstNumElts - 1, 1); SmallVector Ops; - if (matchCombineShuffleVector(MI, Ops)) { + + if (matchCombineShuffleVector(MI, CountUp, 2 * SrcNumElts)) { + // The shuffle is concatenating multiple vectors together. + // Collect the different operands for that. + Register UndefReg; + const Register Src1 = MI.getOperand(1).getReg(); + const Register Src2 = MI.getOperand(2).getReg(); + const ArrayRef Mask = MI.getOperand(3).getShuffleMask(); + + // The destination can be longer than the source, so we separate the mask into + // equal blocks and check each block to see if one of the sources can be + // copied whole. + unsigned NumConcat = DstNumElts / SrcNumElts; + unsigned Index = 0; + for (unsigned Concat = 0; Concat < NumConcat; Concat++) { + unsigned Target = (Concat + 1) * SrcNumElts; + while (Index < Target) { + int MaskElt = Mask[Index]; + if (MaskElt >= 0) { + Ops.push_back((MaskElt < (int)SrcNumElts) ? Src1 : Src2); + break; + } + Index++; + } + + if (Index == Target) { + if (!UndefReg) { + Builder.setInsertPt(*MI.getParent(), MI); + UndefReg = Builder.buildUndef(SrcTy).getReg(0); + } + Ops.push_back(UndefReg); + } + + Index = Target; + } + + applyCombineShuffleVector(MI, Ops); return true; } + + // {1, 2, ..., |DstVector|} -> G_UNMERGE_VALUES + // Extracts the first chunk of the same size as the destination vector from + // the source. + GeneratorType FirstQuarter = adderGenerator(0, DstNumElts - 1, 1); + if (matchCombineShuffleVector(MI, FirstQuarter, DstNumElts - 1)) { + // This optimization does not work if the target type is not a power of two; + // this can happen in some backends that support uneven vector types. We + // also need to make sure that the vector can be split into two. + if (SrcTy == DstTy || ((SrcNumElts / 2) % 2) != 0 || + SrcNumElts % DstNumElts != 0) + return false; + ArrayRef Mask = MI.getOperand(3).getShuffleMask(); + const Register TargetReg = Mask[0] < (int)SrcNumElts ?
SrcReg1 : SrcReg2; + createUnmergeValue(MI, TargetReg, DstReg, 0, 0, SrcNumElts); + MI.eraseFromParent(); + return true; + } + + // {|DstVector|, |DstVector|+1, ..., 2 * |DstVector|} -> G_UNMERGE_VALUES + // Extracts the second chunk of the same size as the destination vector from + // the source. + GeneratorType SecondQuarter = + adderGenerator(DstNumElts, (DstNumElts * 2) - 1, 1); + if (matchCombineShuffleVector(MI, SecondQuarter, DstNumElts - 1)) { + if (((SrcNumElts / 2) % 2) != 0 || SrcNumElts % DstNumElts != 0) + return false; + ArrayRef Mask = MI.getOperand(3).getShuffleMask(); + const Register TargetReg = Mask[0] < (int)SrcNumElts ? SrcReg1 : SrcReg2; + createUnmergeValue(MI, TargetReg, DstReg, 1, 0, SrcNumElts); + MI.eraseFromParent(); + return true; + } + + // After this point, it is assumed our shuffle vectors work on vectors that + // can be split in two. + if ((DstNumElts % 2) != 0) + return false; + + // {1, 2, ..., n/4, n/2, n/2+1, ..., 3n/4} -> G_UNMERGE_VALUES + // Take the first halves of the two vectors and concatenate them into one + // vector. + GeneratorType FirstEightA = adderGenerator(0, (DstNumElts / 2) - 1, 1); + GeneratorType FirstEightB = + adderGenerator(DstNumElts, DstNumElts + (DstNumElts / 2) - 1, 1); + + GeneratorType FirstAndThird = + concatGenerators(SmallVector{FirstEightA, FirstEightB}); + if (matchCombineShuffleVector(MI, FirstAndThird, (DstNumElts / 2) - 1)) { + if (DstNumElts <= 2) + return false; + const Register DstReg = MI.getOperand(0).getReg(); + const LLT HalfSrcTy = + LLT::fixed_vector(SrcNumElts / 2, SrcTy.getScalarType()); + const Register HalfOfA = createUnmergeValue( + MI, MI.getOperand(1).getReg(), + MRI.createGenericVirtualRegister(HalfSrcTy), 0, 0, SrcNumElts); + const Register HalfOfB = createUnmergeValue( + MI, MI.getOperand(2).getReg(), + MRI.createGenericVirtualRegister(HalfSrcTy), 0, 0, SrcNumElts); + + const ArrayRef Mask = MI.getOperand(3).getShuffleMask(); + if (Mask[0] <= 0) { + Builder.buildMergeLikeInstr(DstReg, {HalfOfA, HalfOfB}); + } else { + Builder.buildMergeLikeInstr(DstReg, {HalfOfB, HalfOfA}); + } + + MI.eraseFromParent(); + return true; + } + + // {n/2, n/2+1, ..., n, 0, 1, ..., n/2-1} + GeneratorType FirstHalf = adderGenerator(0, SrcNumElts / 2, 1); + GeneratorType SecondHalf = adderGenerator(SrcNumElts / 2, SrcNumElts, 1); + GeneratorType Reverse = + concatGenerators(SmallVector{FirstHalf, SecondHalf}); + + if (matchCombineShuffleVector(MI, Reverse, SrcNumElts)) { + // The shuffle is concatenating multiple vectors together. + // Collect the different operands for that. + Register UndefReg; + const Register Src1 = MI.getOperand(1).getReg(); + const Register Src2 = MI.getOperand(2).getReg(); + const ArrayRef Mask = MI.getOperand(3).getShuffleMask(); + + // The destination can be longer than the source, so we separate the mask into + // equal blocks and check each block to see if one of the sources can be + // copied whole. + unsigned NumConcat = DstNumElts / SrcNumElts; + unsigned Index = 0; + for (unsigned Concat = 0; Concat < NumConcat; Concat++) { + unsigned Target = (Concat + 1) * SrcNumElts; + while (Index < Target) { + int MaskElt = Mask[Index]; + if (MaskElt >= 0) { + Ops.push_back((MaskElt < (int)SrcNumElts) ?
Src1 : Src2); + break; + } + Index++; + } + + if (Index == Target) { + if (!UndefReg) { + Builder.setInsertPt(*MI.getParent(), MI); + UndefReg = Builder.buildUndef(SrcTy).getReg(0); + } + Ops.push_back(UndefReg); + } + + Index = Target; + } + applyCombineShuffleVector(MI, {Ops[1], Ops[0]}); + return true; + } + + return false; } bool CombinerHelper::matchCombineShuffleVector(MachineInstr &MI, - SmallVectorImpl &Ops) { + GeneratorType Generator, + const size_t TargetDstSize) { assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR && "Invalid instruction kind"); LLT DstType = MRI.getType(MI.getOperand(0).getReg()); @@ -421,51 +670,24 @@ bool CombinerHelper::matchCombineShuffleVector(MachineInstr &MI, // // TODO: If the size between the source and destination don't match // we could still emit an extract vector element in that case. - if (DstNumElts < 2 * SrcNumElts && DstNumElts != 1) + if ((DstNumElts < TargetDstSize) && DstNumElts != 1) return false; - // Check that the shuffle mask can be broken evenly between the - // different sources. - if (DstNumElts % SrcNumElts != 0) - return false; - - // Mask length is a multiple of the source vector length. - // Check if the shuffle is some kind of concatenation of the input - // vectors. - unsigned NumConcat = DstNumElts / SrcNumElts; - SmallVector ConcatSrcs(NumConcat, -1); ArrayRef Mask = MI.getOperand(3).getShuffleMask(); for (unsigned i = 0; i != DstNumElts; ++i) { int Idx = Mask[i]; + const int32_t ShiftIndex = Generator().value_or(-1); + // Undef value. - if (Idx < 0) + if (Idx < 0 || ShiftIndex < 0) continue; + + // Ensure the indices in each SrcType sized piece are sequential and that + // the same source is used for the whole piece. - if ((Idx % SrcNumElts != (i % SrcNumElts)) || - (ConcatSrcs[i / SrcNumElts] >= 0 && - ConcatSrcs[i / SrcNumElts] != (int)(Idx / SrcNumElts))) + if ((Idx % SrcNumElts != (ShiftIndex % SrcNumElts))) return false; - // Remember which source this index came from. - ConcatSrcs[i / SrcNumElts] = Idx / SrcNumElts; } - // The shuffle is concatenating multiple vectors together. - // Collect the different operands for that.
- Register UndefReg; - Register Src2 = MI.getOperand(2).getReg(); - for (auto Src : ConcatSrcs) { - if (Src < 0) { - if (!UndefReg) { - Builder.setInsertPt(*MI.getParent(), MI); - UndefReg = Builder.buildUndef(SrcType).getReg(0); - } - Ops.push_back(UndefReg); - } else if (Src == 0) - Ops.push_back(Src1); - else - Ops.push_back(Src2); - } return true; } diff --git a/llvm/lib/Target/AIE/AIE2PreLegalizerCombiner.cpp b/llvm/lib/Target/AIE/AIE2PreLegalizerCombiner.cpp index 37865902ad13..0c3328ed146c 100644 --- a/llvm/lib/Target/AIE/AIE2PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AIE/AIE2PreLegalizerCombiner.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/IR/IntrinsicsAIE2.h" #include "llvm/InitializePasses.h" @@ -62,9 +63,7 @@ class AIE2PreLegalizerCombinerImpl : public Combiner { const LegalizerInfo *LI); static const char *getName() { return "AIE2PreLegalizerCombiner"; } - bool tryCombineAll(MachineInstr &I) const override; - bool tryCombineAllImpl(MachineInstr &I) const; bool tryToCombineVectorShiftsByZero(MachineInstr &MI) const; @@ -167,6 +166,9 @@ bool AIE2PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const { case TargetOpcode::G_INTRINSIC: { return tryToCombineIntrinsic(MI); } + case TargetOpcode::G_SHUFFLE_VECTOR: { + return Helper.tryCombineShuffleVector(MI); + } default: break; } diff --git a/llvm/lib/Target/AIE/AIECombine.td b/llvm/lib/Target/AIE/AIECombine.td index 5b747b0e07fa..03e873e21b18 100644 --- a/llvm/lib/Target/AIE/AIECombine.td +++ b/llvm/lib/Target/AIE/AIECombine.td @@ -63,7 +63,7 @@ def AIE2PreLegalizerCombiner combine_globalval_offset, combine_extract_vector_elt_and_zsa_ext, combine_splat_vector ]> { - let CombineAllMethodName = "tryCombineAllImpl"; + let CombineAllMethodName = "tryCombineAllImpl"; } def AIE2PostLegalizerGenericCombiner diff --git a/llvm/lib/Target/AIE/AIELegalizerInfo.cpp b/llvm/lib/Target/AIE/AIELegalizerInfo.cpp index 9f00e1a33a60..8c456c3e2f08 100644 --- a/llvm/lib/Target/AIE/AIELegalizerInfo.cpp +++ b/llvm/lib/Target/AIE/AIELegalizerInfo.cpp @@ -505,6 +505,15 @@ AIELegalizerInfo::AIELegalizerInfo(const AIEBaseSubtarget &ST) { .clampMaxNumElements(0, S16, 32) .clampMaxNumElements(0, S32, 16) .custom(); + + getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) + .unsupportedIf(IsNotValidDestinationVector) + // Checks whether the shuffle is "canonical"; this enables additional + // combines in the LLVM combiner and can change how shuffle vectors are legalized + .lowerIf([=](const LegalityQuery &Query) { + return Query.Types[0] == Query.Types[1]; + }) + .lower(); } getActionDefinitionsBuilder(G_JUMP_TABLE).custom(); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-shufflevector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shufflevector.mir index 0de989f8be75..b515593e5c4a 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-shufflevector.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shufflevector.mir @@ -101,7 +101,9 @@ body: | ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS %a(<4 x s8>), %b(<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>) ; CHECK-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS %c(<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>) - ; CHECK-NEXT: %z:_(<16 x s8>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<16 x
s8>), [[CONCAT_VECTORS1]], shufflemask(0, undef, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, undef, undef, undef, undef) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s8>), [[UV1:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s8>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s8>), [[UV3:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<16 x s8>) + ; CHECK-NEXT: %z:_(<16 x s8>) = G_CONCAT_VECTORS [[UV]](<8 x s8>), [[UV2]](<8 x s8>) ; CHECK-NEXT: $q0 = COPY %z(<16 x s8>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %p1:_(p0) = COPY $x0 @@ -179,7 +181,9 @@ body: | ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS %a(<4 x s8>), %b(<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>) ; CHECK-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS %c(<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>) - ; CHECK-NEXT: %z:_(<16 x s8>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<16 x s8>), [[CONCAT_VECTORS1]], shufflemask(undef, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, undef, undef, undef, undef) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s8>), [[UV1:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s8>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s8>), [[UV3:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<16 x s8>) + ; CHECK-NEXT: %z:_(<16 x s8>) = G_CONCAT_VECTORS [[UV]](<8 x s8>), [[UV2]](<8 x s8>) ; CHECK-NEXT: $q0 = COPY %z(<16 x s8>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %p1:_(p0) = COPY $x0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir index 2c9ae5b06b62..58b1a5ec7602 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir @@ -270,8 +270,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1 - ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[COPY1]], shufflemask(4, 5, 0, 1) - ; CHECK-NEXT: RET_ReallyLR implicit [[SHUF]](<4 x s32>) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<2 x s32>), [[UV3:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[UV2]](<2 x s32>), [[UV]](<2 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit [[CONCAT_VECTORS]](<4 x s32>) %0:_(<4 x s32>) = COPY $q0 %1:_(<4 x s32>) = COPY $q1 %2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %1(<4 x s32>), shufflemask(4,5,0,1) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-undef.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-undef.mir index 7db4526ea070..5bf0f6c6186b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-undef.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-undef.mir @@ -226,7 +226,6 @@ body: | %0:_(<2 x s32>) = G_SHUFFLE_VECTOR %1(<2 x s32>), %2(<2 x s32>), shufflemask(0, 1) $d0 = COPY %0(<2 x s32>) RET_ReallyLR implicit $d0 - ... --- name: shl_undef_rhs @@ -305,7 +304,6 @@ alignment: 4 tracksRegLiveness: true body: | bb.0: - ; Optimize these to zero? 
; CHECK-LABEL: name: ashr_undef_lhs ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll index 749d6071c98d..cab45c64398a 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -1776,19 +1776,10 @@ entry: } define <16 x i8> @test_concat_v16i8_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) #0 { -; CHECK-SD-LABEL: test_concat_v16i8_v16i8_v16i8: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_concat_v16i8_v16i8_v16i8: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: adrp x8, .LCPI126_0 -; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI126_0] -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_concat_v16i8_v16i8_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecinit30 = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> ret <16 x i8> %vecinit30 @@ -1803,9 +1794,7 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v16i8_v8i8_v16i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: adrp x8, .LCPI127_0 -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov b2, v0.b[1] ; CHECK-GI-NEXT: mov b3, v0.b[2] ; CHECK-GI-NEXT: mov b4, v0.b[3] @@ -1814,14 +1803,13 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 { ; CHECK-GI-NEXT: mov b7, v0.b[6] ; CHECK-GI-NEXT: mov b16, v0.b[7] ; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI127_0] ; CHECK-GI-NEXT: mov v0.b[2], v3.b[0] ; CHECK-GI-NEXT: mov v0.b[3], v4.b[0] ; CHECK-GI-NEXT: mov v0.b[4], v5.b[0] ; CHECK-GI-NEXT: mov v0.b[5], v6.b[0] ; CHECK-GI-NEXT: mov v0.b[6], v7.b[0] ; CHECK-GI-NEXT: mov v0.b[7], v16.b[0] -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <8 x i8> %x, i32 0 @@ -1999,19 +1987,10 @@ entry: } define <8 x i16> @test_concat_v8i16_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) #0 { -; CHECK-SD-LABEL: test_concat_v8i16_v8i16_v8i16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_concat_v8i16_v8i16_v8i16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: adrp x8, .LCPI130_0 -; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI130_0] -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_concat_v8i16_v8i16_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecinit14 = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> ret <8 x i16> %vecinit14 @@ -2026,17 +2005,14 @@ define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v8i16_v4i16_v8i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 
killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: adrp x8, .LCPI131_0 -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov h2, v0.h[1] ; CHECK-GI-NEXT: mov h3, v0.h[2] ; CHECK-GI-NEXT: mov h4, v0.h[3] ; CHECK-GI-NEXT: mov v0.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI131_0] ; CHECK-GI-NEXT: mov v0.h[2], v3.h[0] ; CHECK-GI-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <4 x i16> %x, i32 0 @@ -2142,19 +2118,10 @@ entry: } define <4 x i32> @test_concat_v4i32_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) #0 { -; CHECK-SD-LABEL: test_concat_v4i32_v4i32_v4i32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_concat_v4i32_v4i32_v4i32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: adrp x8, .LCPI134_0 -; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI134_0] -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_concat_v4i32_v4i32_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecinit6 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> ret <4 x i32> %vecinit6 @@ -2169,13 +2136,10 @@ define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v4i32_v2i32_v4i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: adrp x8, .LCPI135_0 -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov s2, v0.s[1] ; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI135_0] -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <2 x i32> %x, i32 0 diff --git a/llvm/test/CodeGen/AArch64/ext-narrow-index.ll b/llvm/test/CodeGen/AArch64/ext-narrow-index.ll index 2c5d33da93c8..db8250db4320 100644 --- a/llvm/test/CodeGen/AArch64/ext-narrow-index.ll +++ b/llvm/test/CodeGen/AArch64/ext-narrow-index.ll @@ -42,8 +42,7 @@ define <8 x i8> @i8_off8(<16 x i8> %arg1, <16 x i8> %arg2) { ; ; CHECK-GISEL-LABEL: i8_off8: ; CHECK-GISEL: // %bb.0: // %entry -; CHECK-GISEL-NEXT: ext v0.16b, v0.16b, v1.16b, #8 -; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GISEL-NEXT: mov d0, v0.d[1] ; CHECK-GISEL-NEXT: ret entry: %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> %arg2, <8 x i32> @@ -254,9 +253,7 @@ define <8 x i8> @i8_zero_off8(<16 x i8> %arg1) { ; ; CHECK-GISEL-LABEL: i8_zero_off8: ; CHECK-GISEL: // %bb.0: // %entry -; CHECK-GISEL-NEXT: movi v1.2d, #0000000000000000 -; CHECK-GISEL-NEXT: ext v0.16b, v0.16b, v1.16b, #8 -; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GISEL-NEXT: mov d0, v0.d[1] ; CHECK-GISEL-NEXT: ret entry: %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> zeroinitializer, <8 x i32> diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 3254c5ebe9c6..42c68883351f 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ 
-3744,17 +3744,13 @@ define i32 @add_pair_v8i16_v4i32_double_sext_zext_shuffle(<8 x i16> %ax, <8 x i1 ; CHECK-GI-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0 ; CHECK-GI-NEXT: ushll v5.4s, v1.4h, #0 -; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0 ; CHECK-GI-NEXT: ushll v6.4s, v2.4h, #0 -; CHECK-GI-NEXT: ushll2 v2.4s, v2.8h, #0 ; CHECK-GI-NEXT: ushll v7.4s, v3.4h, #0 -; CHECK-GI-NEXT: ushll2 v3.4s, v3.8h, #0 -; CHECK-GI-NEXT: add v0.4s, v4.4s, v0.4s -; CHECK-GI-NEXT: add v1.4s, v5.4s, v1.4s -; CHECK-GI-NEXT: add v2.4s, v6.4s, v2.4s -; CHECK-GI-NEXT: add v3.4s, v7.4s, v3.4s +; CHECK-GI-NEXT: uaddw2 v0.4s, v4.4s, v0.8h +; CHECK-GI-NEXT: uaddw2 v1.4s, v5.4s, v1.8h +; CHECK-GI-NEXT: uaddw2 v2.4s, v6.4s, v2.8h +; CHECK-GI-NEXT: uaddw2 v3.4s, v7.4s, v3.8h ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s diff --git a/llvm/test/CodeGen/AIE/GlobalISel/xfail-legalize-shufflevector.mir b/llvm/test/CodeGen/AIE/GlobalISel/xfail-legalize-shufflevector.mir index 71cd84871cfc..9a03c03f8fa6 100644 --- a/llvm/test/CodeGen/AIE/GlobalISel/xfail-legalize-shufflevector.mir +++ b/llvm/test/CodeGen/AIE/GlobalISel/xfail-legalize-shufflevector.mir @@ -30,39 +30,6 @@ body: | PseudoRET implicit $lr ... ---- -name: f_32x8 -body: | - bb.0: - %1:_(<32 x s8>) = G_IMPLICIT_DEF - %2:_(p0) = G_IMPLICIT_DEF - %0:_(<32 x s8>) = G_SHUFFLE_VECTOR %1(<32 x s8>), %1, shufflemask(undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef) - G_STORE %0(<32 x s8>), %2(p0) :: (store (<32 x s8>) into `<32 x i8>* undef`, align 2) - PseudoRET implicit $lr -... - ---- -name: f_16x16 -body: | - bb.0: - %1:_(<16 x s16>) = G_IMPLICIT_DEF - %2:_(p0) = G_IMPLICIT_DEF - %0:_(<16 x s16>) = G_SHUFFLE_VECTOR %1(<16 x s16>), %1, shufflemask(undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef) - G_STORE %0(<16 x s16>), %2(p0) :: (store (<16 x s16>) into `<16 x i16>* undef`, align 2) - PseudoRET implicit $lr -... - ---- -name: f_32x16 -body: | - bb.0: - %1:_(<32 x s16>) = G_IMPLICIT_DEF - %2:_(p0) = G_IMPLICIT_DEF - %0:_(<32 x s16>) = G_SHUFFLE_VECTOR %1(<32 x s16>), %1, shufflemask(undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef, undef) - G_STORE %0(<32 x s16>), %2(p0) :: (store (<32 x s16>) into `<32 x i16>* undef`, align 2) - PseudoRET implicit $lr -... - --- name: f_2x64 body: | @@ -72,4 +39,3 @@ body: | %0:_(<2 x s64>) = G_SHUFFLE_VECTOR %1(<2 x s64>), %1, shufflemask(undef, undef) G_STORE %0(<2 x s64>), %2(p0) :: (store (<2 x s64>) into `<2 x i64>* undef`, align 2) PseudoRET implicit $lr -... diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/legalize-shuffle-vector.mir new file mode 100644 index 000000000000..b8a4a91b3c97 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/legalize-shuffle-vector.mir @@ -0,0 +1,640 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. 
+# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates +# RUN: llc -mtriple aie2 -run-pass=legalizer %s -verify-machineinstrs -o - | FileCheck %s + +--- +name: test_shuffle_vec_256_32bit +body: | + bb.0: + liveins: $r6 + ; CHECK-LABEL: name: test_shuffle_vec_256_32bit + ; CHECK: liveins: $r6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT1:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C1]](s32) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT2:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C2]](s32) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT3:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C3]](s32) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT4:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C4]](s32) + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT5:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C5]](s32) + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT6:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C6]](s32) + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT7:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C7]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT8:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT9:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C1]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT10:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C2]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT11:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C3]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT12:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C4]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT13:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C5]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT14:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C6]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT15:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C7]](s32) + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[DEF1]], [[AIE_SEXT_EXTRACT_VECTOR_ELT15]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT1:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT]], [[AIE_SEXT_EXTRACT_VECTOR_ELT14]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT2:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT1]], 
[[AIE_SEXT_EXTRACT_VECTOR_ELT13]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT3:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT2]], [[AIE_SEXT_EXTRACT_VECTOR_ELT12]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT4:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT3]], [[AIE_SEXT_EXTRACT_VECTOR_ELT11]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT5:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT4]], [[AIE_SEXT_EXTRACT_VECTOR_ELT10]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT6:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT5]], [[AIE_SEXT_EXTRACT_VECTOR_ELT9]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT7:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT6]], [[AIE_SEXT_EXTRACT_VECTOR_ELT8]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT8:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT7]], [[AIE_SEXT_EXTRACT_VECTOR_ELT7]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT9:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT8]], [[AIE_SEXT_EXTRACT_VECTOR_ELT6]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT10:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT9]], [[AIE_SEXT_EXTRACT_VECTOR_ELT5]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT11:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT10]], [[AIE_SEXT_EXTRACT_VECTOR_ELT4]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT12:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT11]], [[AIE_SEXT_EXTRACT_VECTOR_ELT3]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT13:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT12]], [[AIE_SEXT_EXTRACT_VECTOR_ELT2]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT14:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT13]], [[AIE_SEXT_EXTRACT_VECTOR_ELT1]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT15:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT14]], [[AIE_SEXT_EXTRACT_VECTOR_ELT]](s32) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_ADD_VECTOR_ELT_LEFT15]](<16 x s32>) + %0:_(<8 x s32>) = G_IMPLICIT_DEF + %1:_(<16 x s32>) = G_SHUFFLE_VECTOR %0(<8 x s32>), %0(<8 x s32>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) + PseudoRET implicit $lr, implicit %1 +... 
+ +--- +name: test_shuffle_vec_256_to_512_16bit +body: | + bb.0: + liveins: $r6 + ; CHECK-LABEL: name: test_shuffle_vec_256_to_512_16bit + ; CHECK: liveins: $r6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<16 x s16>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<16 x s16>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s16>), [[C]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT]], 16 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT1:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s16>), [[C1]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT1:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT1]], 16 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT2:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s16>), [[C2]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT2:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT2]], 16 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT3:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s16>), [[C3]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT3:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT3]], 16 + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT4:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s16>), [[C4]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT4:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT4]], 16 + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT5:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s16>), [[C5]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT5:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT5]], 16 + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT6:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s16>), [[C6]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT6:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT6]], 16 + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT7:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s16>), [[C7]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT7:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT7]], 16 + ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT8:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s16>), [[C8]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT8:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT8]], 16 + ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 9 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT9:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s16>), [[C9]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT9:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT9]], 16 + ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT10:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s16>), [[C10]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT10:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT10]], 16 + ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 11 + ; CHECK-NEXT: 
[[AIE_SEXT_EXTRACT_VECTOR_ELT11:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s16>), [[C11]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT11:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT11]], 16 + ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT12:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s16>), [[C12]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT12:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT12]], 16 + ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 13 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT13:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s16>), [[C13]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT13:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT13]], 16 + ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 14 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT14:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s16>), [[C14]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT14:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT14]], 16 + ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT15:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s16>), [[C15]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT15:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT15]], 16 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT16:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<16 x s16>), [[C]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT16:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT16]], 16 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT17:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<16 x s16>), [[C1]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT17:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT17]], 16 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT18:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<16 x s16>), [[C2]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT18:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT18]], 16 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT19:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<16 x s16>), [[C3]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT19:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT19]], 16 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT20:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<16 x s16>), [[C4]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT20:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT20]], 16 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT21:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<16 x s16>), [[C5]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT21:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT21]], 16 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT22:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<16 x s16>), [[C6]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT22:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT22]], 16 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT23:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<16 x s16>), [[C7]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT23:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT23]], 16 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT24:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<16 x s16>), [[C8]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT24:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT24]], 16 + ; CHECK-NEXT: 
[[AIE_SEXT_EXTRACT_VECTOR_ELT25:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<16 x s16>), [[C9]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT25:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT25]], 16 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT26:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<16 x s16>), [[C10]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT26:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT26]], 16 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT27:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<16 x s16>), [[C11]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT27:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT27]], 16 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT28:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<16 x s16>), [[C12]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT28:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT28]], 16 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT29:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<16 x s16>), [[C13]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT29:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT29]], 16 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT30:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<16 x s16>), [[C14]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT30:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT30]], 16 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT31:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<16 x s16>), [[C15]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT31:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT31]], 16 + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(<32 x s16>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[DEF2]], [[ASSERT_SEXT31]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT1:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT]], [[ASSERT_SEXT30]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT2:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT1]], [[ASSERT_SEXT29]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT3:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT2]], [[ASSERT_SEXT28]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT4:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT3]], [[ASSERT_SEXT27]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT5:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT4]], [[ASSERT_SEXT26]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT6:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT5]], [[ASSERT_SEXT25]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT7:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT6]], [[ASSERT_SEXT24]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT8:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT7]], [[ASSERT_SEXT23]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT9:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT8]], [[ASSERT_SEXT22]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT10:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT9]], [[ASSERT_SEXT21]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT11:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT10]], [[ASSERT_SEXT20]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT12:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT 
[[AIE_ADD_VECTOR_ELT_LEFT11]], [[ASSERT_SEXT19]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT13:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT12]], [[ASSERT_SEXT18]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT14:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT13]], [[ASSERT_SEXT17]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT15:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT14]], [[ASSERT_SEXT16]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT16:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT15]], [[ASSERT_SEXT15]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT17:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT16]], [[ASSERT_SEXT14]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT18:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT17]], [[ASSERT_SEXT13]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT19:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT18]], [[ASSERT_SEXT12]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT20:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT19]], [[ASSERT_SEXT11]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT21:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT20]], [[ASSERT_SEXT10]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT22:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT21]], [[ASSERT_SEXT9]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT23:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT22]], [[ASSERT_SEXT8]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT24:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT23]], [[ASSERT_SEXT7]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT25:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT24]], [[ASSERT_SEXT6]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT26:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT25]], [[ASSERT_SEXT5]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT27:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT26]], [[ASSERT_SEXT4]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT28:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT27]], [[ASSERT_SEXT3]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT29:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT28]], [[ASSERT_SEXT2]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT30:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT29]], [[ASSERT_SEXT1]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT31:%[0-9]+]]:_(<32 x s16>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT30]], [[ASSERT_SEXT]](s32) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_ADD_VECTOR_ELT_LEFT31]](<32 x s16>) + %0:_(<16 x s16>) = G_IMPLICIT_DEF + %1:_(<16 x s16>) = G_IMPLICIT_DEF + %2:_(<32 x s16>) = G_SHUFFLE_VECTOR %0(<16 x s16>), %1(<16 x s16>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31) + PseudoRET implicit $lr, implicit %2 +... 
+ +--- +name: test_shuffle_vec_256_to_512_8bit +body: | + bb.0: + liveins: $r6 + ; CHECK-LABEL: name: test_shuffle_vec_256_to_512_8bit + ; CHECK: liveins: $r6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<32 x s8>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<32 x s8>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT]], 8 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT1:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C1]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT1:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT1]], 8 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT2:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C2]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT2:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT2]], 8 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT3:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C3]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT3:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT3]], 8 + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT4:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C4]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT4:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT4]], 8 + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT5:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C5]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT5:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT5]], 8 + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT6:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C6]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT6:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT6]], 8 + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT7:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C7]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT7:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT7]], 8 + ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT8:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C8]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT8:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT8]], 8 + ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 9 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT9:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C9]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT9:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT9]], 8 + ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT10:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C10]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT10:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT10]], 8 + ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 11 + ; CHECK-NEXT: 
[[AIE_SEXT_EXTRACT_VECTOR_ELT11:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C11]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT11:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT11]], 8 + ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT12:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C12]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT12:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT12]], 8 + ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 13 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT13:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C13]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT13:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT13]], 8 + ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 14 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT14:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C14]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT14:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT14]], 8 + ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT15:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C15]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT15:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT15]], 8 + ; CHECK-NEXT: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT16:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C16]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT16:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT16]], 8 + ; CHECK-NEXT: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 17 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT17:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C17]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT17:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT17]], 8 + ; CHECK-NEXT: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 18 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT18:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C18]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT18:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT18]], 8 + ; CHECK-NEXT: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 19 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT19:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C19]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT19:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT19]], 8 + ; CHECK-NEXT: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT20:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C20]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT20:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT20]], 8 + ; CHECK-NEXT: [[C21:%[0-9]+]]:_(s32) = G_CONSTANT i32 21 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT21:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C21]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT21:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT21]], 8 + ; CHECK-NEXT: [[C22:%[0-9]+]]:_(s32) = G_CONSTANT i32 22 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT22:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C22]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT22:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT22]], 8 + ; CHECK-NEXT: [[C23:%[0-9]+]]:_(s32) = G_CONSTANT i32 23 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT23:%[0-9]+]]:_(s32) = 
G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C23]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT23:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT23]], 8 + ; CHECK-NEXT: [[C24:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT24:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C24]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT24:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT24]], 8 + ; CHECK-NEXT: [[C25:%[0-9]+]]:_(s32) = G_CONSTANT i32 25 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT25:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C25]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT25:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT25]], 8 + ; CHECK-NEXT: [[C26:%[0-9]+]]:_(s32) = G_CONSTANT i32 26 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT26:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C26]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT26:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT26]], 8 + ; CHECK-NEXT: [[C27:%[0-9]+]]:_(s32) = G_CONSTANT i32 27 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT27:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C27]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT27:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT27]], 8 + ; CHECK-NEXT: [[C28:%[0-9]+]]:_(s32) = G_CONSTANT i32 28 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT28:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C28]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT28:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT28]], 8 + ; CHECK-NEXT: [[C29:%[0-9]+]]:_(s32) = G_CONSTANT i32 29 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT29:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C29]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT29:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT29]], 8 + ; CHECK-NEXT: [[C30:%[0-9]+]]:_(s32) = G_CONSTANT i32 30 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT30:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C30]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT30:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT30]], 8 + ; CHECK-NEXT: [[C31:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT31:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<32 x s8>), [[C31]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT31:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT31]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT32:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT32:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT32]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT33:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C1]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT33:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT33]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT34:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C2]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT34:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT34]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT35:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C3]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT35:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT35]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT36:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C4]](s32) + ; 
CHECK-NEXT: [[ASSERT_SEXT36:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT36]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT37:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C5]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT37:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT37]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT38:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C6]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT38:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT38]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT39:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C7]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT39:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT39]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT40:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C8]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT40:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT40]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT41:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C9]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT41:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT41]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT42:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C10]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT42:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT42]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT43:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C11]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT43:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT43]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT44:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C12]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT44:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT44]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT45:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C13]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT45:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT45]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT46:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C14]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT46:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT46]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT47:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C15]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT47:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT47]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT48:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C16]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT48:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT48]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT49:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C17]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT49:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT49]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT50:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C18]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT50:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT50]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT51:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C19]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT51:%[0-9]+]]:_(s32) = G_ASSERT_SEXT 
[[AIE_SEXT_EXTRACT_VECTOR_ELT51]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT52:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C20]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT52:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT52]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT53:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C21]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT53:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT53]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT54:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C22]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT54:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT54]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT55:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C23]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT55:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT55]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT56:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C24]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT56:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT56]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT57:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C25]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT57:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT57]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT58:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C26]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT58:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT58]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT59:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C27]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT59:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT59]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT60:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C28]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT60:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT60]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT61:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C29]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT61:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT61]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT62:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C30]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT62:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT62]], 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT63:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF1]](<32 x s8>), [[C31]](s32) + ; CHECK-NEXT: [[ASSERT_SEXT63:%[0-9]+]]:_(s32) = G_ASSERT_SEXT [[AIE_SEXT_EXTRACT_VECTOR_ELT63]], 8 + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(<64 x s8>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[DEF2]], [[ASSERT_SEXT63]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT1:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT]], [[ASSERT_SEXT62]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT2:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT1]], [[ASSERT_SEXT61]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT3:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT2]], [[ASSERT_SEXT60]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT4:%[0-9]+]]:_(<64 x s8>) = 
G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT3]], [[ASSERT_SEXT59]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT5:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT4]], [[ASSERT_SEXT58]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT6:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT5]], [[ASSERT_SEXT57]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT7:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT6]], [[ASSERT_SEXT56]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT8:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT7]], [[ASSERT_SEXT55]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT9:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT8]], [[ASSERT_SEXT54]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT10:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT9]], [[ASSERT_SEXT53]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT11:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT10]], [[ASSERT_SEXT52]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT12:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT11]], [[ASSERT_SEXT51]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT13:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT12]], [[ASSERT_SEXT50]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT14:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT13]], [[ASSERT_SEXT49]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT15:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT14]], [[ASSERT_SEXT48]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT16:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT15]], [[ASSERT_SEXT47]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT17:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT16]], [[ASSERT_SEXT46]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT18:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT17]], [[ASSERT_SEXT45]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT19:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT18]], [[ASSERT_SEXT44]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT20:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT19]], [[ASSERT_SEXT43]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT21:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT20]], [[ASSERT_SEXT42]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT22:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT21]], [[ASSERT_SEXT41]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT23:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT22]], [[ASSERT_SEXT40]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT24:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT23]], [[ASSERT_SEXT39]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT25:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT24]], [[ASSERT_SEXT38]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT26:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT25]], [[ASSERT_SEXT37]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT27:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT26]], [[ASSERT_SEXT36]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT28:%[0-9]+]]:_(<64 
x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT27]], [[ASSERT_SEXT35]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT29:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT28]], [[ASSERT_SEXT34]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT30:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT29]], [[ASSERT_SEXT33]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT31:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT30]], [[ASSERT_SEXT32]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT32:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT31]], [[ASSERT_SEXT31]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT33:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT32]], [[ASSERT_SEXT30]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT34:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT33]], [[ASSERT_SEXT29]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT35:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT34]], [[ASSERT_SEXT28]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT36:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT35]], [[ASSERT_SEXT27]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT37:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT36]], [[ASSERT_SEXT26]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT38:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT37]], [[ASSERT_SEXT25]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT39:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT38]], [[ASSERT_SEXT24]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT40:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT39]], [[ASSERT_SEXT23]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT41:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT40]], [[ASSERT_SEXT22]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT42:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT41]], [[ASSERT_SEXT21]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT43:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT42]], [[ASSERT_SEXT20]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT44:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT43]], [[ASSERT_SEXT19]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT45:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT44]], [[ASSERT_SEXT18]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT46:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT45]], [[ASSERT_SEXT17]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT47:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT46]], [[ASSERT_SEXT16]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT48:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT47]], [[ASSERT_SEXT15]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT49:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT48]], [[ASSERT_SEXT14]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT50:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT49]], [[ASSERT_SEXT13]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT51:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT50]], [[ASSERT_SEXT12]](s32) + ; CHECK-NEXT: 
[[AIE_ADD_VECTOR_ELT_LEFT52:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT51]], [[ASSERT_SEXT11]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT53:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT52]], [[ASSERT_SEXT10]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT54:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT53]], [[ASSERT_SEXT9]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT55:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT54]], [[ASSERT_SEXT8]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT56:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT55]], [[ASSERT_SEXT7]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT57:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT56]], [[ASSERT_SEXT6]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT58:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT57]], [[ASSERT_SEXT5]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT59:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT58]], [[ASSERT_SEXT4]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT60:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT59]], [[ASSERT_SEXT3]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT61:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT60]], [[ASSERT_SEXT2]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT62:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT61]], [[ASSERT_SEXT1]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT63:%[0-9]+]]:_(<64 x s8>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT62]], [[ASSERT_SEXT]](s32) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_ADD_VECTOR_ELT_LEFT63]](<64 x s8>) + %0:_(<32 x s8>) = G_IMPLICIT_DEF + %1:_(<32 x s8>) = G_IMPLICIT_DEF + %2:_(<64 x s8>) = G_SHUFFLE_VECTOR %0(<32 x s8>), %1(<32 x s8>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63) + PseudoRET implicit $lr, implicit %2 +... 
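+# The 32-bit element tests below rebuild the shuffled result through 512-bit (<16 x s32>)
+# registers: source lanes are read with G_AIE_SEXT_EXTRACT_VECTOR_ELT, pushed in with
+# G_AIE_ADD_VECTOR_ELT_LEFT, and the final value is recovered with G_UNMERGE_VALUES or
+# G_CONCAT_VECTORS.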
+ +--- +name: test_shuffle_vec_128_to_256_32bit +body: | + bb.0: + ; CHECK-LABEL: name: test_shuffle_vec_128_to_256_32bit + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY [[DEF1]](<16 x s32>) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[COPY]], [[DEF]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT1:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT]], [[DEF]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT2:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT1]], [[DEF]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT3:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT2]], [[DEF]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT4:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT3]], [[C]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT5:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT4]], [[C]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT6:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT5]], [[C]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT7:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT6]], [[C]](s32) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[AIE_ADD_VECTOR_ELT_LEFT7]](<16 x s32>) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[UV]](<8 x s32>), [[C1]](s32) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT1:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[UV]](<8 x s32>), [[C2]](s32) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT2:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[UV]](<8 x s32>), [[C3]](s32) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT3:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[UV]](<8 x s32>), [[C4]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT4:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[UV]](<8 x s32>), [[C1]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT5:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[UV]](<8 x s32>), [[C2]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT6:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[UV]](<8 x s32>), [[C3]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT7:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[UV]](<8 x s32>), [[C4]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT8:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[DEF1]], [[AIE_SEXT_EXTRACT_VECTOR_ELT7]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT9:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT8]], [[AIE_SEXT_EXTRACT_VECTOR_ELT6]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT10:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT9]], [[AIE_SEXT_EXTRACT_VECTOR_ELT5]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT11:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT10]], [[AIE_SEXT_EXTRACT_VECTOR_ELT4]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT12:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT 
[[AIE_ADD_VECTOR_ELT_LEFT11]], [[AIE_SEXT_EXTRACT_VECTOR_ELT3]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT13:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT12]], [[AIE_SEXT_EXTRACT_VECTOR_ELT2]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT14:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT13]], [[AIE_SEXT_EXTRACT_VECTOR_ELT1]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT15:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT14]], [[AIE_SEXT_EXTRACT_VECTOR_ELT]](s32) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s32>), [[UV3:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[AIE_ADD_VECTOR_ELT_LEFT15]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV2]](<8 x s32>) + %0:_(s32) = G_CONSTANT i32 42 + %1:_(<4 x s32>) = G_BUILD_VECTOR %0(s32), %0(s32), %0(s32), %0(s32) + %2:_(<8 x s32>) = G_SHUFFLE_VECTOR %1(<4 x s32>), %1(<4 x s32>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7) + PseudoRET implicit $lr, implicit %2 +... + +--- +name: test_shuffle_vec_512_to_1024_32bit +body: | + bb.0: + ; CHECK-LABEL: name: test_shuffle_vec_512_to_1024_32bit + ; CHECK: [[DEF:%[0-9]+]]:_(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT1:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C1]](s32) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT2:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C2]](s32) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT3:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C3]](s32) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT4:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C4]](s32) + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT5:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C5]](s32) + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT6:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C6]](s32) + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT7:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C7]](s32) + ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT8:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C8]](s32) + ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 9 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT9:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C9]](s32) + ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT10:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C10]](s32) + ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 11 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT11:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C11]](s32) + ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CHECK-NEXT: 
[[AIE_SEXT_EXTRACT_VECTOR_ELT12:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C12]](s32) + ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 13 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT13:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C13]](s32) + ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 14 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT14:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C14]](s32) + ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 15 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT15:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C15]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT16:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT17:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C1]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT18:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C2]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT19:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C3]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT20:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C4]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT21:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C5]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT22:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C6]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT23:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C7]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT24:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C8]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT25:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C9]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT26:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C10]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT27:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C11]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT28:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C12]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT29:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C13]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT30:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C14]](s32) + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT31:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<16 x s32>), [[C15]](s32) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY [[DEF]](<16 x s32>) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[COPY]], [[AIE_SEXT_EXTRACT_VECTOR_ELT15]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT1:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT]], [[AIE_SEXT_EXTRACT_VECTOR_ELT14]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT2:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT1]], [[AIE_SEXT_EXTRACT_VECTOR_ELT13]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT3:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT2]], [[AIE_SEXT_EXTRACT_VECTOR_ELT12]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT4:%[0-9]+]]:_(<16 x s32>) = 
G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT3]], [[AIE_SEXT_EXTRACT_VECTOR_ELT11]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT5:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT4]], [[AIE_SEXT_EXTRACT_VECTOR_ELT10]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT6:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT5]], [[AIE_SEXT_EXTRACT_VECTOR_ELT9]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT7:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT6]], [[AIE_SEXT_EXTRACT_VECTOR_ELT8]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT8:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT7]], [[AIE_SEXT_EXTRACT_VECTOR_ELT7]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT9:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT8]], [[AIE_SEXT_EXTRACT_VECTOR_ELT6]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT10:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT9]], [[AIE_SEXT_EXTRACT_VECTOR_ELT5]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT11:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT10]], [[AIE_SEXT_EXTRACT_VECTOR_ELT4]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT12:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT11]], [[AIE_SEXT_EXTRACT_VECTOR_ELT3]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT13:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT12]], [[AIE_SEXT_EXTRACT_VECTOR_ELT2]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT14:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT13]], [[AIE_SEXT_EXTRACT_VECTOR_ELT1]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT15:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT14]], [[AIE_SEXT_EXTRACT_VECTOR_ELT]](s32) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY [[DEF]](<16 x s32>) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT16:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[COPY1]], [[AIE_SEXT_EXTRACT_VECTOR_ELT31]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT17:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT16]], [[AIE_SEXT_EXTRACT_VECTOR_ELT30]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT18:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT17]], [[AIE_SEXT_EXTRACT_VECTOR_ELT29]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT19:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT18]], [[AIE_SEXT_EXTRACT_VECTOR_ELT28]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT20:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT19]], [[AIE_SEXT_EXTRACT_VECTOR_ELT27]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT21:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT20]], [[AIE_SEXT_EXTRACT_VECTOR_ELT26]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT22:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT21]], [[AIE_SEXT_EXTRACT_VECTOR_ELT25]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT23:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT22]], [[AIE_SEXT_EXTRACT_VECTOR_ELT24]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT24:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT23]], [[AIE_SEXT_EXTRACT_VECTOR_ELT23]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT25:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT 
[[AIE_ADD_VECTOR_ELT_LEFT24]], [[AIE_SEXT_EXTRACT_VECTOR_ELT22]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT26:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT25]], [[AIE_SEXT_EXTRACT_VECTOR_ELT21]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT27:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT26]], [[AIE_SEXT_EXTRACT_VECTOR_ELT20]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT28:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT27]], [[AIE_SEXT_EXTRACT_VECTOR_ELT19]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT29:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT28]], [[AIE_SEXT_EXTRACT_VECTOR_ELT18]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT30:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT29]], [[AIE_SEXT_EXTRACT_VECTOR_ELT17]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT31:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT30]], [[AIE_SEXT_EXTRACT_VECTOR_ELT16]](s32) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<32 x s32>) = G_CONCAT_VECTORS [[AIE_ADD_VECTOR_ELT_LEFT15]](<16 x s32>), [[AIE_ADD_VECTOR_ELT_LEFT31]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<32 x s32>) + %0:_(<16 x s32>) = G_IMPLICIT_DEF + %1:_(<32 x s32>) = G_SHUFFLE_VECTOR %0(<16 x s32>), %0(<16 x s32>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31) + PseudoRET implicit $lr, implicit %1 +... + +# ShuffleVec also concatenates constants into vectors +--- +name: test_shuffle_vec_16_to_32 +body: | + bb.0: + ; CHECK-LABEL: name: test_shuffle_vec_16_to_32 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[C2]], [[C1]] + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[C]], [[C1]](s32) + ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL]] + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[BITCAST]](<2 x s16>) + %0:_(s16) = G_CONSTANT i16 8 + %1:_(s16) = G_CONSTANT i16 16 + %2:_(<2 x s16>) = G_SHUFFLE_VECTOR %0(s16), %1(s16), shufflemask(1, 0) + PseudoRET implicit $lr, implicit %2 +--- + +... +# AArch64 supports transforming the same type dst and src, so we do here as well. It has a different +# behaviour in that it enlarges the size of the destination vector, shuffles the bigger vector and +# throws out the higher order numbers. This is similar to AIE's ShuffleVec instruction. 
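+# In the test below, the <8 x s32> result is assembled element by element into a wider
+# <16 x s32> register with G_AIE_ADD_VECTOR_ELT_LEFT and the low half is then taken back out
+# with G_UNMERGE_VALUES.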
+ +--- +name: test_canonised_256_to_512_32bit +body: | + bb.0: + ; CHECK-LABEL: name: test_canonised_256_to_512_32bit + ; CHECK: [[DEF:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT1:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C1]](s32) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT2:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C2]](s32) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT3:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C3]](s32) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT4:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C4]](s32) + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT5:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C5]](s32) + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT6:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C6]](s32) + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; CHECK-NEXT: [[AIE_SEXT_EXTRACT_VECTOR_ELT7:%[0-9]+]]:_(s32) = G_AIE_SEXT_EXTRACT_VECTOR_ELT [[DEF]](<8 x s32>), [[C7]](s32) + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[DEF1]], [[AIE_SEXT_EXTRACT_VECTOR_ELT7]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT1:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT]], [[AIE_SEXT_EXTRACT_VECTOR_ELT6]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT2:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT1]], [[AIE_SEXT_EXTRACT_VECTOR_ELT5]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT3:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT2]], [[AIE_SEXT_EXTRACT_VECTOR_ELT4]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT4:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT3]], [[AIE_SEXT_EXTRACT_VECTOR_ELT3]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT5:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT4]], [[AIE_SEXT_EXTRACT_VECTOR_ELT2]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT6:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT5]], [[AIE_SEXT_EXTRACT_VECTOR_ELT1]](s32) + ; CHECK-NEXT: [[AIE_ADD_VECTOR_ELT_LEFT7:%[0-9]+]]:_(<16 x s32>) = G_AIE_ADD_VECTOR_ELT_LEFT [[AIE_ADD_VECTOR_ELT_LEFT6]], [[AIE_SEXT_EXTRACT_VECTOR_ELT]](s32) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[AIE_ADD_VECTOR_ELT_LEFT7]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV]](<8 x s32>) + %0:_(<8 x s32>) = G_IMPLICIT_DEF + %1:_(<8 x s32>) = G_IMPLICIT_DEF + %2:_(<8 x s32>) = G_SHUFFLE_VECTOR %0(<8 x s32>), %0(<8 x s32>), shufflemask(0, 1, 2, 3, 12, 13, 14, 15) + PseudoRET implicit $lr, implicit %2 diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir 
b/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir new file mode 100644 index 000000000000..5089018c5a9c --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir @@ -0,0 +1,853 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates +# RUN: llc -mtriple aie2 -run-pass=aie2-prelegalizer-combiner %s -verify-machineinstrs -o - | FileCheck %s + +--- +name: concat_vector_32_512 +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512 + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[COPY1]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_32_1024 +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: concat_vector_32_1024 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY $x1 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<32 x s32>) = G_CONCAT_VECTORS [[COPY]](<16 x s32>), [[COPY1]](<16 x s32>) + ; CHECK-NEXT: $y2 = COPY [[CONCAT_VECTORS]](<32 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $y2 + %1:_(<16 x s32>) = COPY $x0 + %2:_(<16 x s32>) = COPY $x1 + %0:_(<32 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31) + $y2 = COPY %0:_(<32 x s32>) + PseudoRET implicit $lr, implicit $y2 +... + +--- +name: concat_vector_32_256 +legalized: false +body: | + bb.1.entry: + liveins: $wl0 + ; CHECK-LABEL: name: concat_vector_32_256 + ; CHECK: liveins: $wl0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl0 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[COPY]](<8 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[UV1]](<4 x s32>), [[UV]](<4 x s32>) + ; CHECK-NEXT: $wl0 = COPY [[CONCAT_VECTORS]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $wl0 + %1:_(<8 x s32>) = COPY $wl0 + %2:_(<4 x s32>), %3:_(<4 x s32>) = G_UNMERGE_VALUES %1:_(<8 x s32>) + %0:_(<8 x s32>) = G_SHUFFLE_VECTOR %3:_(<4 x s32>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7) + $wl0 = COPY %0:_(<8 x s32>) + PseudoRET implicit $lr, implicit $wl0 +... 
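+# The same whole-vector concatenation combine applies to 16-bit and 8-bit element vectors.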
+ +--- +name: concat_vector_16_512 +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_16_512 + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s16>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s16>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<32 x s16>) = G_CONCAT_VECTORS [[COPY]](<16 x s16>), [[COPY1]](<16 x s16>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<32 x s16>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<16 x s16>) = COPY $wl2 + %2:_(<16 x s16>) = COPY $wl4 + %0:_(<32 x s16>) = G_SHUFFLE_VECTOR %1:_(<16 x s16>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31) + $x0 = COPY %0:_(<32 x s16>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_8_512 +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_8_512 + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s8>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<32 x s8>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<64 x s8>) = G_CONCAT_VECTORS [[COPY]](<32 x s8>), [[COPY1]](<32 x s8>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<64 x s8>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<32 x s8>) = COPY $wl2 + %2:_(<32 x s8>) = COPY $wl4 + %0:_(<64 x s8>) = G_SHUFFLE_VECTOR %1:_(<32 x s8>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63) + $x0 = COPY %0:_(<64 x s8>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_32_512_second_end +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512_second_end + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[COPY1]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1, -1, -1, -1) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: extract_vector_1024_to_512 +legalized: false +body: | + bb.1.entry: + liveins: $y2 + ; CHECK-LABEL: name: extract_vector_1024_to_512 + ; CHECK: liveins: $y2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s32>) = COPY $y2 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s32>), [[UV1:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[COPY]](<32 x s32>) + ; CHECK-NEXT: $x0 = COPY [[UV]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<32 x s32>) = COPY $y2 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<32 x s32>), %1:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... 
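+# Undef (-1) mask lanes in the concat_vector_32_512_* variants are don't-care: the shuffle
+# still combines into G_CONCAT_VECTORS, and a half made up entirely of undef lanes becomes a
+# G_IMPLICIT_DEF operand.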
+ +--- +name: concat_vector_32_512_first_start +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512_first_start + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[COPY1]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(-1, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: extract_vector_1024_to_256 +legalized: false +body: | + bb.1.entry: + liveins: $y2 + ; CHECK-LABEL: name: extract_vector_1024_to_256 + ; CHECK: liveins: $y2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s32>) = COPY $y2 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s32>), [[UV1:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[COPY]](<32 x s32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s32>), [[UV3:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[UV]](<16 x s32>) + ; CHECK-NEXT: $wl0 = COPY [[UV2]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<32 x s32>) = COPY $y2 + %0:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<32 x s32>), %1:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7) + $wl0 = COPY %0:_(<8 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_32_512_first_end +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512_first_end + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[COPY1]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(0, 1, 2, 3, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_32_512_second_start +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512_second_start + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[COPY1]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, -1, 12, 13, 14, 15) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... 
+ +--- +name: concat_vector_32_512_first_block +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512_first_block + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[DEF]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_32_512_second_block +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512_second_block + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[DEF]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_32_512_random +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512_random + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[COPY1]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(0, -1, 2, -1, 4, -1, -1, 7, 8, 9, -1, 11, 12, -1, 14, -1) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: extract_vector_1024_to_128 +legalized: false +body: | + bb.1.entry: + liveins: $y2 + ; CHECK-LABEL: name: extract_vector_1024_to_128 + ; CHECK: liveins: $y2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s32>) = COPY $y2 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s32>), [[UV1:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[COPY]](<32 x s32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s32>), [[UV3:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[UV]](<16 x s32>) + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[UV2]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_UNPAD_VECTOR]](<4 x s32>) + %1:_(<32 x s32>) = COPY $y2 + %0:_(<4 x s32>) = G_SHUFFLE_VECTOR %1:_(<32 x s32>), %1:_, shufflemask(0, 1, 2, 3) + PseudoRET implicit $lr, implicit %0 +... 
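+# Extracting a smaller slice of a wide vector combines into a chain of G_UNMERGE_VALUES that
+# picks the half containing the requested lanes, with G_AIE_UNPAD_VECTOR used for the
+# 256-bit to 128-bit step.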
+ +--- +name: extract_vector_1024_to_32 +legalized: false +body: | + bb.1.entry: + liveins: $y2 + ; CHECK-LABEL: name: extract_vector_1024_to_32 + ; CHECK: liveins: $y2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<128 x s8>) = COPY $y2 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<64 x s8>), [[UV1:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY]](<128 x s8>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<32 x s8>), [[UV3:%[0-9]+]]:_(<32 x s8>) = G_UNMERGE_VALUES [[UV]](<64 x s8>) + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<16 x s8>) = G_AIE_UNPAD_VECTOR [[UV2]](<32 x s8>) + ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(<8 x s8>), [[UV5:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[AIE_UNPAD_VECTOR]](<16 x s8>) + ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(<4 x s8>), [[UV7:%[0-9]+]]:_(<4 x s8>) = G_UNMERGE_VALUES [[UV4]](<8 x s8>) + ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(<2 x s8>), [[UV9:%[0-9]+]]:_(<2 x s8>) = G_UNMERGE_VALUES [[UV6]](<4 x s8>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV8]](<2 x s8>) + %1:_(<128 x s8>) = COPY $y2 + %0:_(<2 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %1:_, shufflemask(0, 1) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_second_half_512_to_256 +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: extract_vector_second_half_512_to_256 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<8 x s32>) + %1:_(<16 x s32>) = COPY $x0 + %2:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %1:_(<16 x s32>), shufflemask(8, 9, 10, 11, 12, 13, 14, 15) + PseudoRET implicit $lr, implicit %2 +... + +--- +name: extract_vector_second_half_512_to_128 +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: extract_vector_second_half_512_to_128 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<4 x s32>), [[UV3:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[UV]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV3]](<4 x s32>) + %1:_(<16 x s32>) = COPY $x0 + %2:_(<4 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %1:_(<16 x s32>), shufflemask(4, 5, 6, 7) + PseudoRET implicit $lr, implicit %2 +... + +--- +name: extract_vector_second_half_1024_to_512 +legalized: false +body: | + bb.1.entry: + liveins: $y2, $y3 + ; CHECK-LABEL: name: extract_vector_second_half_1024_to_512 + ; CHECK: liveins: $y2, $y3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<128 x s8>) = COPY $y2 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<64 x s8>), [[UV1:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY]](<128 x s8>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<64 x s8>) + %1:_(<128 x s8>) = COPY $y2 + %2:_(<64 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %1:_(<128 x s8>), shufflemask(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127) + PseudoRET implicit $lr, implicit %2 +... 
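+# The third_half and fourth_half tests check masks that index only the second operand: the
+# combine resolves to that operand and extracts the requested part from it.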
+ +--- +name: extract_vector_second_half_1024_to_32 +legalized: false +body: | + bb.1.entry: + liveins: $y2, $y3 + ; CHECK-LABEL: name: extract_vector_second_half_1024_to_32 + ; CHECK: liveins: $y2, $y3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<128 x s8>) = COPY $y2 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<64 x s8>), [[UV1:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY]](<128 x s8>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<32 x s8>), [[UV3:%[0-9]+]]:_(<32 x s8>) = G_UNMERGE_VALUES [[UV]](<64 x s8>) + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<16 x s8>) = G_AIE_UNPAD_VECTOR [[UV2]](<32 x s8>) + ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(<8 x s8>), [[UV5:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[AIE_UNPAD_VECTOR]](<16 x s8>) + ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(<4 x s8>), [[UV7:%[0-9]+]]:_(<4 x s8>) = G_UNMERGE_VALUES [[UV4]](<8 x s8>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV7]](<4 x s8>) + %1:_(<128 x s8>) = COPY $y2 + %2:_(<4 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %1:_(<128 x s8>), shufflemask(4, 5, 6, 7) + PseudoRET implicit $lr, implicit %2 +... + +--- +name: extract_vector_third_half_1024 +legalized: false +body: | + bb.1.entry: + liveins: $y2, $y3 + ; CHECK-LABEL: name: extract_vector_third_half_1024 + ; CHECK: liveins: $y2, $y3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s32>) = COPY $y3 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s32>), [[UV1:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[COPY]](<32 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV]](<16 x s32>) + %1:_(<32 x s32>) = COPY $y2 + %2:_(<32 x s32>) = COPY $y3 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<32 x s32>), %2:_, shufflemask(32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_third_half_512 +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: extract_vector_third_half_512 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV]](<8 x s32>) + %1:_(<16 x s32>) = COPY $x0 + %2:_(<16 x s32>) = COPY $x1 + %0:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_, shufflemask(16, 17, 18, 19, 20, 21, 22, 23) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_third_half_256 +legalized: false +body: | + bb.1.entry: + liveins: $wl0, $wl1 + ; CHECK-LABEL: name: extract_vector_third_half_256 + ; CHECK: liveins: $wl0, $wl1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl1 + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_UNPAD_VECTOR]](<4 x s32>) + %1:_(<8 x s32>) = COPY $wl0 + %2:_(<8 x s32>) = COPY $wl1 + %0:_(<4 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, 10, 11) + PseudoRET implicit $lr, implicit %0 +... 
+ +--- +name: extract_vector_third_half_128 +legalized: false +body: | + bb.1.entry: + liveins: $q0, $q1 + ; CHECK-LABEL: name: extract_vector_third_half_128 + ; CHECK: liveins: $q0, $q1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV]](<2 x s32>) + %1:_(<4 x s32>) = COPY $q0 + %2:_(<4 x s32>) = COPY $q1 + %0:_(<2 x s32>) = G_SHUFFLE_VECTOR %1:_(<4 x s32>), %2:_, shufflemask(4, 5) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_fourth_half_1024 +legalized: false +body: | + bb.1.entry: + liveins: $y2, $y3 + ; CHECK-LABEL: name: extract_vector_fourth_half_1024 + ; CHECK: liveins: $y2, $y3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s32>) = COPY $y3 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s32>), [[UV1:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[COPY]](<32 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<16 x s32>) + %1:_(<32 x s32>) = COPY $y2 + %2:_(<32 x s32>) = COPY $y3 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<32 x s32>), %2:_, shufflemask(48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_fourth_half_512 +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: extract_vector_fourth_half_512 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<8 x s32>) + %1:_(<16 x s32>) = COPY $x0 + %2:_(<16 x s32>) = COPY $x1 + %0:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_, shufflemask(24,25,26,27,28,29,30,31) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_fourth_half_256 +legalized: false +body: | + bb.1.entry: + liveins: $wl0, $wl1 + ; CHECK-LABEL: name: extract_vector_fourth_half_256 + ; CHECK: liveins: $wl0, $wl1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[COPY]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<4 x s32>) + %1:_(<8 x s32>) = COPY $wl0 + %2:_(<8 x s32>) = COPY $wl1 + %0:_(<4 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(12,13,14,15) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_fourth_half_128 +legalized: false +body: | + bb.1.entry: + liveins: $q0, $q1 + ; CHECK-LABEL: name: extract_vector_fourth_half_128 + ; CHECK: liveins: $q0, $q1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<2 x s32>) + %1:_(<4 x s32>) = COPY $q0 + %2:_(<4 x s32>) = COPY $q1 + %0:_(<2 x s32>) = G_SHUFFLE_VECTOR %1:_(<4 x s32>), %2:_, shufflemask(6,7) + PseudoRET implicit $lr, implicit %0 +... 
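+# Shuffles that take the low half of each source combine into an unmerge (G_AIE_UNPAD_VECTOR
+# for 256-bit sources) of each operand followed by a single G_CONCAT_VECTORS, in the order
+# given by the mask.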
+ +--- +name: insert_vector_16_elements +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: insert_vector_16_elements + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY $x1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s32>), [[UV3:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY1]](<16 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[UV]](<8 x s32>), [[UV2]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<16 x s32>) + %1:_(<16 x s32>) = COPY $x0 + %2:_(<16 x s32>) = COPY $x1 + %3:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_(<16 x s32>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23) + PseudoRET implicit $lr, implicit %3 +... + +--- +name: insert_vector_8_elements +legalized: false +body: | + bb.1.entry: + liveins: $wl0, $wl1 + ; CHECK-LABEL: name: insert_vector_8_elements + ; CHECK: liveins: $wl0, $wl1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl1 + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY]](<8 x s32>) + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY1]](<8 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[AIE_UNPAD_VECTOR]](<4 x s32>), [[AIE_UNPAD_VECTOR1]](<4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<8 x s32>) + %1:_(<8 x s32>) = COPY $wl0 + %2:_(<8 x s32>) = COPY $wl1 + %3:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_(<8 x s32>), shufflemask(0, 1, 2, 3, 8, 9, 10, 11) + PseudoRET implicit $lr, implicit %3 +... + +--- +name: insert_vector_128_elements +legalized: false +body: | + bb.1.entry: + liveins: $y2, $y3 + ; CHECK-LABEL: name: insert_vector_128_elements + ; CHECK: liveins: $y2, $y3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<128 x s8>) = COPY $y2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<128 x s8>) = COPY $y3 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<64 x s8>), [[UV1:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY]](<128 x s8>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<64 x s8>), [[UV3:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY1]](<128 x s8>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<128 x s8>) = G_CONCAT_VECTORS [[UV]](<64 x s8>), [[UV2]](<64 x s8>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<128 x s8>) + %1:_(<128 x s8>) = COPY $y2 + %2:_(<128 x s8>) = COPY $y3 + %3:_(<128 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %2:_(<128 x s8>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191) + PseudoRET implicit $lr, implicit %3 +... 
+ +--- +name: insert_vector_16_elements_reverse +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: insert_vector_16_elements_reverse + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY $x1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s32>), [[UV3:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY1]](<16 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[UV2]](<8 x s32>), [[UV]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<16 x s32>) + %1:_(<16 x s32>) = COPY $x0 + %2:_(<16 x s32>) = COPY $x1 + %3:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_(<16 x s32>), shufflemask(16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7) + PseudoRET implicit $lr, implicit %3 +... + +--- +name: insert_vector_8_elements_reverse +legalized: false +body: | + bb.1.entry: + liveins: $wl0, $wl1 + ; CHECK-LABEL: name: insert_vector_8_elements_reverse + ; CHECK: liveins: $wl0, $wl1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl1 + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY]](<8 x s32>) + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY1]](<8 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[AIE_UNPAD_VECTOR1]](<4 x s32>), [[AIE_UNPAD_VECTOR]](<4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<8 x s32>) + %1:_(<8 x s32>) = COPY $wl0 + %2:_(<8 x s32>) = COPY $wl1 + %3:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_(<8 x s32>), shufflemask(8, 9, 10, 11, 0, 1, 2, 3) + PseudoRET implicit $lr, implicit %3 +... + +--- +name: insert_vector_128_elements_reverse +legalized: false +body: | + bb.1.entry: + liveins: $y2, $y3 + ; CHECK-LABEL: name: insert_vector_128_elements_reverse + ; CHECK: liveins: $y2, $y3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<128 x s8>) = COPY $y2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<128 x s8>) = COPY $y3 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<64 x s8>), [[UV1:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY]](<128 x s8>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<64 x s8>), [[UV3:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY1]](<128 x s8>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<128 x s8>) = G_CONCAT_VECTORS [[UV2]](<64 x s8>), [[UV]](<64 x s8>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<128 x s8>) + %1:_(<128 x s8>) = COPY $y2 + %2:_(<128 x s8>) = COPY $y3 + %3:_(<128 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %2:_(<128 x s8>), shufflemask(128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63) + PseudoRET implicit $lr, implicit %3 +... 
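+
+# The reversed concat tests below use masks that place the second source before the
+# first; the -1 (undef) lanes check that partially undefined masks are still folded,
+# with a wholly undefined half becoming a G_IMPLICIT_DEF operand of the concat.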
+ +--- +name: concat_vector_reverse_32_512 +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512 + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY1]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_reverse_32_512_undef_start_first +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512_undef_start_first + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY1]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(-1, -1, -1, -1, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_reverse_32_512_start_end +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512_start_end + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY1]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, 10, 11, 12, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_reverse_32_512_end_start +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512_end_start + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY1]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, 4, 5, 6, 7) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... 
+ +--- +name: concat_vector_reverse_32_512_end_end +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512_end_end + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY1]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, -1, -1, -1, -1) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_reverse_32_512_first_block +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512_first_block + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[DEF]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(-1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_reverse_32_512_second_block +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512_second_block + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[DEF]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_reverse_32_512_random +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512_random + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY1]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, -1, 11, 12, 13, -1, 15, 0, 1, -1, 3, 4, 5, -1, 7) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... 
diff --git a/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll b/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll
new file mode 100644
index 000000000000..0284bbbe9d7f
--- /dev/null
+++ b/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+;
+; This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+; See https://llvm.org/LICENSE.txt for license information.
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+;
+; (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates
+; RUN: llc -O2 -mtriple=aie2 -verify-machineinstrs --issue-limit=1 %s -o - | FileCheck %s
+
+define <8 x i32> @test_extract_vector(<16 x i32> noundef %a, i32 noundef %idx) {
+; CHECK-LABEL: test_extract_vector:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: nopb ; nopa ; nops ; jz r0, #.LBB0_2; nopv
+; CHECK-NEXT: nopa ; nopx // Delay Slot 5
+; CHECK-NEXT: nop // Delay Slot 4
+; CHECK-NEXT: nop // Delay Slot 3
+; CHECK-NEXT: vmov x0, x2 // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+; CHECK-NEXT: // %bb.1: // %if.end
+; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmov wl0, wh0; nopv
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: .LBB0_2: // %return
+; CHECK-NEXT: nopa ; ret lr
+; CHECK-NEXT: nop // Delay Slot 5
+; CHECK-NEXT: nop // Delay Slot 4
+; CHECK-NEXT: nop // Delay Slot 3
+; CHECK-NEXT: nop // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+ %cmp = icmp eq i32 %idx, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ %shuffle = shufflevector <16 x i32> %a, <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ br label %return
+
+if.end:
+ %shuffle1 = shufflevector <16 x i32> %a, <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ br label %return
+
+return:
+ %retval.0 = phi <8 x i32> [ %shuffle, %if.then ], [ %shuffle1, %if.end ]
+ ret <8 x i32> %retval.0
+}
+
+define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, <8 x i32> noundef %b) {
+; CHECK-LABEL: test_insert_vector:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: nopb ; nopa ; nops ; jz r0, #.LBB1_2; nopv
+; CHECK-NEXT: nopa ; nopx // Delay Slot 5
+; CHECK-NEXT: nop // Delay Slot 4
+; CHECK-NEXT: nop // Delay Slot 3
+; CHECK-NEXT: nop // Delay Slot 2
+; CHECK-NEXT: vmov wl0, wl4 // Delay Slot 1
+; CHECK-NEXT: // %bb.1: // %if.end
+; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv
+; CHECK-NEXT: nopx // Delay Slot 5
+; CHECK-NEXT: vmov wh2, wl0 // Delay Slot 4
+; CHECK-NEXT: nop // Delay Slot 3
+; CHECK-NEXT: vmov x0, x2 // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: .LBB1_2: // %if.then
+; CHECK-NEXT: ret lr
+; CHECK-NEXT: nop // Delay Slot 5
+; CHECK-NEXT: nop // Delay Slot 4
+; CHECK-NEXT: nop // Delay Slot 3
+; CHECK-NEXT: vmov wh0, wl2 // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+ %shuffle = shufflevector <8 x i32> %b, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq i32 %idx, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ %shuffle1 = shufflevector <16 x i32> %shuffle, <16 x i32> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ br label %cleanup
+
+if.end:
+ %shuffle2 = shufflevector <16 x i32> %a, <16 x i32> %shuffle, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ br label %cleanup
+
+cleanup:
+ %retval.0 = phi <16 x i32> [ %shuffle1, %if.then ], [ %shuffle2, %if.end ]
+ ret <16 x i32> %retval.0
+}
+
+define <16 x i32> @test_concat_vector(<8 x i32> noundef %a, <8 x i32> noundef %b) {
+; CHECK-LABEL: test_concat_vector:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv
+; CHECK-NEXT: nopx // Delay Slot 5
+; CHECK-NEXT: nop // Delay Slot 4
+; CHECK-NEXT: vmov wl0, wl2 // Delay Slot 3
+; CHECK-NEXT: vmov wh0, wl4 // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
+entry:
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i32> %shuffle
+}
+
+define <16 x i32> @test_set_vector(i32 noundef %idx, <8 x i32> noundef %a) {
+; CHECK-LABEL: test_set_vector:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: mov r1, r16
+; CHECK-NEXT: eqz r0, r0
+; CHECK-NEXT: ret lr
+; CHECK-NEXT: vmov wh0, wl2 // Delay Slot 5
+; CHECK-NEXT: vmov wl0, wl2 // Delay Slot 4
+; CHECK-NEXT: add r16, r0, #-1 // Delay Slot 3
+; CHECK-NEXT: vsel.32 x0, x0, x0, r16 // Delay Slot 2
+; CHECK-NEXT: mov r16, r1 // Delay Slot 1
+entry:
+ %cmp = icmp eq i32 %idx, 0
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %shuffle1 = shufflevector <8 x i32> %a, <8 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %retval.0 = select i1 %cmp, <16 x i32> %shuffle, <16 x i32> %shuffle1
+ ret <16 x i32> %retval.0
+}
+
+define i32 @test_extract_elem(<8 x i32> noundef %a, i32 noundef %idx) {
+; CHECK-LABEL: test_extract_elem:
+; CHECK: .p2align 4
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: nopa ; nopb ; ret lr ; nopm ; nops
+; CHECK-NEXT: mov r2, r16 // Delay Slot 5
+; CHECK-NEXT: mov r16, r1 // Delay Slot 4
+; CHECK-NEXT: vextract.s32 r0, x0, r16 // Delay Slot 3
+; CHECK-NEXT: nop // Delay Slot 2
+; CHECK-NEXT: mov r16, r2 // Delay Slot 1
+entry:
+ %vecext = extractelement <8 x i32> %a, i32 %idx
+ ret i32 %vecext
+}