diff --git a/host/lib/convert/CMakeLists.txt b/host/lib/convert/CMakeLists.txt index aee62c82f4..7f7d608070 100644 --- a/host/lib/convert/CMakeLists.txt +++ b/host/lib/convert/CMakeLists.txt @@ -9,29 +9,51 @@ # This file included, use CMake directory variables ######################################################################## include(CheckIncludeFileCXX) +include(CheckCXXCompilerFlag) message(STATUS "") ######################################################################## -# Check for SSE2 SIMD headers +# Check for SIMD headers ######################################################################## + +# Check for SSE2 support +check_cxx_compiler_flag("-msse2" SSE2_SUPPORTED) +if(SSE2_SUPPORTED) + message(STATUS "SSE2 is supported") +endif(SSE2_SUPPORTED) + +# Check for SSE3 support +check_cxx_compiler_flag("-msse3" SSE3_SUPPORTED) +if(SSE3_SUPPORTED) + message(STATUS "SSE3 is supported") + set(SSE2_SUPPORTED OFF) +endif(SSE3_SUPPORTED) + +# Check for AVX2 support +check_cxx_compiler_flag("-mavx2" AVX2_SUPPORTED) +# set(AVX2_SUPPORTED OFF) +if(AVX2_SUPPORTED) + message(STATUS "AVX2 is supported") + # set(SSE3_SUPPORTED OFF) +endif(AVX2_SUPPORTED) + +# Check for AVX2 support +check_cxx_compiler_flag("-mavx512" AVX512_SUPPORTED) +if(AVX512_SUPPORTED) + message(STATUS "AVX512 is supported") + set(AVX2_SUPPORTED OFF) +endif(AVX512_SUPPORTED) + if(CMAKE_COMPILER_IS_GNUCXX) - set(EMMINTRIN_FLAGS -msse2) - set(TMMINTRIN_FLAGS -mssse3) + set(SSE2_FLAGS -msse2) + set(SSE3_FLAGS -mssse3) + set(AVX2_FLAGS -mavx2) + set(AVX512_FLAGS -mavx512) elseif(MSVC) - set(EMMINTRIN_FLAGS /arch:SSE2) + set(SSE2_FLAGS /arch:SSE2) endif() -set(CMAKE_REQUIRED_FLAGS ${EMMINTRIN_FLAGS}) -CHECK_INCLUDE_FILE_CXX(emmintrin.h HAVE_EMMINTRIN_H) -unset(CMAKE_REQUIRED_FLAGS) - -if(ENABLE_SSSE3) -set(CMAKE_REQUIRED_FLAGS ${TMMINTRIN_FLAGS}) -CHECK_INCLUDE_FILE_CXX(tmmintrin.h HAVE_TMMINTRIN_H) -unset(CMAKE_REQUIRED_FLAGS) -endif(ENABLE_SSSE3) - -if(HAVE_EMMINTRIN_H) +if(SSE2_SUPPORTED) set(convert_with_sse2_sources ${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc16_to_sc16.cpp ${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc16_to_fc64.cpp @@ -45,22 +67,41 @@ if(HAVE_EMMINTRIN_H) ) set_source_files_properties( ${convert_with_sse2_sources} - PROPERTIES COMPILE_FLAGS "${EMMINTRIN_FLAGS}" + PROPERTIES COMPILE_FLAGS "${SSE2_FLAGS}" ) LIBUHD_APPEND_SOURCES(${convert_with_sse2_sources}) -endif(HAVE_EMMINTRIN_H) +endif(SSE2_SUPPORTED) -if(HAVE_TMMINTRIN_H) +if(SSE3_SUPPORTED) set(convert_with_ssse3_sources ${CMAKE_CURRENT_SOURCE_DIR}/ssse3_pack_sc12.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ssse3_unpack_sc12.cpp ) set_source_files_properties( ${convert_with_ssse3_sources} - PROPERTIES COMPILE_FLAGS "${TMMINTRIN_FLAGS}" + PROPERTIES COMPILE_FLAGS "${SSE3_FLAGS}" ) LIBUHD_APPEND_SOURCES(${convert_with_ssse3_sources}) -endif(HAVE_TMMINTRIN_H) +endif(SSE3_SUPPORTED) + +if(AVX2_SUPPORTED) + set(convert_with_avx2_sources + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc16_to_sc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc16_to_fc64.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc16_to_fc32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc8_to_fc64.cpp # AVX2 conversion is not efficient as SSE2 for this case + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc8_to_fc32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc64_to_sc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc32_to_sc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc64_to_sc8.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc32_to_sc8.cpp + ) + set_source_files_properties( + ${convert_with_avx2_sources} + PROPERTIES COMPILE_FLAGS "${AVX2_FLAGS} ${SSE2_FLAGS}" + ) + 
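A note on the flag probes above: check_cxx_compiler_flag() only establishes that the compiler accepts the flag; it says nothing about whether the CPU the binary eventually runs on can execute AVX2 or AVX-512 code, so host-CPU support still has to be established at run time before these converters are preferred. Also, GCC and Clang expose the AVX-512 family as per-subset flags (-mavx512f, -mavx512bw, ...) rather than a bare -mavx512, so a probe along the following lines is more likely to succeed (a sketch, not part of this patch; AVX512F_SUPPORTED is a hypothetical variable name):

    check_cxx_compiler_flag("-mavx512f" AVX512F_SUPPORTED)
    if(AVX512F_SUPPORTED)
        message(STATUS "AVX512F is supported")
        set(AVX512_FLAGS -mavx512f)
    endif(AVX512F_SUPPORTED)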
LIBUHD_APPEND_SOURCES(${convert_with_avx2_sources}) +endif(AVX2_SUPPORTED) ######################################################################## # Check for NEON SIMD headers diff --git a/host/lib/convert/avx2_fc32_to_sc16.cpp b/host/lib/convert/avx2_fc32_to_sc16.cpp new file mode 100644 index 0000000000..bf18c41f13 --- /dev/null +++ b/host/lib/convert/avx2_fc32_to_sc16.cpp @@ -0,0 +1,193 @@ +// +// Copyright 2024 Ettus Research, a National Instruments Brand +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD) +{ + const fc32_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + +// this macro converts values faster by using SSE intrinsics to convert 4 values at a time +#define convert_fc32_1_to_item32_1_nswap_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256 tmplo = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 0)); \ + __m256 tmphi = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 4)); \ + \ + /* convert and scale */ \ + __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar)); \ + __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar)); \ + \ + __m256i shuffled_lo = _mm256_permute2x128_si256( \ + tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */ \ + __m256i shuffled_hi = _mm256_permute2x128_si256( \ + tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */ \ + \ + /* now pack the shuffled data sequentially */ \ + __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi); \ + \ + /* pack + swap 16-bit pairs */ \ + tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + \ + /* store to output */ \ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); \ + } + + size_t i = 0; + + // need to dispatch according to alignment for fastest conversion + switch (size_t(input) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + convert_fc32_1_to_item32_1_nswap_guts(_) break; + case 0x8: + // the first sample is 8-byte aligned - process it to align the remainder of + // the samples to 16-bytes + xx_to_item32_sc16(input, output, 1, scale_factor); + i++; + // do faster processing of the bulk of the samples now that we are 16-byte + // aligned + convert_fc32_1_to_item32_1_nswap_guts(_) break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load + convert_fc32_1_to_item32_1_nswap_guts(u_) + } + + // convert any remaining samples + xx_to_item32_sc16(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD) +{ + const fc32_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + +// this macro converts values faster by using AVX2 intrinsics to convert 8 values at a +// time +#define convert_fc32_1_to_item32_1_bswap_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256 tmplo = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 0)); \ + __m256 tmphi = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 4)); \ + \ + /* convert and scale */ \ + __m256i tmpilo = 
_mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar)); \ + __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar)); \ + \ + __m256i shuffled_lo = _mm256_permute2x128_si256( \ + tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */ \ + __m256i shuffled_hi = _mm256_permute2x128_si256( \ + tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */ \ + \ + /* Now pack the shuffled data sequentially */ \ + __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi); \ + \ + tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); \ + \ + /* store to output */ \ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); \ + } + + size_t i = 0; + + // need to dispatch according to alignment for fastest conversion + switch (size_t(input) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + convert_fc32_1_to_item32_1_bswap_guts(_) break; + case 0x8: + // the first value is 8-byte aligned - process it and prepare the bulk of the + // data for fast conversion + xx_to_item32_sc16(input, output, 1, scale_factor); + i++; + // do faster processing of the remaining samples now that we are 16-byte + // aligned + convert_fc32_1_to_item32_1_bswap_guts(_) break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load + convert_fc32_1_to_item32_1_bswap_guts(u_) + } + + // convert any remaining samples + xx_to_item32_sc16(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc32, 1, sc16_chdr, 1, PRIORITY_SIMD) +{ + const fc32_t* input = reinterpret_cast(inputs[0]); + sc16_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + +// this macro converts values faster by using SSE intrinsics to convert 4 values at a time +#define convert_fc32_1_to_item32_1_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256 tmplo = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 0)); \ + __m256 tmphi = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 4)); \ + \ + /* convert and scale */ \ + __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar)); \ + __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar)); \ + \ + /* mm256_packs_epi32 is not sequential, it needs to be split into m128i */ \ + __m256i shuffled_lo = _mm256_permute2x128_si256( \ + tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */ \ + __m256i shuffled_hi = _mm256_permute2x128_si256( \ + tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */ \ + \ + /* Now pack the shuffled data sequentially */ \ + __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi); \ + \ + /* store to output */ \ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); \ + } + + size_t i = 0; + + // need to dispatch according to alignment for fastest conversion + switch (size_t(input) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + convert_fc32_1_to_item32_1_guts(_) break; + case 0x8: + // the first sample is 8-byte aligned - process it to align the remainder of + // the samples to 16-bytes + xx_to_chdr_sc16(input, output, 1, scale_factor); + i++; + // do faster processing of the bulk of the samples now that we are 16-byte + // aligned + convert_fc32_1_to_item32_1_guts(_) break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load + 
convert_fc32_1_to_item32_1_guts(u_) + } + + // convert any remaining samples + xx_to_chdr_sc16(input + i, output + i, nsamps - i, scale_factor); +} diff --git a/host/lib/convert/avx2_fc32_to_sc8.cpp b/host/lib/convert/avx2_fc32_to_sc8.cpp new file mode 100644 index 0000000000..60a0407e37 --- /dev/null +++ b/host/lib/convert/avx2_fc32_to_sc8.cpp @@ -0,0 +1,123 @@ +// +// Copyright 2024 Ettus Research, a National Instruments Company +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +template +UHD_INLINE __m256i pack_sc32_4x(const __m256& in0, + const __m256& in1, + const __m256& in2, + const __m256& in3, + const __m256& scalar) +{ + __m256i tmpi0 = _mm256_cvtps_epi32(_mm256_mul_ps(in0, scalar)); + tmpi0 = _mm256_shuffle_epi32(tmpi0, shuf); + __m256i tmpi1 = _mm256_cvtps_epi32(_mm256_mul_ps(in1, scalar)); + tmpi1 = _mm256_shuffle_epi32(tmpi1, shuf); + + __m256i shuf_lo_lo = _mm256_permute2x128_si256(tmpi0, tmpi1, 0x20); + __m256i shuf_lo_hi = _mm256_permute2x128_si256(tmpi0, tmpi1, 0x31); + const __m256i lo = _mm256_packs_epi32(shuf_lo_lo, shuf_lo_hi); + + __m256i tmpi2 = _mm256_cvtps_epi32(_mm256_mul_ps(in2, scalar)); + tmpi2 = _mm256_shuffle_epi32(tmpi2, shuf); + __m256i tmpi3 = _mm256_cvtps_epi32(_mm256_mul_ps(in3, scalar)); + tmpi3 = _mm256_shuffle_epi32(tmpi3, shuf); + + __m256i shuf_hi_lo = _mm256_permute2x128_si256(tmpi2, tmpi3, 0x20); + __m256i shuf_hi_hi = _mm256_permute2x128_si256(tmpi2, tmpi3, 0x31); + const __m256i hi = _mm256_packs_epi32(shuf_hi_lo, shuf_hi_hi); + + __m256i shuf_lo = _mm256_permute2x128_si256(lo, hi, 0x20); + __m256i shuf_hi = _mm256_permute2x128_si256(lo, hi, 0x31); + + return _mm256_packs_epi16(shuf_lo, shuf_hi); +} + +DECLARE_CONVERTER(fc32, 1, sc8_item32_be, 1, PRIORITY_SIMD) +{ + const fc32_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + const int shuf = _MM_SHUFFLE(3, 2, 1, 0); + +#define convert_fc32_1_to_sc8_item32_1_bswap_guts(_al_) \ + for (size_t j = 0; i + 15 < nsamps; i += 16, j += 8) { \ + /* load from input */ \ + __m256 tmp0 = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 0)); \ + __m256 tmp1 = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 4)); \ + __m256 tmp2 = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 8)); \ + __m256 tmp3 = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 12)); \ + \ + /* convert */ \ + const __m256i tmpi = pack_sc32_4x(tmp0, tmp1, tmp2, tmp3, scalar); \ + \ + /* store to output */ \ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + j), tmpi); \ + } + + + size_t i = 0; + + // dispatch according to alignment + if ((size_t(input) & 0xf) == 0) { + convert_fc32_1_to_sc8_item32_1_bswap_guts(_) + } else { + convert_fc32_1_to_sc8_item32_1_bswap_guts(u_) + } + + // convert remainder + xx_to_item32_sc8(input + i, output + (i / 2), nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc32, 1, sc8_item32_le, 1, PRIORITY_SIMD) +{ + const fc32_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + const int shuf = _MM_SHUFFLE(0, 1, 2, 3); + +#define convert_fc32_1_to_sc8_item32_1_nswap_guts(_al_) \ + for (size_t j = 0; i + 15 < nsamps; i += 16, j += 8) { \ + /* load from input */ \ + __m256 tmp0 = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 0)); \ + __m256 tmp1 = \ + 
_mm256_load##_al_##ps(reinterpret_cast(input + i + 4)); \ + __m256 tmp2 = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 8)); \ + __m256 tmp3 = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 12)); \ + \ + /* convert */ \ + const __m256i tmpi = pack_sc32_4x(tmp0, tmp1, tmp2, tmp3, scalar); \ + \ + /* store to output */ \ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + j), tmpi); \ + } + + size_t i = 0; + + // dispatch according to alignment + if ((size_t(input) & 0xf) == 0) { + convert_fc32_1_to_sc8_item32_1_nswap_guts(_) + } else { + convert_fc32_1_to_sc8_item32_1_nswap_guts(u_) + } + + // convert remainder + xx_to_item32_sc8(input + i, output + (i / 2), nsamps - i, scale_factor); +} diff --git a/host/lib/convert/avx2_fc64_to_sc16.cpp b/host/lib/convert/avx2_fc64_to_sc16.cpp new file mode 100644 index 0000000000..fde67ef01f --- /dev/null +++ b/host/lib/convert/avx2_fc64_to_sc16.cpp @@ -0,0 +1,179 @@ +// +// Copyright 2024 Ettus Research, a National Instruments Company +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +DECLARE_CONVERTER(fc64, 1, sc16_item32_le, 1, PRIORITY_SIMD) +{ + const fc64_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + +#define convert_fc64_1_to_item32_1_nswap_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256d tmp0 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 0)); \ + __m256d tmp1 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 2)); \ + __m256d tmp2 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 4)); \ + __m256d tmp3 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 6)); \ + \ + /* convert and scale */ \ + __m128i tmpi0 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp0, scalar)); \ + __m128i tmpi1 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp1, scalar)); \ + __m128i tmpi2 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp2, scalar)); \ + __m128i tmpi3 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp3, scalar)); \ + \ + /* Unpack and interleave the results */ \ + __m256i tmpilo = _mm256_set_m128i(tmpi1, tmpi0); \ + __m256i tmpihi = _mm256_set_m128i(tmpi3, tmpi2); \ + \ + /* BEFORE PACKS */ \ + /* Pack and swap 16-bit pairs */ \ + __m256i shuffled_lo = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x20); \ + __m256i shuffled_hi = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x31); \ + \ + /* pack + swap 16-bit pairs */ \ + __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi); \ + \ + /* pack + swap 16-bit pairs */ \ + tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + \ + /* store to output */ \ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); \ + } + + size_t i = 0; + + // dispatch according to alignment + if ((size_t(input) & 0xf) == 0) { + convert_fc64_1_to_item32_1_nswap_guts(_) + } else { + convert_fc64_1_to_item32_1_nswap_guts(u_) + } + + // convert remainder + xx_to_item32_sc16(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc64, 1, sc16_item32_be, 1, PRIORITY_SIMD) +{ + const fc64_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + +#define convert_fc64_1_to_item32_1_bswap_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256d tmp0 = \ + 
_mm256_load##_al_##pd(reinterpret_cast(input + i + 0)); \ + __m256d tmp1 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 2)); \ + __m256d tmp2 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 4)); \ + __m256d tmp3 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 6)); \ + \ + /* convert and scale */ \ + __m128i tmpi0 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp0, scalar)); \ + __m128i tmpi1 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp1, scalar)); \ + __m128i tmpi2 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp2, scalar)); \ + __m128i tmpi3 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp3, scalar)); \ + \ + \ + /* Unpack and interleave the results */ \ + __m256i tmpilo = _mm256_set_m128i(tmpi1, tmpi0); \ + __m256i tmpihi = _mm256_set_m128i(tmpi3, tmpi2); \ + \ + /* BEFORE PACKS */ \ + /* Pack and swap 16-bit pairs */ \ + __m256i shuffled_lo = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x20); \ + __m256i shuffled_hi = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x31); \ + \ + /* pack + swap 16-bit pairs */ \ + __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi); \ + tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); \ + \ + /* store to output */ \ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); \ + } + + size_t i = 0; + + // dispatch according to alignment + if ((size_t(input) & 0xf) == 0) { + convert_fc64_1_to_item32_1_bswap_guts(_) + } else { + convert_fc64_1_to_item32_1_bswap_guts(u_) + } + + // convert remainder + xx_to_item32_sc16(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc64, 1, sc16_chdr, 1, PRIORITY_SIMD) +{ + const fc64_t* input = reinterpret_cast(inputs[0]); + sc16_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + +#define convert_fc64_1_to_chdr_1_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256d tmp0 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 0)); \ + __m256d tmp1 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 2)); \ + __m256d tmp2 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 4)); \ + __m256d tmp3 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 6)); \ + \ + /* convert and scale */ \ + __m128i tmpi0 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp0, scalar)); \ + __m128i tmpi1 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp1, scalar)); \ + __m128i tmpi2 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp2, scalar)); \ + __m128i tmpi3 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp3, scalar)); \ + \ + /* Unpack and interleave the results */ \ + __m256i tmpilo = _mm256_set_m128i(tmpi1, tmpi0); \ + __m256i tmpihi = _mm256_set_m128i(tmpi3, tmpi2); \ + \ + /* BEFORE PACKS */ \ + /* Pack and swap 16-bit pairs */ \ + __m256i shuffled_lo = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x20); \ + __m256i shuffled_hi = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x31); \ + \ + /* pack + swap 16-bit pairs */ \ + __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi); \ + \ + /* store to output */ \ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); \ + } + + size_t i = 0; + + // dispatch according to alignment + if ((size_t(input) & 0xf) == 0) { + convert_fc64_1_to_chdr_1_guts(_) + } else { + convert_fc64_1_to_chdr_1_guts(u_) + } + + // convert remainder + xx_to_chdr_sc16(input + i, output + i, nsamps - i, scale_factor); +} diff --git a/host/lib/convert/avx2_fc64_to_sc8.cpp b/host/lib/convert/avx2_fc64_to_sc8.cpp new file mode 100644 index 0000000000..2f31d0cf8c --- /dev/null +++ 
b/host/lib/convert/avx2_fc64_to_sc8.cpp @@ -0,0 +1,135 @@ +// +// Copyright 2024 Ettus Research, a National Instruments Company +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +UHD_INLINE __m256i pack_sc8_item32_4x( + const __m256i& in0, const __m256i& in1, const __m256i& in2, const __m256i& in3) +{ + const __m256i shuffled_in0_lo = _mm256_permute2x128_si256(in0, in1, 0x20); + const __m256i shuffled_in0_hi = _mm256_permute2x128_si256(in0, in1, 0x31); + const __m256i shuffled_in1_lo = _mm256_permute2x128_si256(in2, in3, 0x20); + const __m256i shuffled_in1_hi = _mm256_permute2x128_si256(in2, in3, 0x31); + + const __m256i lo = _mm256_packs_epi32(shuffled_in0_lo, shuffled_in0_hi); + const __m256i hi = _mm256_packs_epi32(shuffled_in1_lo, shuffled_in1_hi); + return _mm256_packs_epi16(lo, hi); +} + +UHD_INLINE __m256i pack_sc32_4x( + const __m256d& lo, const __m256d& hi, const __m256d& scalar) +{ + const __m128i tmpi_lo = _mm256_cvttpd_epi32(_mm256_mul_pd(hi, scalar)); + const __m128i tmpi_hi = _mm256_cvttpd_epi32(_mm256_mul_pd(lo, scalar)); + + return _mm256_set_m128i(tmpi_hi, tmpi_lo); +} + +DECLARE_CONVERTER(fc64, 1, sc8_item32_be, 1, PRIORITY_SIMD) +{ + const fc64_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + +#define convert_fc64_1_to_sc8_item32_1_bswap_guts(_al_) \ + for (size_t j = 0; i + 15 < nsamps; i += 16, j += 8) { \ + /* load from input */ \ + __m256d tmp0 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 0)); \ + __m256d tmp1 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 2)); \ + __m256d tmp2 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 4)); \ + __m256d tmp3 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 6)); \ + __m256d tmp4 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 8)); \ + __m256d tmp5 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 10)); \ + __m256d tmp6 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 12)); \ + __m256d tmp7 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 14)); \ + \ + /* interleave */ \ + const __m256i tmpi = pack_sc8_item32_4x(pack_sc32_4x(tmp1, tmp0, scalar), \ + pack_sc32_4x(tmp3, tmp2, scalar), \ + pack_sc32_4x(tmp5, tmp4, scalar), \ + pack_sc32_4x(tmp7, tmp6, scalar)); \ + \ + /* store to output */ \ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + j), tmpi); \ + } + + size_t i = 0; + + // dispatch according to alignment + if ((size_t(input) & 0xf) == 0) { + convert_fc64_1_to_sc8_item32_1_bswap_guts(_) + } else { + convert_fc64_1_to_sc8_item32_1_bswap_guts(u_) + } + + // convert remainder + xx_to_item32_sc8(input + i, output + (i / 2), nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc64, 1, sc8_item32_le, 1, PRIORITY_SIMD) +{ + const fc64_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + +#define convert_fc64_1_to_sc8_item32_1_nswap_guts(_al_) \ + for (size_t j = 0; i + 15 < nsamps; i += 16, j += 8) { \ + /* load from input */ \ + __m256d tmp0 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 0)); \ + __m256d tmp1 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 2)); \ + __m256d tmp2 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 4)); \ + __m256d tmp3 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 6)); \ + __m256d tmp4 = \ + 
_mm256_load##_al_##pd(reinterpret_cast(input + i + 8)); \ + __m256d tmp5 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 10)); \ + __m256d tmp6 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 12)); \ + __m256d tmp7 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 14)); \ + \ + /* interleave */ \ + __m256i tmpi = pack_sc8_item32_4x(pack_sc32_4x(tmp0, tmp1, scalar), \ + pack_sc32_4x(tmp2, tmp3, scalar), \ + pack_sc32_4x(tmp4, tmp5, scalar), \ + pack_sc32_4x(tmp6, tmp7, scalar)); \ + tmpi = _mm256_or_si256( \ + _mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); /*byteswap*/ \ + \ + /* store to output */ \ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + j), tmpi); \ + } + + size_t i = 0; + + // dispatch according to alignment + if ((size_t(input) & 0xf) == 0) { + convert_fc64_1_to_sc8_item32_1_nswap_guts(_) + } else { + convert_fc64_1_to_sc8_item32_1_nswap_guts(u_) + } + + // convert remainder + xx_to_item32_sc8(input + i, output + (i / 2), nsamps - i, scale_factor); +} diff --git a/host/lib/convert/avx2_sc16_to_fc32.cpp b/host/lib/convert/avx2_sc16_to_fc32.cpp new file mode 100644 index 0000000000..509d7583c3 --- /dev/null +++ b/host/lib/convert/avx2_sc16_to_fc32.cpp @@ -0,0 +1,194 @@ +// +// Copyright 2024 Ettus Research, a National Instruments Company +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +DECLARE_CONVERTER(sc16_item32_le, 1, fc32, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + fc32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor) / (1 << 16)); + const __m256i zeroi = _mm256_setzero_si256(); + +// this macro converts values faster by using SSE intrinsics to convert 4 values at a time +#define convert_item32_1_to_fc32_1_nswap_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256i tmpi = \ + _mm256_loadu_si256(reinterpret_cast(input + i)); \ + \ + /* unpack + swap 16-bit pairs */ \ + tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + \ + __m256i tmpilo = \ + _mm256_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \ + __m256i tmpihi = _mm256_unpackhi_epi16(zeroi, tmpi); \ + \ + __m256i shuffled_lo = \ + _mm256_permute2x128_si256(tmpilo, tmpihi, 0x20); /* lower 128-bit */ \ + __m256i shuffled_hi = \ + _mm256_permute2x128_si256(tmpilo, tmpihi, 0x31); /* upper 128-bit */ \ + \ + /* convert and scale */ \ + __m256 tmplo = _mm256_mul_ps(_mm256_cvtepi32_ps(shuffled_lo), scalar); \ + __m256 tmphi = _mm256_mul_ps(_mm256_cvtepi32_ps(shuffled_hi), scalar); \ + \ + /* store to output */ \ + _mm256_storeu_ps(reinterpret_cast(output + i + 0), tmplo); \ + _mm256_storeu_ps(reinterpret_cast(output + i + 4), tmphi); \ + } + + size_t i = 0; + + // need to dispatch according to alignment for fastest conversion + switch (size_t(output) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + convert_item32_1_to_fc32_1_nswap_guts(_) break; + case 0x8: + // the first sample is 8-byte aligned - process it to align the remainder of + // the samples to 16-bytes + item32_sc16_to_xx(input, output, 1, scale_factor); + i++; + // do faster processing of the bulk of the samples now that we are 16-byte + // aligned + convert_item32_1_to_fc32_1_nswap_guts(_) break; + default: + // we are not 8 or 16-byte aligned, so do fast 
processing with the unaligned + // load and store + convert_item32_1_to_fc32_1_nswap_guts(u_) + } + + // convert any remaining samples + item32_sc16_to_xx(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(sc16_item32_be, 1, fc32, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + fc32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor) / (1 << 16)); + const __m256i zeroi = _mm256_setzero_si256(); + +// this macro converts values faster by using SSE intrinsics to convert 4 values at a time +#define convert_item32_1_to_fc32_1_bswap_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); \ + \ + /* byteswap + unpack -> byteswap 16 bit words */ \ + tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); \ + \ + __m256i tmpilo = _mm256_unpacklo_epi16(zeroi, tmpi); \ + __m256i tmpihi = _mm256_unpackhi_epi16(zeroi, tmpi); \ + \ + __m256i shuffled_lo = \ + _mm256_permute2x128_si256(tmpilo, tmpihi, 0x20); /* lower 128-bit */ \ + __m256i shuffled_hi = \ + _mm256_permute2x128_si256(tmpilo, tmpihi, 0x31); /* upper 128-bit */ \ + \ + /* convert and scale */ \ + __m256 tmplo = _mm256_mul_ps(_mm256_cvtepi32_ps(shuffled_lo), scalar); \ + __m256 tmphi = _mm256_mul_ps(_mm256_cvtepi32_ps(shuffled_hi), scalar); \ + \ + /* store to output */ \ + _mm256_storeu_ps(reinterpret_cast(output + i + 0), tmplo); \ + _mm256_storeu_ps(reinterpret_cast(output + i + 4), tmphi); \ + } + + size_t i = 0; + + // need to dispatch according to alignment for fastest conversion + switch (size_t(output) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + convert_item32_1_to_fc32_1_bswap_guts(_) break; + case 0x8: + // the first sample is 8-byte aligned - process it to align the remainder of + // the samples to 16-bytes + item32_sc16_to_xx(input, output, 1, scale_factor); + i++; + // do faster processing of the bulk of the samples now that we are 16-byte + // aligned + convert_item32_1_to_fc32_1_bswap_guts(_) break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load and store + convert_item32_1_to_fc32_1_bswap_guts(u_) + } + + // convert any remaining samples + item32_sc16_to_xx(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(sc16_chdr, 1, fc32, 1, PRIORITY_SIMD) +{ + const sc16_t* input = reinterpret_cast(inputs[0]); + fc32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor) / (1 << 16)); + const __m256i zeroi = _mm256_setzero_si256(); + +// this macro converts values faster by using SSE intrinsics to convert 4 values at a time +#define convert_item32_1_to_fc32_1_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256i tmpi = \ + _mm256_loadu_si256(reinterpret_cast(input + i)); \ + \ + /* unpack + swap 16-bit pairs */ \ + __m256i tmpilo = \ + _mm256_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \ + __m256i tmpihi = _mm256_unpackhi_epi16(zeroi, tmpi); \ + \ + __m256i shuffled_lo = \ + _mm256_permute2x128_si256(tmpilo, tmpihi, 0x20); /* lower 128-bit */ \ + __m256i shuffled_hi = \ + _mm256_permute2x128_si256(tmpilo, tmpihi, 0x31); /* upper 128-bit */ \ + \ + /* convert and scale */ \ + __m256 tmplo = _mm256_mul_ps(_mm256_cvtepi32_ps(shuffled_lo), scalar); \ + __m256 tmphi = 
_mm256_mul_ps(_mm256_cvtepi32_ps(shuffled_hi), scalar); \ + \ + /* store to output */ \ + _mm256_storeu_ps(reinterpret_cast(output + i + 0), tmplo); \ + _mm256_storeu_ps(reinterpret_cast(output + i + 4), tmphi); \ + } + + size_t i = 0; + + // need to dispatch according to alignment for fastest conversion + switch (size_t(output) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + convert_item32_1_to_fc32_1_guts(_) break; + case 0x8: + // the first sample is 8-byte aligned - process it to align the remainder of + // the samples to 16-bytes + chdr_sc16_to_xx(input, output, 1, scale_factor); + i++; + // do faster processing of the bulk of the samples now that we are 16-byte + // aligned + convert_item32_1_to_fc32_1_guts(_) break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load and store + convert_item32_1_to_fc32_1_guts(u_) + } + + // convert any remaining samples + chdr_sc16_to_xx(input + i, output + i, nsamps - i, scale_factor); +} diff --git a/host/lib/convert/avx2_sc16_to_fc64.cpp b/host/lib/convert/avx2_sc16_to_fc64.cpp new file mode 100644 index 0000000000..c3e9e3371a --- /dev/null +++ b/host/lib/convert/avx2_sc16_to_fc64.cpp @@ -0,0 +1,165 @@ +// +// Copyright 2011-2012 Ettus Research LLC +// Copyright 2018 Ettus Research, a National Instruments Company +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +DECLARE_CONVERTER(sc16_item32_le, 1, fc64, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + fc64_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor / (1 << 16)); + const __m256i zeroi = _mm256_setzero_si256(); + +#define convert_item32_1_to_fc64_1_nswap_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); \ + \ + /* unpack + swap 16-bit pairs */ \ + tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + __m256i tmpilo = _mm256_unpacklo_epi16(zeroi, tmpi); \ + __m256i tmpihi = _mm256_unpackhi_epi16(zeroi, tmpi); \ + \ + __m128i tmpilo_lo = _mm256_castsi256_si128(tmpilo); \ + __m128i tmpilo_hi = _mm256_extracti128_si256(tmpilo, 1); \ + __m128i tmpihi_lo = _mm256_castsi256_si128(tmpihi); \ + __m128i tmpihi_hi = _mm256_extracti128_si256(tmpihi, 1); \ + \ + /* convert and scale */ \ + __m256d tmp0 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpilo_lo), scalar); \ + tmpilo = _mm256_unpackhi_epi64(tmpilo, zeroi); \ + __m256d tmp1 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpihi_lo), scalar); \ + __m256d tmp2 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpilo_hi), scalar); \ + tmpihi = _mm256_unpackhi_epi64(tmpihi, zeroi); \ + __m256d tmp3 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpihi_hi), scalar); \ + \ + /* store to output */ \ + _mm256_storeu_pd(reinterpret_cast(output + i + 0), tmp0); \ + _mm256_storeu_pd(reinterpret_cast(output + i + 2), tmp1); \ + _mm256_storeu_pd(reinterpret_cast(output + i + 4), tmp2); \ + _mm256_storeu_pd(reinterpret_cast(output + i + 6), tmp3); \ + } + + size_t i = 0; + + // dispatch according to alignment + if ((size_t(output) & 0xf) == 0) { + convert_item32_1_to_fc64_1_nswap_guts(_) + } else { + convert_item32_1_to_fc64_1_nswap_guts(u_) + } + + // convert remainder + item32_sc16_to_xx(input + i, output + i, nsamps - i, scale_factor); +} + 
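An aside on the idiom used in these sc16-to-float converters: _mm256_unpacklo_epi16(zeroi, tmpi) and _mm256_unpackhi_epi16(zeroi, tmpi) place each 16-bit sample in the upper half of a 32-bit slot, i.e. they yield sample * 2^16 with the sign preserved, which is why the scalar is built as scale_factor / (1 << 16). A standalone sketch that shows the effect (not part of this patch; compile with -mavx2):

    #include <immintrin.h>
    #include <cstdio>

    int main()
    {
        const __m256i zero = _mm256_setzero_si256();
        // 16 signed 16-bit samples, including negative values.
        const __m256i samp = _mm256_setr_epi16(
            1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12, 13, -14, 15, -16);

        // Interleave zeros below each sample: every 32-bit slot now holds sample * 2^16.
        alignas(32) int lo[8];
        _mm256_store_si256(
            reinterpret_cast<__m256i*>(lo), _mm256_unpacklo_epi16(zero, samp));

        // Dividing by 2^16 recovers the samples; the converters fold this factor
        // into their scalar instead (scale_factor / (1 << 16)).
        for (int k = 0; k < 8; k++)
            std::printf("%d ", lo[k] / 65536);
        std::printf("\n"); // prints: 1 -2 3 -4 9 -10 11 -12 (low four samples of each 128-bit lane)
        return 0;
    }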
+DECLARE_CONVERTER(sc16_item32_be, 1, fc64, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + fc64_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor / (1 << 16)); + const __m256i zeroi = _mm256_setzero_si256(); + +#define convert_item32_1_to_fc64_1_bswap_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); \ + \ + /* byteswap + unpack -> byteswap 16 bit words */ \ + tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); \ + __m256i tmpilo = _mm256_unpacklo_epi16(zeroi, tmpi); \ + __m256i tmpihi = _mm256_unpackhi_epi16(zeroi, tmpi); \ + \ + __m128i tmpilo_lo = _mm256_castsi256_si128(tmpilo); \ + __m128i tmpilo_hi = _mm256_extracti128_si256(tmpilo, 1); \ + __m128i tmpihi_lo = _mm256_castsi256_si128(tmpihi); \ + __m128i tmpihi_hi = _mm256_extracti128_si256(tmpihi, 1); \ + \ + /* convert and scale */ \ + __m256d tmp0 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpilo_lo), scalar); \ + tmpilo = _mm256_unpackhi_epi64(tmpilo, zeroi); \ + __m256d tmp1 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpihi_lo), scalar); \ + __m256d tmp2 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpilo_hi), scalar); \ + tmpihi = _mm256_unpackhi_epi64(tmpihi, zeroi); \ + __m256d tmp3 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpihi_hi), scalar); \ + \ + /* store to output */ \ + _mm256_storeu_pd(reinterpret_cast(output + i + 0), tmp0); \ + _mm256_storeu_pd(reinterpret_cast(output + i + 2), tmp1); \ + _mm256_storeu_pd(reinterpret_cast(output + i + 4), tmp2); \ + _mm256_storeu_pd(reinterpret_cast(output + i + 6), tmp3); \ + } + + size_t i = 0; + + // dispatch according to alignment + if ((size_t(output) & 0xf) == 0) { + convert_item32_1_to_fc64_1_bswap_guts(_) + } else { + convert_item32_1_to_fc64_1_bswap_guts(u_) + } + + // convert remainder + item32_sc16_to_xx(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(sc16_chdr, 1, fc64, 1, PRIORITY_SIMD) +{ + const sc16_t* input = reinterpret_cast(inputs[0]); + fc64_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor / (1 << 16)); + const __m256i zeroi = _mm256_setzero_si256(); + +#define convert_chdr_1_to_fc64_1_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); \ + \ + /* unpack 16-bit pairs */ \ + __m256i tmpilo = _mm256_unpacklo_epi16(zeroi, tmpi); \ + __m256i tmpihi = _mm256_unpackhi_epi16(zeroi, tmpi); \ + \ + __m128i tmpilo_lo = _mm256_castsi256_si128(tmpilo); \ + __m128i tmpilo_hi = _mm256_extracti128_si256(tmpilo, 1); \ + __m128i tmpihi_lo = _mm256_castsi256_si128(tmpihi); \ + __m128i tmpihi_hi = _mm256_extracti128_si256(tmpihi, 1); \ + \ + /* convert and scale */ \ + __m256d tmp0 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpilo_lo), scalar); \ + tmpilo = _mm256_unpackhi_epi64(tmpilo, zeroi); \ + __m256d tmp1 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpihi_lo), scalar); \ + __m256d tmp2 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpilo_hi), scalar); \ + tmpihi = _mm256_unpackhi_epi64(tmpihi, zeroi); \ + __m256d tmp3 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpihi_hi), scalar); \ + \ + /* store to output */ \ + _mm256_storeu_pd(reinterpret_cast(output + i + 0), tmp0); \ + _mm256_storeu_pd(reinterpret_cast(output + i + 2), tmp1); \ + _mm256_storeu_pd(reinterpret_cast(output + i + 4), tmp2); \ + _mm256_storeu_pd(reinterpret_cast(output + i + 6), tmp3); \ + } + + size_t i = 0; + + // 
dispatch according to alignment + if ((size_t(output) & 0xf) == 0) { + convert_chdr_1_to_fc64_1_guts(_) + } else { + convert_chdr_1_to_fc64_1_guts(u_) + } + + // convert remainder + chdr_sc16_to_xx(input + i, output + i, nsamps - i, scale_factor); +} diff --git a/host/lib/convert/avx2_sc16_to_sc16.cpp b/host/lib/convert/avx2_sc16_to_sc16.cpp new file mode 100644 index 0000000000..62fcb1455e --- /dev/null +++ b/host/lib/convert/avx2_sc16_to_sc16.cpp @@ -0,0 +1,210 @@ +// +// Copyright 2024 Ettus Research, a National Instruments Company +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +// +// SSE 16-bit pair swap +// +// Valid alignment macro arguments are 'u_' and '_' for unaligned and aligned +// access respectively. Macro operates on 4 complex 16-bit integers at a time. +// +// ----------------- +// | A | B | C | D | Input +// ----------------- +// 0 1 2 3 Address +// ----------------- +// | C | D | A | B | Output +// ----------------- +// +#define CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_ialign_, _oalign_) \ + for (; i + 7 < nsamps; i += 8) { \ + __m256i m0; \ + \ + /* load from input */ \ + m0 = _mm256_loadu_si256((const __m256i*)(input + i)); \ + \ + /* swap 16-bit pairs */ \ + m0 = _mm256_shufflelo_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1)); \ + m0 = _mm256_shufflehi_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1)); \ + \ + /* store to output */ \ + _mm256_storeu_si256((__m256i*)(output + i), m0); \ + } + +// +// SSE byte swap +// +// Valid alignment macro arguments are 'u_' and '_' for unaligned and aligned +// access respectively. Macro operates on 4 complex 16-bit integers at a time. +// +// ----------------- +// | A | B | C | D | Input +// ----------------- +// 0 1 2 3 Address +// ----------------- +// | B | A | D | C | Output +// ----------------- +// +#define CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_ialign_, _oalign_) \ + for (; i + 7 < nsamps; i += 8) { \ + __m256i m0, m1, m2; \ + \ + /* load from input */ \ + m0 = _mm256_loadu_si256((const __m256i*)(input + i)); \ + \ + /* byteswap 16 bit words */ \ + m1 = _mm256_srli_epi16(m0, 8); \ + m2 = _mm256_slli_epi16(m0, 8); \ + m0 = _mm256_or_si256(m1, m2); \ + \ + /* store to output */ \ + _mm256_storeu_si256((__m256i*)(output + i), m0); \ + } + +DECLARE_CONVERTER(sc16, 1, sc16_item32_le, 1, PRIORITY_SIMD) +{ + const sc16_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + size_t i = 0; + + // need to dispatch according to alignment for fastest conversion + switch (size_t(input) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_, u_) + break; + case 0x8: + if (nsamps < 2) + break; + // the first sample is 8-byte aligned - process it to align the remainder of + // the samples to 16-bytes + xx_to_item32_sc16(input, output, 2, 1.0); + i += 2; + CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_, u_) + // do faster processing of the bulk of the samples now that we are 16-byte + // aligned + break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load + CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_, u_) + } + + // convert any remaining samples + xx_to_item32_sc16(input + i, output + i, nsamps - i, 1.0); +} + +DECLARE_CONVERTER(sc16, 1, sc16_item32_be, 1, PRIORITY_SIMD) +{ + const sc16_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + size_t i = 0; + + // need to 
dispatch according to alignment for fastest conversion + switch (size_t(input) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_, u_) + break; + case 0x8: + if (nsamps < 2) + break; + // the first value is 8-byte aligned - process it and prepare the bulk of the + // data for fast conversion + xx_to_item32_sc16(input, output, 2, 1.0); + i += 2; + // do faster processing of the remaining samples now that we are 16-byte + // aligned + CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_, u_) + break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load + CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_, u_) + } + + // convert any remaining samples + xx_to_item32_sc16(input + i, output + i, nsamps - i, 1.0); +} + +DECLARE_CONVERTER(sc16_item32_le, 1, sc16, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + sc16_t* output = reinterpret_cast(outputs[0]); + + size_t i = 0; + + // need to dispatch according to alignment for fastest conversion + switch (size_t(output) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_, _) + break; + case 0x8: + if (nsamps < 2) + break; + // the first sample is 8-byte aligned - process it to align the remainder of + // the samples to 16-bytes + item32_sc16_to_xx(input, output, 2, 1.0); + i += 2; + // do faster processing of the bulk of the samples now that we are 16-byte + // aligned + CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_, _) + break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load and store + CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_, u_) + } + + // convert any remaining samples + item32_sc16_to_xx(input + i, output + i, nsamps - i, 1.0); +} + +DECLARE_CONVERTER(sc16_item32_be, 1, sc16, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + sc16_t* output = reinterpret_cast(outputs[0]); + + size_t i = 0; + + // need to dispatch according to alignment for fastest conversion + switch (size_t(output) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_, _) + break; + case 0x8: + if (nsamps < 2) + break; + // the first sample is 8-byte aligned - process it to align the remainder of + // the samples to 16-bytes + item32_sc16_to_xx(input, output, 2, 1.0); + i += 2; + // do faster processing of the bulk of the samples now that we are 16-byte + // aligned + CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_, _) + break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load and store + CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_, u_) + } + + // convert any remaining samples + item32_sc16_to_xx(input + i, output + i, nsamps - i, 1.0); +} diff --git a/host/lib/convert/avx2_sc8_to_fc32.cpp b/host/lib/convert/avx2_sc8_to_fc32.cpp new file mode 100644 index 0000000000..2d73444e34 --- /dev/null +++ b/host/lib/convert/avx2_sc8_to_fc32.cpp @@ -0,0 +1,122 @@ +// +// Copyright 2012-2013 Ettus Research LLC +// Copyright 2018 Ettus Research, a National Instruments Company +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +static const __m256i zeroi = _mm256_setzero_si256(); + +template +UHD_INLINE void unpack_sc32_4x(const __m256i& in, + __m256& 
out0, + __m256& out1, + __m256& out2, + __m256& out3, + const __m256& scalar) +{ + const __m256i tmplo = _mm256_unpacklo_epi8(zeroi, in); /* value in upper 8 bits */ + __m256i tmp0 = _mm256_shuffle_epi32( + _mm256_unpacklo_epi16(zeroi, tmplo), shuf); /* value in upper 16 bits */ + __m256i tmp1 = _mm256_shuffle_epi32(_mm256_unpackhi_epi16(zeroi, tmplo), shuf); + out0 = _mm256_mul_ps(_mm256_cvtepi32_ps(tmp0), scalar); + out1 = _mm256_mul_ps(_mm256_cvtepi32_ps(tmp1), scalar); + + const __m256i tmphi = _mm256_unpackhi_epi8(zeroi, in); + __m256i tmp2 = _mm256_shuffle_epi32(_mm256_unpacklo_epi16(zeroi, tmphi), shuf); + __m256i tmp3 = _mm256_shuffle_epi32(_mm256_unpackhi_epi16(zeroi, tmphi), shuf); + out2 = _mm256_mul_ps(_mm256_cvtepi32_ps(tmp2), scalar); + out3 = _mm256_mul_ps(_mm256_cvtepi32_ps(tmp3), scalar); +} + +DECLARE_CONVERTER(sc8_item32_be, 1, fc32, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast(size_t(inputs[0]) & ~0x3); + fc32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor) / (1 << 24)); + const int shuf = _MM_SHUFFLE(3, 2, 1, 0); + + size_t i = 0, j = 0; + size_t num_samps = nsamps; + + if ((size_t(inputs[0]) & 0x3) != 0) { + item32_sc8_to_xx(input++, output++, 1, scale_factor); + num_samps--; + } + +#define convert_sc8_item32_1_to_fc32_1_bswap_guts(_al_) \ + for (; j + 15 < num_samps; j += 16, i += 8) { \ + /* load from input */ \ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); \ + \ + /* unpack + swap 8-bit pairs */ \ + __m256 tmp0, tmp1, tmp2, tmp3; \ + unpack_sc32_4x(tmpi, tmp0, tmp1, tmp2, tmp3, scalar); \ + \ + /* store to output */ \ + _mm256_storeu_ps(reinterpret_cast(output + j + 0), tmp0); \ + _mm256_storeu_ps(reinterpret_cast(output + j + 4), tmp1); \ + _mm256_storeu_ps(reinterpret_cast(output + j + 8), tmp2); \ + _mm256_storeu_ps(reinterpret_cast(output + j + 12), tmp3); \ + } + + // dispatch according to alignment + if ((size_t(output) & 0xf) == 0) { + convert_sc8_item32_1_to_fc32_1_bswap_guts(_) + } else { + convert_sc8_item32_1_to_fc32_1_bswap_guts(u_) + } + + // convert remainder + item32_sc8_to_xx(input + i, output + j, num_samps - j, scale_factor); +} + +DECLARE_CONVERTER(sc8_item32_le, 1, fc32, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast(size_t(inputs[0]) & ~0x3); + fc32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor) / (1 << 24)); + const int shuf = _MM_SHUFFLE(0, 1, 2, 3); + + size_t i = 0, j = 0; + size_t num_samps = nsamps; + + if ((size_t(inputs[0]) & 0x3) != 0) { + item32_sc8_to_xx(input++, output++, 1, scale_factor); + num_samps--; + } + +#define convert_sc8_item32_1_to_fc32_1_nswap_guts(_al_) \ + for (; j + 15 < num_samps; j += 16, i += 8) { \ + /* load from input */ \ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); \ + \ + /* unpack + swap 8-bit pairs */ \ + __m256 tmp0, tmp1, tmp2, tmp3; \ + unpack_sc32_4x(tmpi, tmp0, tmp1, tmp2, tmp3, scalar); \ + \ + /* store to output */ \ + _mm256_storeu_ps(reinterpret_cast(output + j + 0), tmp0); \ + _mm256_storeu_ps(reinterpret_cast(output + j + 4), tmp1); \ + _mm256_storeu_ps(reinterpret_cast(output + j + 8), tmp2); \ + _mm256_storeu_ps(reinterpret_cast(output + j + 12), tmp3); \ + } + + // dispatch according to alignment + if ((size_t(output) & 0xf) == 0) { + convert_sc8_item32_1_to_fc32_1_nswap_guts(_) + } else { + convert_sc8_item32_1_to_fc32_1_nswap_guts(u_) + } + + // convert remainder + item32_sc8_to_xx(input + i, output + j, 
+        num_samps - j, scale_factor);
+}
diff --git a/host/tests/convert_test.cpp b/host/tests/convert_test.cpp
index 7fd3f2564f..fd71f9569b 100644
--- a/host/tests/convert_test.cpp
+++ b/host/tests/convert_test.cpp
@@ -1080,7 +1080,7 @@ BOOST_AUTO_TEST_CASE(test_convert_types_fc32_and_sc8)
     }
 }
 
-BOOST_TEST_DECORATOR(*boost::unit_test::disabled())
+// BOOST_TEST_DECORATOR(*boost::unit_test::disabled())
 BOOST_AUTO_TEST_CASE(benchmark_convert_types_fc32_and_sc8)
 {
     SKIP_BENCHMARK_CHECK;
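A closing note on the pattern that recurs throughout the float-to-sc16/sc8 converters above: AVX2 pack instructions operate on each 128-bit lane independently, so _mm256_packs_epi32(a, b) does not produce sequentially ordered output. The converters therefore gather the low and high lanes with _mm256_permute2x128_si256(a, b, 0x20) / (a, b, 0x31) before packing. A standalone sketch that makes the difference visible (not part of this patch; compile with -mavx2):

    #include <immintrin.h>
    #include <cstdio>

    static void print16(const char* tag, __m256i v)
    {
        alignas(32) short buf[16];
        _mm256_store_si256(reinterpret_cast<__m256i*>(buf), v);
        std::printf("%s:", tag);
        for (int k = 0; k < 16; k++)
            std::printf(" %d", buf[k]);
        std::printf("\n");
    }

    int main()
    {
        const __m256i a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        const __m256i b = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15);

        // Per-lane pack: 0..3 8..11 4..7 12..15 -- the two inputs end up interleaved.
        print16("packs only    ", _mm256_packs_epi32(a, b));

        // Gather the low lanes and the high lanes first, then pack: 0..15 in order.
        const __m256i lo = _mm256_permute2x128_si256(a, b, 0x20);
        const __m256i hi = _mm256_permute2x128_si256(a, b, 0x31);
        print16("permute + pack", _mm256_packs_epi32(lo, hi));
        return 0;
    }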