//
// Copyright 2024 Ettus Research, a National Instruments Brand
//
// SPDX-License-Identifier: GPL-3.0-or-later
//

#include "convert_common.hpp"
#include <uhd/utils/byteswap.hpp>
#include <immintrin.h>

using namespace uhd::convert;

DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD)
{
    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
    item32_t* output    = reinterpret_cast<item32_t*>(outputs[0]);

    const __m256 scalar = _mm256_set1_ps(float(scale_factor));

// this macro converts values faster by using AVX2 intrinsics to convert 8 values at a
// time
#define convert_fc32_1_to_item32_1_nswap_guts(_al_)                               \
    for (; i + 7 < nsamps; i += 8) {                                              \
        /* load from input */                                                     \
        __m256 tmplo =                                                            \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0)); \
        __m256 tmphi =                                                            \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 4)); \
                                                                                  \
        /* convert and scale */                                                   \
        __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar));        \
        __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar));        \
                                                                                  \
        __m256i shuffled_lo = _mm256_permute2x128_si256(                          \
            tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */       \
        __m256i shuffled_hi = _mm256_permute2x128_si256(                          \
            tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */       \
                                                                                  \
        /* now pack the shuffled data sequentially */                             \
        __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi);              \
                                                                                  \
        /* swap the 16-bit values within each 32-bit item */                      \
        tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));             \
        tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));             \
                                                                                  \
        /* store to output */                                                     \
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi);        \
    }
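
// Note on the shuffles in the macro above: _mm256_packs_epi32 packs each 128-bit
// lane independently, so packing tmpilo/tmpihi directly would interleave samples
// 0-3 and 4-7. The two _mm256_permute2x128_si256 calls regroup the lower and upper
// lanes first so the packed 16-bit samples come out in sequential order, and the
// shufflelo/shufflehi pair then swaps the 16-bit values within each 32-bit item to
// match the item32 ordering produced by the scalar uhd::htowx path below.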

    size_t i = 0;

    // need to dispatch according to alignment for fastest conversion; the aligned
    // 256-bit loads require 32-byte alignment
    switch (size_t(input) & 0x1f) {
        case 0x0:
            // the data is 32-byte aligned, so do the fast processing of the bulk of the
            // samples
            convert_fc32_1_to_item32_1_nswap_guts(_) break;
        case 0x8:
        case 0x10:
        case 0x18:
            // the data is only 8-byte aligned - process single samples until the
            // remainder of the input is aligned to 32 bytes
            while (i < nsamps && (size_t(input + i) & 0x1f) != 0) {
                xx_to_item32_sc16<uhd::htowx>(input + i, output + i, 1, scale_factor);
                i++;
            }
            // do faster processing of the bulk of the samples now that we are 32-byte
            // aligned
            convert_fc32_1_to_item32_1_nswap_guts(_) break;
        default:
            // we are not 8-byte aligned, so do fast processing with the unaligned load
            convert_fc32_1_to_item32_1_nswap_guts(u_)
    }

    // convert any remaining samples
    xx_to_item32_sc16<uhd::htowx>(input + i, output + i, nsamps - i, scale_factor);
}

DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD)
{
    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
    item32_t* output    = reinterpret_cast<item32_t*>(outputs[0]);

    const __m256 scalar = _mm256_set1_ps(float(scale_factor));

// this macro converts values faster by using AVX2 intrinsics to convert 8 values at a
// time
#define convert_fc32_1_to_item32_1_bswap_guts(_al_)                                     \
    for (; i + 7 < nsamps; i += 8) {                                                    \
        /* load from input */                                                           \
        __m256 tmplo =                                                                  \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0));       \
        __m256 tmphi =                                                                  \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 4));       \
                                                                                        \
        /* convert and scale */                                                         \
        __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar));              \
        __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar));              \
                                                                                        \
        __m256i shuffled_lo = _mm256_permute2x128_si256(                                \
            tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */             \
        __m256i shuffled_hi = _mm256_permute2x128_si256(                                \
            tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */             \
                                                                                        \
        /* now pack the shuffled data sequentially */                                   \
        __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi);                    \
                                                                                        \
        tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); \
                                                                                        \
        /* store to output */                                                           \
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi);              \
    }
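
// The shift/or sequence in the macro above swaps the two bytes of every 16-bit
// value; together with the sample order produced by the pack, this yields the same
// big-endian item32 layout as the scalar uhd::htonx path below.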

    size_t i = 0;

    // need to dispatch according to alignment for fastest conversion; the aligned
    // 256-bit loads require 32-byte alignment
    switch (size_t(input) & 0x1f) {
        case 0x0:
            // the data is 32-byte aligned, so do the fast processing of the bulk of the
            // samples
            convert_fc32_1_to_item32_1_bswap_guts(_) break;
        case 0x8:
        case 0x10:
        case 0x18:
            // the data is only 8-byte aligned - process single samples until the
            // remainder of the input is aligned to 32 bytes
            while (i < nsamps && (size_t(input + i) & 0x1f) != 0) {
                xx_to_item32_sc16<uhd::htonx>(input + i, output + i, 1, scale_factor);
                i++;
            }
            // do faster processing of the bulk of the samples now that we are 32-byte
            // aligned
            convert_fc32_1_to_item32_1_bswap_guts(_) break;
        default:
            // we are not 8-byte aligned, so do fast processing with the unaligned load
            convert_fc32_1_to_item32_1_bswap_guts(u_)
    }

    // convert any remaining samples
    xx_to_item32_sc16<uhd::htonx>(input + i, output + i, nsamps - i, scale_factor);
}

DECLARE_CONVERTER(fc32, 1, sc16_chdr, 1, PRIORITY_SIMD)
{
    const fc32_t* input = reinterpret_cast<const fc32_t*>(inputs[0]);
    sc16_t* output      = reinterpret_cast<sc16_t*>(outputs[0]);

    const __m256 scalar = _mm256_set1_ps(float(scale_factor));

// this macro converts values faster by using AVX2 intrinsics to convert 8 values at a
// time
#define convert_fc32_1_to_item32_1_guts(_al_)                                      \
    for (; i + 7 < nsamps; i += 8) {                                               \
        /* load from input */                                                      \
        __m256 tmplo =                                                             \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 0));  \
        __m256 tmphi =                                                             \
            _mm256_load##_al_##ps(reinterpret_cast<const float*>(input + i + 4));  \
                                                                                   \
        /* convert and scale */                                                    \
        __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar));         \
        __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar));         \
                                                                                   \
        /* _mm256_packs_epi32 packs within each 128-bit lane, so shuffle lanes */  \
        __m256i shuffled_lo = _mm256_permute2x128_si256(                           \
            tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */        \
        __m256i shuffled_hi = _mm256_permute2x128_si256(                           \
            tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */        \
                                                                                   \
        /* now pack the shuffled data sequentially */                              \
        __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi);               \
                                                                                   \
        /* store to output */                                                      \
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi);         \
    }
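
// Unlike the item32 converters above, the CHDR converter performs no byte or word
// swapping: the packed sc16 samples are stored directly in host order.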

    size_t i = 0;

    // need to dispatch according to alignment for fastest conversion; the aligned
    // 256-bit loads require 32-byte alignment
    switch (size_t(input) & 0x1f) {
        case 0x0:
            // the data is 32-byte aligned, so do the fast processing of the bulk of the
            // samples
            convert_fc32_1_to_item32_1_guts(_) break;
        case 0x8:
        case 0x10:
        case 0x18:
            // the data is only 8-byte aligned - process single samples until the
            // remainder of the input is aligned to 32 bytes
            while (i < nsamps && (size_t(input + i) & 0x1f) != 0) {
                xx_to_chdr_sc16(input + i, output + i, 1, scale_factor);
                i++;
            }
            // do faster processing of the bulk of the samples now that we are 32-byte
            // aligned
            convert_fc32_1_to_item32_1_guts(_) break;
        default:
            // we are not 8-byte aligned, so do fast processing with the unaligned load
            convert_fc32_1_to_item32_1_guts(u_)
    }

    // convert any remaining samples
    xx_to_chdr_sc16(input + i, output + i, nsamps - i, scale_factor);
}