diff --git a/host/lib/convert/CMakeLists.txt b/host/lib/convert/CMakeLists.txt index aee62c82f4..7f7d608070 100644 --- a/host/lib/convert/CMakeLists.txt +++ b/host/lib/convert/CMakeLists.txt @@ -9,29 +9,51 @@ # This file included, use CMake directory variables ######################################################################## include(CheckIncludeFileCXX) +include(CheckCXXCompilerFlag) message(STATUS "") ######################################################################## -# Check for SSE2 SIMD headers +# Check for SIMD headers ######################################################################## + +# Check for SSE2 support +check_cxx_compiler_flag("-msse2" SSE2_SUPPORTED) +if(SSE2_SUPPORTED) + message(STATUS "SSE2 is supported") +endif(SSE2_SUPPORTED) + +# Check for SSE3 support +check_cxx_compiler_flag("-msse3" SSE3_SUPPORTED) +if(SSE3_SUPPORTED) + message(STATUS "SSE3 is supported") + set(SSE2_SUPPORTED OFF) +endif(SSE3_SUPPORTED) + +# Check for AVX2 support +check_cxx_compiler_flag("-mavx2" AVX2_SUPPORTED) +# set(AVX2_SUPPORTED OFF) +if(AVX2_SUPPORTED) + message(STATUS "AVX2 is supported") + # set(SSE3_SUPPORTED OFF) +endif(AVX2_SUPPORTED) + +# Check for AVX2 support +check_cxx_compiler_flag("-mavx512" AVX512_SUPPORTED) +if(AVX512_SUPPORTED) + message(STATUS "AVX512 is supported") + set(AVX2_SUPPORTED OFF) +endif(AVX512_SUPPORTED) + if(CMAKE_COMPILER_IS_GNUCXX) - set(EMMINTRIN_FLAGS -msse2) - set(TMMINTRIN_FLAGS -mssse3) + set(SSE2_FLAGS -msse2) + set(SSE3_FLAGS -mssse3) + set(AVX2_FLAGS -mavx2) + set(AVX512_FLAGS -mavx512) elseif(MSVC) - set(EMMINTRIN_FLAGS /arch:SSE2) + set(SSE2_FLAGS /arch:SSE2) endif() -set(CMAKE_REQUIRED_FLAGS ${EMMINTRIN_FLAGS}) -CHECK_INCLUDE_FILE_CXX(emmintrin.h HAVE_EMMINTRIN_H) -unset(CMAKE_REQUIRED_FLAGS) - -if(ENABLE_SSSE3) -set(CMAKE_REQUIRED_FLAGS ${TMMINTRIN_FLAGS}) -CHECK_INCLUDE_FILE_CXX(tmmintrin.h HAVE_TMMINTRIN_H) -unset(CMAKE_REQUIRED_FLAGS) -endif(ENABLE_SSSE3) - -if(HAVE_EMMINTRIN_H) +if(SSE2_SUPPORTED) set(convert_with_sse2_sources ${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc16_to_sc16.cpp ${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc16_to_fc64.cpp @@ -45,22 +67,41 @@ if(HAVE_EMMINTRIN_H) ) set_source_files_properties( ${convert_with_sse2_sources} - PROPERTIES COMPILE_FLAGS "${EMMINTRIN_FLAGS}" + PROPERTIES COMPILE_FLAGS "${SSE2_FLAGS}" ) LIBUHD_APPEND_SOURCES(${convert_with_sse2_sources}) -endif(HAVE_EMMINTRIN_H) +endif(SSE2_SUPPORTED) -if(HAVE_TMMINTRIN_H) +if(SSE3_SUPPORTED) set(convert_with_ssse3_sources ${CMAKE_CURRENT_SOURCE_DIR}/ssse3_pack_sc12.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ssse3_unpack_sc12.cpp ) set_source_files_properties( ${convert_with_ssse3_sources} - PROPERTIES COMPILE_FLAGS "${TMMINTRIN_FLAGS}" + PROPERTIES COMPILE_FLAGS "${SSE3_FLAGS}" ) LIBUHD_APPEND_SOURCES(${convert_with_ssse3_sources}) -endif(HAVE_TMMINTRIN_H) +endif(SSE3_SUPPORTED) + +if(AVX2_SUPPORTED) + set(convert_with_avx2_sources + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc16_to_sc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc16_to_fc64.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc16_to_fc32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/sse2_sc8_to_fc64.cpp # AVX2 conversion is not efficient as SSE2 for this case + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_sc8_to_fc32.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc64_to_sc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc32_to_sc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc64_to_sc8.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/avx2_fc32_to_sc8.cpp + ) + set_source_files_properties( + ${convert_with_avx2_sources} + PROPERTIES COMPILE_FLAGS "${AVX2_FLAGS} ${SSE2_FLAGS}" + ) + 
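A note on the flag probes above: check_cxx_compiler_flag() only establishes that the compiler accepts the flag; it says nothing about whether the CPU the binary eventually runs on can execute AVX2 or AVX-512 code, so host-CPU support still has to be established at run time before these converters are preferred. Also, GCC and Clang expose the AVX-512 family as per-subset flags (-mavx512f, -mavx512bw, ...) rather than a bare -mavx512, so a probe along the following lines is more likely to succeed (a sketch, not part of this patch; AVX512F_SUPPORTED is a hypothetical variable name):

    check_cxx_compiler_flag("-mavx512f" AVX512F_SUPPORTED)
    if(AVX512F_SUPPORTED)
        message(STATUS "AVX512F is supported")
        set(AVX512_FLAGS -mavx512f)
    endif(AVX512F_SUPPORTED)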
LIBUHD_APPEND_SOURCES(${convert_with_avx2_sources}) +endif(AVX2_SUPPORTED) ######################################################################## # Check for NEON SIMD headers diff --git a/host/lib/convert/avx2_fc32_to_sc16.cpp b/host/lib/convert/avx2_fc32_to_sc16.cpp new file mode 100644 index 0000000000..bf18c41f13 --- /dev/null +++ b/host/lib/convert/avx2_fc32_to_sc16.cpp @@ -0,0 +1,193 @@ +// +// Copyright 2024 Ettus Research, a National Instruments Brand +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +DECLARE_CONVERTER(fc32, 1, sc16_item32_le, 1, PRIORITY_SIMD) +{ + const fc32_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + +// this macro converts values faster by using SSE intrinsics to convert 4 values at a time +#define convert_fc32_1_to_item32_1_nswap_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256 tmplo = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 0)); \ + __m256 tmphi = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 4)); \ + \ + /* convert and scale */ \ + __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar)); \ + __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar)); \ + \ + __m256i shuffled_lo = _mm256_permute2x128_si256( \ + tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */ \ + __m256i shuffled_hi = _mm256_permute2x128_si256( \ + tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */ \ + \ + /* now pack the shuffled data sequentially */ \ + __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi); \ + \ + /* pack + swap 16-bit pairs */ \ + tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + \ + /* store to output */ \ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); \ + } + + size_t i = 0; + + // need to dispatch according to alignment for fastest conversion + switch (size_t(input) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + convert_fc32_1_to_item32_1_nswap_guts(_) break; + case 0x8: + // the first sample is 8-byte aligned - process it to align the remainder of + // the samples to 16-bytes + xx_to_item32_sc16(input, output, 1, scale_factor); + i++; + // do faster processing of the bulk of the samples now that we are 16-byte + // aligned + convert_fc32_1_to_item32_1_nswap_guts(_) break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load + convert_fc32_1_to_item32_1_nswap_guts(u_) + } + + // convert any remaining samples + xx_to_item32_sc16(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc32, 1, sc16_item32_be, 1, PRIORITY_SIMD) +{ + const fc32_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + +// this macro converts values faster by using AVX2 intrinsics to convert 8 values at a +// time +#define convert_fc32_1_to_item32_1_bswap_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256 tmplo = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 0)); \ + __m256 tmphi = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 4)); \ + \ + /* convert and scale */ \ + __m256i tmpilo = 
_mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar)); \ + __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar)); \ + \ + __m256i shuffled_lo = _mm256_permute2x128_si256( \ + tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */ \ + __m256i shuffled_hi = _mm256_permute2x128_si256( \ + tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */ \ + \ + /* Now pack the shuffled data sequentially */ \ + __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi); \ + \ + tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); \ + \ + /* store to output */ \ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); \ + } + + size_t i = 0; + + // need to dispatch according to alignment for fastest conversion + switch (size_t(input) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + convert_fc32_1_to_item32_1_bswap_guts(_) break; + case 0x8: + // the first value is 8-byte aligned - process it and prepare the bulk of the + // data for fast conversion + xx_to_item32_sc16(input, output, 1, scale_factor); + i++; + // do faster processing of the remaining samples now that we are 16-byte + // aligned + convert_fc32_1_to_item32_1_bswap_guts(_) break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load + convert_fc32_1_to_item32_1_bswap_guts(u_) + } + + // convert any remaining samples + xx_to_item32_sc16(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc32, 1, sc16_chdr, 1, PRIORITY_SIMD) +{ + const fc32_t* input = reinterpret_cast(inputs[0]); + sc16_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + +// this macro converts values faster by using SSE intrinsics to convert 4 values at a time +#define convert_fc32_1_to_item32_1_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256 tmplo = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 0)); \ + __m256 tmphi = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 4)); \ + \ + /* convert and scale */ \ + __m256i tmpilo = _mm256_cvtps_epi32(_mm256_mul_ps(tmplo, scalar)); \ + __m256i tmpihi = _mm256_cvtps_epi32(_mm256_mul_ps(tmphi, scalar)); \ + \ + /* mm256_packs_epi32 is not sequential, it needs to be split into m128i */ \ + __m256i shuffled_lo = _mm256_permute2x128_si256( \ + tmpilo, tmpihi, 0x20); /* lower 128-bit of tmpilo and tmpihi */ \ + __m256i shuffled_hi = _mm256_permute2x128_si256( \ + tmpilo, tmpihi, 0x31); /* upper 128-bit of tmpilo and tmpihi */ \ + \ + /* Now pack the shuffled data sequentially */ \ + __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi); \ + \ + /* store to output */ \ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); \ + } + + size_t i = 0; + + // need to dispatch according to alignment for fastest conversion + switch (size_t(input) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + convert_fc32_1_to_item32_1_guts(_) break; + case 0x8: + // the first sample is 8-byte aligned - process it to align the remainder of + // the samples to 16-bytes + xx_to_chdr_sc16(input, output, 1, scale_factor); + i++; + // do faster processing of the bulk of the samples now that we are 16-byte + // aligned + convert_fc32_1_to_item32_1_guts(_) break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load + 
convert_fc32_1_to_item32_1_guts(u_) + } + + // convert any remaining samples + xx_to_chdr_sc16(input + i, output + i, nsamps - i, scale_factor); +} diff --git a/host/lib/convert/avx2_fc32_to_sc8.cpp b/host/lib/convert/avx2_fc32_to_sc8.cpp new file mode 100644 index 0000000000..60a0407e37 --- /dev/null +++ b/host/lib/convert/avx2_fc32_to_sc8.cpp @@ -0,0 +1,123 @@ +// +// Copyright 2024 Ettus Research, a National Instruments Company +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +template +UHD_INLINE __m256i pack_sc32_4x(const __m256& in0, + const __m256& in1, + const __m256& in2, + const __m256& in3, + const __m256& scalar) +{ + __m256i tmpi0 = _mm256_cvtps_epi32(_mm256_mul_ps(in0, scalar)); + tmpi0 = _mm256_shuffle_epi32(tmpi0, shuf); + __m256i tmpi1 = _mm256_cvtps_epi32(_mm256_mul_ps(in1, scalar)); + tmpi1 = _mm256_shuffle_epi32(tmpi1, shuf); + + __m256i shuf_lo_lo = _mm256_permute2x128_si256(tmpi0, tmpi1, 0x20); + __m256i shuf_lo_hi = _mm256_permute2x128_si256(tmpi0, tmpi1, 0x31); + const __m256i lo = _mm256_packs_epi32(shuf_lo_lo, shuf_lo_hi); + + __m256i tmpi2 = _mm256_cvtps_epi32(_mm256_mul_ps(in2, scalar)); + tmpi2 = _mm256_shuffle_epi32(tmpi2, shuf); + __m256i tmpi3 = _mm256_cvtps_epi32(_mm256_mul_ps(in3, scalar)); + tmpi3 = _mm256_shuffle_epi32(tmpi3, shuf); + + __m256i shuf_hi_lo = _mm256_permute2x128_si256(tmpi2, tmpi3, 0x20); + __m256i shuf_hi_hi = _mm256_permute2x128_si256(tmpi2, tmpi3, 0x31); + const __m256i hi = _mm256_packs_epi32(shuf_hi_lo, shuf_hi_hi); + + __m256i shuf_lo = _mm256_permute2x128_si256(lo, hi, 0x20); + __m256i shuf_hi = _mm256_permute2x128_si256(lo, hi, 0x31); + + return _mm256_packs_epi16(shuf_lo, shuf_hi); +} + +DECLARE_CONVERTER(fc32, 1, sc8_item32_be, 1, PRIORITY_SIMD) +{ + const fc32_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + const int shuf = _MM_SHUFFLE(3, 2, 1, 0); + +#define convert_fc32_1_to_sc8_item32_1_bswap_guts(_al_) \ + for (size_t j = 0; i + 15 < nsamps; i += 16, j += 8) { \ + /* load from input */ \ + __m256 tmp0 = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 0)); \ + __m256 tmp1 = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 4)); \ + __m256 tmp2 = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 8)); \ + __m256 tmp3 = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 12)); \ + \ + /* convert */ \ + const __m256i tmpi = pack_sc32_4x(tmp0, tmp1, tmp2, tmp3, scalar); \ + \ + /* store to output */ \ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + j), tmpi); \ + } + + + size_t i = 0; + + // dispatch according to alignment + if ((size_t(input) & 0xf) == 0) { + convert_fc32_1_to_sc8_item32_1_bswap_guts(_) + } else { + convert_fc32_1_to_sc8_item32_1_bswap_guts(u_) + } + + // convert remainder + xx_to_item32_sc8(input + i, output + (i / 2), nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc32, 1, sc8_item32_le, 1, PRIORITY_SIMD) +{ + const fc32_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor)); + const int shuf = _MM_SHUFFLE(0, 1, 2, 3); + +#define convert_fc32_1_to_sc8_item32_1_nswap_guts(_al_) \ + for (size_t j = 0; i + 15 < nsamps; i += 16, j += 8) { \ + /* load from input */ \ + __m256 tmp0 = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 0)); \ + __m256 tmp1 = \ + 
_mm256_load##_al_##ps(reinterpret_cast(input + i + 4)); \ + __m256 tmp2 = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 8)); \ + __m256 tmp3 = \ + _mm256_load##_al_##ps(reinterpret_cast(input + i + 12)); \ + \ + /* convert */ \ + const __m256i tmpi = pack_sc32_4x(tmp0, tmp1, tmp2, tmp3, scalar); \ + \ + /* store to output */ \ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + j), tmpi); \ + } + + size_t i = 0; + + // dispatch according to alignment + if ((size_t(input) & 0xf) == 0) { + convert_fc32_1_to_sc8_item32_1_nswap_guts(_) + } else { + convert_fc32_1_to_sc8_item32_1_nswap_guts(u_) + } + + // convert remainder + xx_to_item32_sc8(input + i, output + (i / 2), nsamps - i, scale_factor); +} diff --git a/host/lib/convert/avx2_fc64_to_sc16.cpp b/host/lib/convert/avx2_fc64_to_sc16.cpp new file mode 100644 index 0000000000..fde67ef01f --- /dev/null +++ b/host/lib/convert/avx2_fc64_to_sc16.cpp @@ -0,0 +1,179 @@ +// +// Copyright 2024 Ettus Research, a National Instruments Company +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +DECLARE_CONVERTER(fc64, 1, sc16_item32_le, 1, PRIORITY_SIMD) +{ + const fc64_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + +#define convert_fc64_1_to_item32_1_nswap_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256d tmp0 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 0)); \ + __m256d tmp1 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 2)); \ + __m256d tmp2 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 4)); \ + __m256d tmp3 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 6)); \ + \ + /* convert and scale */ \ + __m128i tmpi0 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp0, scalar)); \ + __m128i tmpi1 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp1, scalar)); \ + __m128i tmpi2 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp2, scalar)); \ + __m128i tmpi3 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp3, scalar)); \ + \ + /* Unpack and interleave the results */ \ + __m256i tmpilo = _mm256_set_m128i(tmpi1, tmpi0); \ + __m256i tmpihi = _mm256_set_m128i(tmpi3, tmpi2); \ + \ + /* BEFORE PACKS */ \ + /* Pack and swap 16-bit pairs */ \ + __m256i shuffled_lo = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x20); \ + __m256i shuffled_hi = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x31); \ + \ + /* pack + swap 16-bit pairs */ \ + __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi); \ + \ + /* pack + swap 16-bit pairs */ \ + tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + \ + /* store to output */ \ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); \ + } + + size_t i = 0; + + // dispatch according to alignment + if ((size_t(input) & 0xf) == 0) { + convert_fc64_1_to_item32_1_nswap_guts(_) + } else { + convert_fc64_1_to_item32_1_nswap_guts(u_) + } + + // convert remainder + xx_to_item32_sc16(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc64, 1, sc16_item32_be, 1, PRIORITY_SIMD) +{ + const fc64_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + +#define convert_fc64_1_to_item32_1_bswap_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256d tmp0 = \ + 
_mm256_load##_al_##pd(reinterpret_cast(input + i + 0)); \ + __m256d tmp1 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 2)); \ + __m256d tmp2 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 4)); \ + __m256d tmp3 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 6)); \ + \ + /* convert and scale */ \ + __m128i tmpi0 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp0, scalar)); \ + __m128i tmpi1 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp1, scalar)); \ + __m128i tmpi2 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp2, scalar)); \ + __m128i tmpi3 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp3, scalar)); \ + \ + \ + /* Unpack and interleave the results */ \ + __m256i tmpilo = _mm256_set_m128i(tmpi1, tmpi0); \ + __m256i tmpihi = _mm256_set_m128i(tmpi3, tmpi2); \ + \ + /* BEFORE PACKS */ \ + /* Pack and swap 16-bit pairs */ \ + __m256i shuffled_lo = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x20); \ + __m256i shuffled_hi = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x31); \ + \ + /* pack + swap 16-bit pairs */ \ + __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi); \ + tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); \ + \ + /* store to output */ \ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); \ + } + + size_t i = 0; + + // dispatch according to alignment + if ((size_t(input) & 0xf) == 0) { + convert_fc64_1_to_item32_1_bswap_guts(_) + } else { + convert_fc64_1_to_item32_1_bswap_guts(u_) + } + + // convert remainder + xx_to_item32_sc16(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc64, 1, sc16_chdr, 1, PRIORITY_SIMD) +{ + const fc64_t* input = reinterpret_cast(inputs[0]); + sc16_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + +#define convert_fc64_1_to_chdr_1_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256d tmp0 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 0)); \ + __m256d tmp1 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 2)); \ + __m256d tmp2 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 4)); \ + __m256d tmp3 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 6)); \ + \ + /* convert and scale */ \ + __m128i tmpi0 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp0, scalar)); \ + __m128i tmpi1 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp1, scalar)); \ + __m128i tmpi2 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp2, scalar)); \ + __m128i tmpi3 = _mm256_cvttpd_epi32(_mm256_mul_pd(tmp3, scalar)); \ + \ + /* Unpack and interleave the results */ \ + __m256i tmpilo = _mm256_set_m128i(tmpi1, tmpi0); \ + __m256i tmpihi = _mm256_set_m128i(tmpi3, tmpi2); \ + \ + /* BEFORE PACKS */ \ + /* Pack and swap 16-bit pairs */ \ + __m256i shuffled_lo = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x20); \ + __m256i shuffled_hi = _mm256_permute2x128_si256(tmpilo, tmpihi, 0x31); \ + \ + /* pack + swap 16-bit pairs */ \ + __m256i tmpi = _mm256_packs_epi32(shuffled_lo, shuffled_hi); \ + \ + /* store to output */ \ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), tmpi); \ + } + + size_t i = 0; + + // dispatch according to alignment + if ((size_t(input) & 0xf) == 0) { + convert_fc64_1_to_chdr_1_guts(_) + } else { + convert_fc64_1_to_chdr_1_guts(u_) + } + + // convert remainder + xx_to_chdr_sc16(input + i, output + i, nsamps - i, scale_factor); +} diff --git a/host/lib/convert/avx2_fc64_to_sc8.cpp b/host/lib/convert/avx2_fc64_to_sc8.cpp new file mode 100644 index 0000000000..2f31d0cf8c --- /dev/null +++ 
b/host/lib/convert/avx2_fc64_to_sc8.cpp @@ -0,0 +1,135 @@ +// +// Copyright 2024 Ettus Research, a National Instruments Company +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +UHD_INLINE __m256i pack_sc8_item32_4x( + const __m256i& in0, const __m256i& in1, const __m256i& in2, const __m256i& in3) +{ + const __m256i shuffled_in0_lo = _mm256_permute2x128_si256(in0, in1, 0x20); + const __m256i shuffled_in0_hi = _mm256_permute2x128_si256(in0, in1, 0x31); + const __m256i shuffled_in1_lo = _mm256_permute2x128_si256(in2, in3, 0x20); + const __m256i shuffled_in1_hi = _mm256_permute2x128_si256(in2, in3, 0x31); + + const __m256i lo = _mm256_packs_epi32(shuffled_in0_lo, shuffled_in0_hi); + const __m256i hi = _mm256_packs_epi32(shuffled_in1_lo, shuffled_in1_hi); + return _mm256_packs_epi16(lo, hi); +} + +UHD_INLINE __m256i pack_sc32_4x( + const __m256d& lo, const __m256d& hi, const __m256d& scalar) +{ + const __m128i tmpi_lo = _mm256_cvttpd_epi32(_mm256_mul_pd(hi, scalar)); + const __m128i tmpi_hi = _mm256_cvttpd_epi32(_mm256_mul_pd(lo, scalar)); + + return _mm256_set_m128i(tmpi_hi, tmpi_lo); +} + +DECLARE_CONVERTER(fc64, 1, sc8_item32_be, 1, PRIORITY_SIMD) +{ + const fc64_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + +#define convert_fc64_1_to_sc8_item32_1_bswap_guts(_al_) \ + for (size_t j = 0; i + 15 < nsamps; i += 16, j += 8) { \ + /* load from input */ \ + __m256d tmp0 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 0)); \ + __m256d tmp1 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 2)); \ + __m256d tmp2 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 4)); \ + __m256d tmp3 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 6)); \ + __m256d tmp4 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 8)); \ + __m256d tmp5 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 10)); \ + __m256d tmp6 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 12)); \ + __m256d tmp7 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 14)); \ + \ + /* interleave */ \ + const __m256i tmpi = pack_sc8_item32_4x(pack_sc32_4x(tmp1, tmp0, scalar), \ + pack_sc32_4x(tmp3, tmp2, scalar), \ + pack_sc32_4x(tmp5, tmp4, scalar), \ + pack_sc32_4x(tmp7, tmp6, scalar)); \ + \ + /* store to output */ \ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + j), tmpi); \ + } + + size_t i = 0; + + // dispatch according to alignment + if ((size_t(input) & 0xf) == 0) { + convert_fc64_1_to_sc8_item32_1_bswap_guts(_) + } else { + convert_fc64_1_to_sc8_item32_1_bswap_guts(u_) + } + + // convert remainder + xx_to_item32_sc8(input + i, output + (i / 2), nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(fc64, 1, sc8_item32_le, 1, PRIORITY_SIMD) +{ + const fc64_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor); + +#define convert_fc64_1_to_sc8_item32_1_nswap_guts(_al_) \ + for (size_t j = 0; i + 15 < nsamps; i += 16, j += 8) { \ + /* load from input */ \ + __m256d tmp0 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 0)); \ + __m256d tmp1 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 2)); \ + __m256d tmp2 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 4)); \ + __m256d tmp3 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 6)); \ + __m256d tmp4 = \ + 
_mm256_load##_al_##pd(reinterpret_cast(input + i + 8)); \ + __m256d tmp5 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 10)); \ + __m256d tmp6 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 12)); \ + __m256d tmp7 = \ + _mm256_load##_al_##pd(reinterpret_cast(input + i + 14)); \ + \ + /* interleave */ \ + __m256i tmpi = pack_sc8_item32_4x(pack_sc32_4x(tmp0, tmp1, scalar), \ + pack_sc32_4x(tmp2, tmp3, scalar), \ + pack_sc32_4x(tmp4, tmp5, scalar), \ + pack_sc32_4x(tmp6, tmp7, scalar)); \ + tmpi = _mm256_or_si256( \ + _mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); /*byteswap*/ \ + \ + /* store to output */ \ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + j), tmpi); \ + } + + size_t i = 0; + + // dispatch according to alignment + if ((size_t(input) & 0xf) == 0) { + convert_fc64_1_to_sc8_item32_1_nswap_guts(_) + } else { + convert_fc64_1_to_sc8_item32_1_nswap_guts(u_) + } + + // convert remainder + xx_to_item32_sc8(input + i, output + (i / 2), nsamps - i, scale_factor); +} diff --git a/host/lib/convert/avx2_sc16_to_fc32.cpp b/host/lib/convert/avx2_sc16_to_fc32.cpp new file mode 100644 index 0000000000..509d7583c3 --- /dev/null +++ b/host/lib/convert/avx2_sc16_to_fc32.cpp @@ -0,0 +1,194 @@ +// +// Copyright 2024 Ettus Research, a National Instruments Company +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +DECLARE_CONVERTER(sc16_item32_le, 1, fc32, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + fc32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor) / (1 << 16)); + const __m256i zeroi = _mm256_setzero_si256(); + +// this macro converts values faster by using SSE intrinsics to convert 4 values at a time +#define convert_item32_1_to_fc32_1_nswap_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256i tmpi = \ + _mm256_loadu_si256(reinterpret_cast(input + i)); \ + \ + /* unpack + swap 16-bit pairs */ \ + tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + \ + __m256i tmpilo = \ + _mm256_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \ + __m256i tmpihi = _mm256_unpackhi_epi16(zeroi, tmpi); \ + \ + __m256i shuffled_lo = \ + _mm256_permute2x128_si256(tmpilo, tmpihi, 0x20); /* lower 128-bit */ \ + __m256i shuffled_hi = \ + _mm256_permute2x128_si256(tmpilo, tmpihi, 0x31); /* upper 128-bit */ \ + \ + /* convert and scale */ \ + __m256 tmplo = _mm256_mul_ps(_mm256_cvtepi32_ps(shuffled_lo), scalar); \ + __m256 tmphi = _mm256_mul_ps(_mm256_cvtepi32_ps(shuffled_hi), scalar); \ + \ + /* store to output */ \ + _mm256_storeu_ps(reinterpret_cast(output + i + 0), tmplo); \ + _mm256_storeu_ps(reinterpret_cast(output + i + 4), tmphi); \ + } + + size_t i = 0; + + // need to dispatch according to alignment for fastest conversion + switch (size_t(output) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + convert_item32_1_to_fc32_1_nswap_guts(_) break; + case 0x8: + // the first sample is 8-byte aligned - process it to align the remainder of + // the samples to 16-bytes + item32_sc16_to_xx(input, output, 1, scale_factor); + i++; + // do faster processing of the bulk of the samples now that we are 16-byte + // aligned + convert_item32_1_to_fc32_1_nswap_guts(_) break; + default: + // we are not 8 or 16-byte aligned, so do fast 
processing with the unaligned + // load and store + convert_item32_1_to_fc32_1_nswap_guts(u_) + } + + // convert any remaining samples + item32_sc16_to_xx(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(sc16_item32_be, 1, fc32, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + fc32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor) / (1 << 16)); + const __m256i zeroi = _mm256_setzero_si256(); + +// this macro converts values faster by using SSE intrinsics to convert 4 values at a time +#define convert_item32_1_to_fc32_1_bswap_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); \ + \ + /* byteswap + unpack -> byteswap 16 bit words */ \ + tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); \ + \ + __m256i tmpilo = _mm256_unpacklo_epi16(zeroi, tmpi); \ + __m256i tmpihi = _mm256_unpackhi_epi16(zeroi, tmpi); \ + \ + __m256i shuffled_lo = \ + _mm256_permute2x128_si256(tmpilo, tmpihi, 0x20); /* lower 128-bit */ \ + __m256i shuffled_hi = \ + _mm256_permute2x128_si256(tmpilo, tmpihi, 0x31); /* upper 128-bit */ \ + \ + /* convert and scale */ \ + __m256 tmplo = _mm256_mul_ps(_mm256_cvtepi32_ps(shuffled_lo), scalar); \ + __m256 tmphi = _mm256_mul_ps(_mm256_cvtepi32_ps(shuffled_hi), scalar); \ + \ + /* store to output */ \ + _mm256_storeu_ps(reinterpret_cast(output + i + 0), tmplo); \ + _mm256_storeu_ps(reinterpret_cast(output + i + 4), tmphi); \ + } + + size_t i = 0; + + // need to dispatch according to alignment for fastest conversion + switch (size_t(output) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + convert_item32_1_to_fc32_1_bswap_guts(_) break; + case 0x8: + // the first sample is 8-byte aligned - process it to align the remainder of + // the samples to 16-bytes + item32_sc16_to_xx(input, output, 1, scale_factor); + i++; + // do faster processing of the bulk of the samples now that we are 16-byte + // aligned + convert_item32_1_to_fc32_1_bswap_guts(_) break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load and store + convert_item32_1_to_fc32_1_bswap_guts(u_) + } + + // convert any remaining samples + item32_sc16_to_xx(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(sc16_chdr, 1, fc32, 1, PRIORITY_SIMD) +{ + const sc16_t* input = reinterpret_cast(inputs[0]); + fc32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor) / (1 << 16)); + const __m256i zeroi = _mm256_setzero_si256(); + +// this macro converts values faster by using SSE intrinsics to convert 4 values at a time +#define convert_item32_1_to_fc32_1_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256i tmpi = \ + _mm256_loadu_si256(reinterpret_cast(input + i)); \ + \ + /* unpack + swap 16-bit pairs */ \ + __m256i tmpilo = \ + _mm256_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \ + __m256i tmpihi = _mm256_unpackhi_epi16(zeroi, tmpi); \ + \ + __m256i shuffled_lo = \ + _mm256_permute2x128_si256(tmpilo, tmpihi, 0x20); /* lower 128-bit */ \ + __m256i shuffled_hi = \ + _mm256_permute2x128_si256(tmpilo, tmpihi, 0x31); /* upper 128-bit */ \ + \ + /* convert and scale */ \ + __m256 tmplo = _mm256_mul_ps(_mm256_cvtepi32_ps(shuffled_lo), scalar); \ + __m256 tmphi = 
_mm256_mul_ps(_mm256_cvtepi32_ps(shuffled_hi), scalar); \ + \ + /* store to output */ \ + _mm256_storeu_ps(reinterpret_cast(output + i + 0), tmplo); \ + _mm256_storeu_ps(reinterpret_cast(output + i + 4), tmphi); \ + } + + size_t i = 0; + + // need to dispatch according to alignment for fastest conversion + switch (size_t(output) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + convert_item32_1_to_fc32_1_guts(_) break; + case 0x8: + // the first sample is 8-byte aligned - process it to align the remainder of + // the samples to 16-bytes + chdr_sc16_to_xx(input, output, 1, scale_factor); + i++; + // do faster processing of the bulk of the samples now that we are 16-byte + // aligned + convert_item32_1_to_fc32_1_guts(_) break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load and store + convert_item32_1_to_fc32_1_guts(u_) + } + + // convert any remaining samples + chdr_sc16_to_xx(input + i, output + i, nsamps - i, scale_factor); +} diff --git a/host/lib/convert/avx2_sc16_to_fc64.cpp b/host/lib/convert/avx2_sc16_to_fc64.cpp new file mode 100644 index 0000000000..c3e9e3371a --- /dev/null +++ b/host/lib/convert/avx2_sc16_to_fc64.cpp @@ -0,0 +1,165 @@ +// +// Copyright 2011-2012 Ettus Research LLC +// Copyright 2018 Ettus Research, a National Instruments Company +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +DECLARE_CONVERTER(sc16_item32_le, 1, fc64, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + fc64_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor / (1 << 16)); + const __m256i zeroi = _mm256_setzero_si256(); + +#define convert_item32_1_to_fc64_1_nswap_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); \ + \ + /* unpack + swap 16-bit pairs */ \ + tmpi = _mm256_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + tmpi = _mm256_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + __m256i tmpilo = _mm256_unpacklo_epi16(zeroi, tmpi); \ + __m256i tmpihi = _mm256_unpackhi_epi16(zeroi, tmpi); \ + \ + __m128i tmpilo_lo = _mm256_castsi256_si128(tmpilo); \ + __m128i tmpilo_hi = _mm256_extracti128_si256(tmpilo, 1); \ + __m128i tmpihi_lo = _mm256_castsi256_si128(tmpihi); \ + __m128i tmpihi_hi = _mm256_extracti128_si256(tmpihi, 1); \ + \ + /* convert and scale */ \ + __m256d tmp0 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpilo_lo), scalar); \ + tmpilo = _mm256_unpackhi_epi64(tmpilo, zeroi); \ + __m256d tmp1 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpihi_lo), scalar); \ + __m256d tmp2 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpilo_hi), scalar); \ + tmpihi = _mm256_unpackhi_epi64(tmpihi, zeroi); \ + __m256d tmp3 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpihi_hi), scalar); \ + \ + /* store to output */ \ + _mm256_storeu_pd(reinterpret_cast(output + i + 0), tmp0); \ + _mm256_storeu_pd(reinterpret_cast(output + i + 2), tmp1); \ + _mm256_storeu_pd(reinterpret_cast(output + i + 4), tmp2); \ + _mm256_storeu_pd(reinterpret_cast(output + i + 6), tmp3); \ + } + + size_t i = 0; + + // dispatch according to alignment + if ((size_t(output) & 0xf) == 0) { + convert_item32_1_to_fc64_1_nswap_guts(_) + } else { + convert_item32_1_to_fc64_1_nswap_guts(u_) + } + + // convert remainder + item32_sc16_to_xx(input + i, output + i, nsamps - i, scale_factor); +} + 
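An aside on the idiom used in these sc16-to-float converters: _mm256_unpacklo_epi16(zeroi, tmpi) and _mm256_unpackhi_epi16(zeroi, tmpi) place each 16-bit sample in the upper half of a 32-bit slot, i.e. they yield sample * 2^16 with the sign preserved, which is why the scalar is built as scale_factor / (1 << 16). A standalone sketch that shows the effect (not part of this patch; compile with -mavx2):

    #include <immintrin.h>
    #include <cstdio>

    int main()
    {
        const __m256i zero = _mm256_setzero_si256();
        // 16 signed 16-bit samples, including negative values.
        const __m256i samp = _mm256_setr_epi16(
            1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12, 13, -14, 15, -16);

        // Interleave zeros below each sample: every 32-bit slot now holds sample * 2^16.
        alignas(32) int lo[8];
        _mm256_store_si256(
            reinterpret_cast<__m256i*>(lo), _mm256_unpacklo_epi16(zero, samp));

        // Dividing by 2^16 recovers the samples; the converters fold this factor
        // into their scalar instead (scale_factor / (1 << 16)).
        for (int k = 0; k < 8; k++)
            std::printf("%d ", lo[k] / 65536);
        std::printf("\n"); // prints: 1 -2 3 -4 9 -10 11 -12 (low four samples of each 128-bit lane)
        return 0;
    }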
+DECLARE_CONVERTER(sc16_item32_be, 1, fc64, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + fc64_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor / (1 << 16)); + const __m256i zeroi = _mm256_setzero_si256(); + +#define convert_item32_1_to_fc64_1_bswap_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); \ + \ + /* byteswap + unpack -> byteswap 16 bit words */ \ + tmpi = _mm256_or_si256(_mm256_srli_epi16(tmpi, 8), _mm256_slli_epi16(tmpi, 8)); \ + __m256i tmpilo = _mm256_unpacklo_epi16(zeroi, tmpi); \ + __m256i tmpihi = _mm256_unpackhi_epi16(zeroi, tmpi); \ + \ + __m128i tmpilo_lo = _mm256_castsi256_si128(tmpilo); \ + __m128i tmpilo_hi = _mm256_extracti128_si256(tmpilo, 1); \ + __m128i tmpihi_lo = _mm256_castsi256_si128(tmpihi); \ + __m128i tmpihi_hi = _mm256_extracti128_si256(tmpihi, 1); \ + \ + /* convert and scale */ \ + __m256d tmp0 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpilo_lo), scalar); \ + tmpilo = _mm256_unpackhi_epi64(tmpilo, zeroi); \ + __m256d tmp1 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpihi_lo), scalar); \ + __m256d tmp2 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpilo_hi), scalar); \ + tmpihi = _mm256_unpackhi_epi64(tmpihi, zeroi); \ + __m256d tmp3 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpihi_hi), scalar); \ + \ + /* store to output */ \ + _mm256_storeu_pd(reinterpret_cast(output + i + 0), tmp0); \ + _mm256_storeu_pd(reinterpret_cast(output + i + 2), tmp1); \ + _mm256_storeu_pd(reinterpret_cast(output + i + 4), tmp2); \ + _mm256_storeu_pd(reinterpret_cast(output + i + 6), tmp3); \ + } + + size_t i = 0; + + // dispatch according to alignment + if ((size_t(output) & 0xf) == 0) { + convert_item32_1_to_fc64_1_bswap_guts(_) + } else { + convert_item32_1_to_fc64_1_bswap_guts(u_) + } + + // convert remainder + item32_sc16_to_xx(input + i, output + i, nsamps - i, scale_factor); +} + +DECLARE_CONVERTER(sc16_chdr, 1, fc64, 1, PRIORITY_SIMD) +{ + const sc16_t* input = reinterpret_cast(inputs[0]); + fc64_t* output = reinterpret_cast(outputs[0]); + + const __m256d scalar = _mm256_set1_pd(scale_factor / (1 << 16)); + const __m256i zeroi = _mm256_setzero_si256(); + +#define convert_chdr_1_to_fc64_1_guts(_al_) \ + for (; i + 7 < nsamps; i += 8) { \ + /* load from input */ \ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); \ + \ + /* unpack 16-bit pairs */ \ + __m256i tmpilo = _mm256_unpacklo_epi16(zeroi, tmpi); \ + __m256i tmpihi = _mm256_unpackhi_epi16(zeroi, tmpi); \ + \ + __m128i tmpilo_lo = _mm256_castsi256_si128(tmpilo); \ + __m128i tmpilo_hi = _mm256_extracti128_si256(tmpilo, 1); \ + __m128i tmpihi_lo = _mm256_castsi256_si128(tmpihi); \ + __m128i tmpihi_hi = _mm256_extracti128_si256(tmpihi, 1); \ + \ + /* convert and scale */ \ + __m256d tmp0 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpilo_lo), scalar); \ + tmpilo = _mm256_unpackhi_epi64(tmpilo, zeroi); \ + __m256d tmp1 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpihi_lo), scalar); \ + __m256d tmp2 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpilo_hi), scalar); \ + tmpihi = _mm256_unpackhi_epi64(tmpihi, zeroi); \ + __m256d tmp3 = _mm256_mul_pd(_mm256_cvtepi32_pd(tmpihi_hi), scalar); \ + \ + /* store to output */ \ + _mm256_storeu_pd(reinterpret_cast(output + i + 0), tmp0); \ + _mm256_storeu_pd(reinterpret_cast(output + i + 2), tmp1); \ + _mm256_storeu_pd(reinterpret_cast(output + i + 4), tmp2); \ + _mm256_storeu_pd(reinterpret_cast(output + i + 6), tmp3); \ + } + + size_t i = 0; + + // 
dispatch according to alignment + if ((size_t(output) & 0xf) == 0) { + convert_chdr_1_to_fc64_1_guts(_) + } else { + convert_chdr_1_to_fc64_1_guts(u_) + } + + // convert remainder + chdr_sc16_to_xx(input + i, output + i, nsamps - i, scale_factor); +} diff --git a/host/lib/convert/avx2_sc16_to_sc16.cpp b/host/lib/convert/avx2_sc16_to_sc16.cpp new file mode 100644 index 0000000000..62fcb1455e --- /dev/null +++ b/host/lib/convert/avx2_sc16_to_sc16.cpp @@ -0,0 +1,210 @@ +// +// Copyright 2024 Ettus Research, a National Instruments Company +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +// +// SSE 16-bit pair swap +// +// Valid alignment macro arguments are 'u_' and '_' for unaligned and aligned +// access respectively. Macro operates on 4 complex 16-bit integers at a time. +// +// ----------------- +// | A | B | C | D | Input +// ----------------- +// 0 1 2 3 Address +// ----------------- +// | C | D | A | B | Output +// ----------------- +// +#define CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_ialign_, _oalign_) \ + for (; i + 7 < nsamps; i += 8) { \ + __m256i m0; \ + \ + /* load from input */ \ + m0 = _mm256_loadu_si256((const __m256i*)(input + i)); \ + \ + /* swap 16-bit pairs */ \ + m0 = _mm256_shufflelo_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1)); \ + m0 = _mm256_shufflehi_epi16(m0, _MM_SHUFFLE(2, 3, 0, 1)); \ + \ + /* store to output */ \ + _mm256_storeu_si256((__m256i*)(output + i), m0); \ + } + +// +// SSE byte swap +// +// Valid alignment macro arguments are 'u_' and '_' for unaligned and aligned +// access respectively. Macro operates on 4 complex 16-bit integers at a time. +// +// ----------------- +// | A | B | C | D | Input +// ----------------- +// 0 1 2 3 Address +// ----------------- +// | B | A | D | C | Output +// ----------------- +// +#define CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_ialign_, _oalign_) \ + for (; i + 7 < nsamps; i += 8) { \ + __m256i m0, m1, m2; \ + \ + /* load from input */ \ + m0 = _mm256_loadu_si256((const __m256i*)(input + i)); \ + \ + /* byteswap 16 bit words */ \ + m1 = _mm256_srli_epi16(m0, 8); \ + m2 = _mm256_slli_epi16(m0, 8); \ + m0 = _mm256_or_si256(m1, m2); \ + \ + /* store to output */ \ + _mm256_storeu_si256((__m256i*)(output + i), m0); \ + } + +DECLARE_CONVERTER(sc16, 1, sc16_item32_le, 1, PRIORITY_SIMD) +{ + const sc16_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + size_t i = 0; + + // need to dispatch according to alignment for fastest conversion + switch (size_t(input) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_, u_) + break; + case 0x8: + if (nsamps < 2) + break; + // the first sample is 8-byte aligned - process it to align the remainder of + // the samples to 16-bytes + xx_to_item32_sc16(input, output, 2, 1.0); + i += 2; + CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(_, u_) + // do faster processing of the bulk of the samples now that we are 16-byte + // aligned + break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load + CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_, u_) + } + + // convert any remaining samples + xx_to_item32_sc16(input + i, output + i, nsamps - i, 1.0); +} + +DECLARE_CONVERTER(sc16, 1, sc16_item32_be, 1, PRIORITY_SIMD) +{ + const sc16_t* input = reinterpret_cast(inputs[0]); + item32_t* output = reinterpret_cast(outputs[0]); + + size_t i = 0; + + // need to 
dispatch according to alignment for fastest conversion + switch (size_t(input) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_, u_) + break; + case 0x8: + if (nsamps < 2) + break; + // the first value is 8-byte aligned - process it and prepare the bulk of the + // data for fast conversion + xx_to_item32_sc16(input, output, 2, 1.0); + i += 2; + // do faster processing of the remaining samples now that we are 16-byte + // aligned + CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(_, u_) + break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load + CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_, u_) + } + + // convert any remaining samples + xx_to_item32_sc16(input + i, output + i, nsamps - i, 1.0); +} + +DECLARE_CONVERTER(sc16_item32_le, 1, sc16, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + sc16_t* output = reinterpret_cast(outputs[0]); + + size_t i = 0; + + // need to dispatch according to alignment for fastest conversion + switch (size_t(output) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_, _) + break; + case 0x8: + if (nsamps < 2) + break; + // the first sample is 8-byte aligned - process it to align the remainder of + // the samples to 16-bytes + item32_sc16_to_xx(input, output, 2, 1.0); + i += 2; + // do faster processing of the bulk of the samples now that we are 16-byte + // aligned + CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_, _) + break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load and store + CONVERT_SC16_1_TO_SC16_1_NSWAP_GUTS(u_, u_) + } + + // convert any remaining samples + item32_sc16_to_xx(input + i, output + i, nsamps - i, 1.0); +} + +DECLARE_CONVERTER(sc16_item32_be, 1, sc16, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast(inputs[0]); + sc16_t* output = reinterpret_cast(outputs[0]); + + size_t i = 0; + + // need to dispatch according to alignment for fastest conversion + switch (size_t(output) & 0xf) { + case 0x0: + // the data is 16-byte aligned, so do the fast processing of the bulk of the + // samples + CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_, _) + break; + case 0x8: + if (nsamps < 2) + break; + // the first sample is 8-byte aligned - process it to align the remainder of + // the samples to 16-bytes + item32_sc16_to_xx(input, output, 2, 1.0); + i += 2; + // do faster processing of the bulk of the samples now that we are 16-byte + // aligned + CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_, _) + break; + default: + // we are not 8 or 16-byte aligned, so do fast processing with the unaligned + // load and store + CONVERT_SC16_1_TO_SC16_1_BSWAP_GUTS(u_, u_) + } + + // convert any remaining samples + item32_sc16_to_xx(input + i, output + i, nsamps - i, 1.0); +} diff --git a/host/lib/convert/avx2_sc8_to_fc32.cpp b/host/lib/convert/avx2_sc8_to_fc32.cpp new file mode 100644 index 0000000000..2d73444e34 --- /dev/null +++ b/host/lib/convert/avx2_sc8_to_fc32.cpp @@ -0,0 +1,122 @@ +// +// Copyright 2012-2013 Ettus Research LLC +// Copyright 2018 Ettus Research, a National Instruments Company +// +// SPDX-License-Identifier: GPL-3.0-or-later +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +static const __m256i zeroi = _mm256_setzero_si256(); + +template +UHD_INLINE void unpack_sc32_4x(const __m256i& in, + __m256& 
out0, + __m256& out1, + __m256& out2, + __m256& out3, + const __m256& scalar) +{ + const __m256i tmplo = _mm256_unpacklo_epi8(zeroi, in); /* value in upper 8 bits */ + __m256i tmp0 = _mm256_shuffle_epi32( + _mm256_unpacklo_epi16(zeroi, tmplo), shuf); /* value in upper 16 bits */ + __m256i tmp1 = _mm256_shuffle_epi32(_mm256_unpackhi_epi16(zeroi, tmplo), shuf); + out0 = _mm256_mul_ps(_mm256_cvtepi32_ps(tmp0), scalar); + out1 = _mm256_mul_ps(_mm256_cvtepi32_ps(tmp1), scalar); + + const __m256i tmphi = _mm256_unpackhi_epi8(zeroi, in); + __m256i tmp2 = _mm256_shuffle_epi32(_mm256_unpacklo_epi16(zeroi, tmphi), shuf); + __m256i tmp3 = _mm256_shuffle_epi32(_mm256_unpackhi_epi16(zeroi, tmphi), shuf); + out2 = _mm256_mul_ps(_mm256_cvtepi32_ps(tmp2), scalar); + out3 = _mm256_mul_ps(_mm256_cvtepi32_ps(tmp3), scalar); +} + +DECLARE_CONVERTER(sc8_item32_be, 1, fc32, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast(size_t(inputs[0]) & ~0x3); + fc32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor) / (1 << 24)); + const int shuf = _MM_SHUFFLE(3, 2, 1, 0); + + size_t i = 0, j = 0; + size_t num_samps = nsamps; + + if ((size_t(inputs[0]) & 0x3) != 0) { + item32_sc8_to_xx(input++, output++, 1, scale_factor); + num_samps--; + } + +#define convert_sc8_item32_1_to_fc32_1_bswap_guts(_al_) \ + for (; j + 15 < num_samps; j += 16, i += 8) { \ + /* load from input */ \ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); \ + \ + /* unpack + swap 8-bit pairs */ \ + __m256 tmp0, tmp1, tmp2, tmp3; \ + unpack_sc32_4x(tmpi, tmp0, tmp1, tmp2, tmp3, scalar); \ + \ + /* store to output */ \ + _mm256_storeu_ps(reinterpret_cast(output + j + 0), tmp0); \ + _mm256_storeu_ps(reinterpret_cast(output + j + 4), tmp1); \ + _mm256_storeu_ps(reinterpret_cast(output + j + 8), tmp2); \ + _mm256_storeu_ps(reinterpret_cast(output + j + 12), tmp3); \ + } + + // dispatch according to alignment + if ((size_t(output) & 0xf) == 0) { + convert_sc8_item32_1_to_fc32_1_bswap_guts(_) + } else { + convert_sc8_item32_1_to_fc32_1_bswap_guts(u_) + } + + // convert remainder + item32_sc8_to_xx(input + i, output + j, num_samps - j, scale_factor); +} + +DECLARE_CONVERTER(sc8_item32_le, 1, fc32, 1, PRIORITY_SIMD) +{ + const item32_t* input = reinterpret_cast(size_t(inputs[0]) & ~0x3); + fc32_t* output = reinterpret_cast(outputs[0]); + + const __m256 scalar = _mm256_set1_ps(float(scale_factor) / (1 << 24)); + const int shuf = _MM_SHUFFLE(0, 1, 2, 3); + + size_t i = 0, j = 0; + size_t num_samps = nsamps; + + if ((size_t(inputs[0]) & 0x3) != 0) { + item32_sc8_to_xx(input++, output++, 1, scale_factor); + num_samps--; + } + +#define convert_sc8_item32_1_to_fc32_1_nswap_guts(_al_) \ + for (; j + 15 < num_samps; j += 16, i += 8) { \ + /* load from input */ \ + __m256i tmpi = _mm256_loadu_si256(reinterpret_cast(input + i)); \ + \ + /* unpack + swap 8-bit pairs */ \ + __m256 tmp0, tmp1, tmp2, tmp3; \ + unpack_sc32_4x(tmpi, tmp0, tmp1, tmp2, tmp3, scalar); \ + \ + /* store to output */ \ + _mm256_storeu_ps(reinterpret_cast(output + j + 0), tmp0); \ + _mm256_storeu_ps(reinterpret_cast(output + j + 4), tmp1); \ + _mm256_storeu_ps(reinterpret_cast(output + j + 8), tmp2); \ + _mm256_storeu_ps(reinterpret_cast(output + j + 12), tmp3); \ + } + + // dispatch according to alignment + if ((size_t(output) & 0xf) == 0) { + convert_sc8_item32_1_to_fc32_1_nswap_guts(_) + } else { + convert_sc8_item32_1_to_fc32_1_nswap_guts(u_) + } + + // convert remainder + item32_sc8_to_xx(input + i, output + j, 
+        num_samps - j, scale_factor);
+}
diff --git a/host/tests/convert_test.cpp b/host/tests/convert_test.cpp
index 7fd3f2564f..fd71f9569b 100644
--- a/host/tests/convert_test.cpp
+++ b/host/tests/convert_test.cpp
@@ -1080,7 +1080,7 @@ BOOST_AUTO_TEST_CASE(test_convert_types_fc32_and_sc8)
     }
 }
 
-BOOST_TEST_DECORATOR(*boost::unit_test::disabled())
+// BOOST_TEST_DECORATOR(*boost::unit_test::disabled())
 BOOST_AUTO_TEST_CASE(benchmark_convert_types_fc32_and_sc8)
 {
     SKIP_BENCHMARK_CHECK;
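A closing note on the pattern that recurs throughout the float-to-sc16/sc8 converters above: AVX2 pack instructions operate on each 128-bit lane independently, so _mm256_packs_epi32(a, b) does not produce sequentially ordered output. The converters therefore gather the low and high lanes with _mm256_permute2x128_si256(a, b, 0x20) / (a, b, 0x31) before packing. A standalone sketch that makes the difference visible (not part of this patch; compile with -mavx2):

    #include <immintrin.h>
    #include <cstdio>

    static void print16(const char* tag, __m256i v)
    {
        alignas(32) short buf[16];
        _mm256_store_si256(reinterpret_cast<__m256i*>(buf), v);
        std::printf("%s:", tag);
        for (int k = 0; k < 16; k++)
            std::printf(" %d", buf[k]);
        std::printf("\n");
    }

    int main()
    {
        const __m256i a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        const __m256i b = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15);

        // Per-lane pack: 0..3 8..11 4..7 12..15 -- the two inputs end up interleaved.
        print16("packs only    ", _mm256_packs_epi32(a, b));

        // Gather the low lanes and the high lanes first, then pack: 0..15 in order.
        const __m256i lo = _mm256_permute2x128_si256(a, b, 0x20);
        const __m256i hi = _mm256_permute2x128_si256(a, b, 0x31);
        print16("permute + pack", _mm256_packs_epi32(lo, hi));
        return 0;
    }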