diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..440c6f9
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,321 @@
+####################################################################################
+# START 1. Basic setup for cmake
+####################################################################################
+# basic setup for cmake
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+
+if(POLICY CMP0074)
+ cmake_policy(SET CMP0074 NEW)
+endif()
+
+set(CMAKE_INCLUDE_CURRENT_DIR ON)
+set(CMAKE_INCLUDE_DIRECTORIES_PROJECT_BEFORE ON)
+set(CMAKE_COLOR_MAKEFILE ON)
+set(CMAKE_CXX_STANDARD_REQUIRED True)
+# Allow GNU extensions
+set(CMAKE_CXX_EXTENSIONS ON)
+
+# Define the project
+project("DSA_XENGINE" VERSION 1.0.0 LANGUAGES C CXX)
+
+# For GCC 8 and lower, add the -pthread flag manually
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
+
+# add a directory for cmake modules
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
+
+# DSA_XENGINE may be built to run using CUDA or CPU. Future versions may be
+# written for HIP or SYCL. We call this the
+# target type. By default, the target is CUDA.
+#---------------------------------------------
+
+# Set from the environment variable, if present
+if(DEFINED ENV{DSA_XENGINE_TARGET})
+ set(DEFTARGET $ENV{DSA_XENGINE_TARGET})
+else()
+ set(DEFTARGET "CUDA")
+endif()
+
+set(VALID_TARGET_TYPES CUDA CPU) #HIP SYCL
+set(DSA_XENGINE_TARGET_TYPE "${DEFTARGET}" CACHE STRING "Choose the type of target, options are: ${VALID_TARGET_TYPES}")
+set_property(CACHE DSA_XENGINE_TARGET_TYPE PROPERTY STRINGS "CUDA" "CPU") # HIP SYCL
+
+string(TOUPPER ${DSA_XENGINE_TARGET_TYPE} CHECK_TARGET_TYPE)
+list(FIND VALID_TARGET_TYPES ${CHECK_TARGET_TYPE} TARGET_TYPE_VALID)
+
+if(TARGET_TYPE_VALID LESS 0)
+  message(SEND_ERROR "Please specify a valid DSA_XENGINE_TARGET_TYPE! Valid target types are: ${VALID_TARGET_TYPES}")
+endif()
+
+# Git helpers
+#------------
+find_package(Git)
+if(GIT_FOUND)
+ execute_process(
+ COMMAND ${GIT_EXECUTABLE} show
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+    RESULT_VARIABLE IS_GIT_REPOSITORY
+    OUTPUT_QUIET ERROR_QUIET)
+  if(${IS_GIT_REPOSITORY} EQUAL 0)
+ execute_process(
+ COMMAND ${GIT_EXECUTABLE} describe --abbrev=0
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+ OUTPUT_VARIABLE GITTAG
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+    # we use git rev-list and pipe that through wc here. Newer git versions support --count as an option
+    # to rev-list, but that might not always be available
+ execute_process(
+ COMMAND ${GIT_EXECUTABLE} rev-list ${GITTAG}..HEAD
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+ COMMAND wc -l
+ OUTPUT_VARIABLE GITCOUNT
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+ execute_process(
+ COMMAND ${GIT_EXECUTABLE} describe --match 1 --always --long --dirty
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+ OUTPUT_VARIABLE GITVERSION
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+ endif()
+endif(GIT_FOUND)
+
+
+option(DSA_XENGINE_BUILD_ALL_TESTS "build tests by default" ON)
+option(DSA_XENGINE_INSTALL_ALL_TESTS "install tests by default" ON)
+option(DSA_XENGINE_BUILD_SHAREDLIB "build dsaXengine as a shared lib" ON)
+
+
+# Use ExternalProject_Add for libtcc (borks with FetchContent)
+# Use ExternalProject_Add for CUTLASS (long build time, version 2.11.0 for sm_8x arch)
+include(ExternalProject)
+
+# Use FetchContent for lightweight dependencies
+include(FetchContent)
+
+# CUDA based dependencies and options
+#------------------------------------
+if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA)
+
+ # CUDA specific part of CMakeLists
+ enable_language(CUDA)
+ find_package(CUDAToolkit REQUIRED)
+
+  # Get GPU architecture from the environment, or set the default (sm_80)
+ if(DEFINED ENV{DSA_XENGINE_GPU_ARCH})
+ set(DSA_XENGINE_DEFAULT_GPU_ARCH $ENV{DSA_XENGINE_GPU_ARCH})
+ else()
+ set(DSA_XENGINE_DEFAULT_GPU_ARCH sm_80)
+ endif()
+
+ if(NOT DSA_XENGINE_GPU_ARCH)
+ message(STATUS "Building DSA_XENGINE for GPU ARCH " "${DSA_XENGINE_DEFAULT_GPU_ARCH}")
+ endif()
+
+ set(DSA_XENGINE_GPU_ARCH
+ ${DSA_XENGINE_DEFAULT_GPU_ARCH}
+    CACHE STRING "set the GPU architecture (sm_60, sm_70, sm_80, sm_90)")
+ set_property(CACHE DSA_XENGINE_GPU_ARCH PROPERTY STRINGS sm_60 sm_70 sm_80 sm_90)
+ set(DSA_XENGINE_GPU_ARCH_SUFFIX
+ ""
+ CACHE STRING "set the GPU architecture suffix (virtual, real). Leave empty for no suffix.")
+ set_property(CACHE DSA_XENGINE_GPU_ARCH_SUFFIX PROPERTY STRINGS "real" "virtual" " ")
+ #set(CMAKE_CUDA_ARCHITECTURES ${DSA_XENGINE_GPU_ARCH})
+ #mark_as_advanced(DSA_XENGINE_GPU_ARCH_SUFFIX)
+ #mark_as_advanced(CMAKE_CUDA_ARCHITECTURES)
+
+ # Set CUDA based methods and dependencies
+ #----------------------------------------
+
+ # This is the default GPU method
+  option(DSA_XENGINE_ENABLE_CUBLAS "Use cuBLAS for correlators" ON)
+
+ # All other GPU methods can be enabled at compile time and
+ # toggled for use at run time, if enabled.
+
+ # Get TCC dependency
+  option(DSA_XENGINE_ENABLE_TCC "Use TensorCoreCorrelators for correlators" OFF)
+ if(DSA_XENGINE_ENABLE_TCC)
+ add_compile_definitions(DSA_XENGINE_ENABLE_TCC)
+ option(DSA_XENGINE_DOWNLOAD_TCC "Download, build, link (and install) TCC" OFF)
+ if(DSA_XENGINE_DOWNLOAD_TCC)
+ ExternalProject_Add(TCC
+ GIT_REPOSITORY https://git.astron.nl/RD/tensor-core-correlator
+ #GIT_TAG 11d8a4a504d7073a2a33b81e1e387b12e58a420c
+ CMAKE_ARGS "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
+ )
+ else()
+ find_package(libtcc REQUIRED)
+ endif()
+ endif()
+
+ # Get CUTLASS dependency
+ option(DSA_XENGINE_ENABLE_CUTLASS "Use CUTLASS for GEMMs" OFF)
+ if(DSA_XENGINE_ENABLE_CUTLASS)
+ add_compile_definitions(DSA_XENGINE_ENABLE_CUTLASS)
+    option(DSA_XENGINE_DOWNLOAD_CUTLASS "Download, build (only the required kernels), link, and install CUTLASS" OFF)
+ if(DSA_XENGINE_DOWNLOAD_CUTLASS)
+ # Custom CUTLASS build
+ ExternalProject_Add(NvidiaCutlass
+ GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git
+ GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
+ CMAKE_ARGS
+ "-DCUTLASS_NVCC_ARCHS_ENABLED=89"
+ "-DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex"
+ "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
+ )
+ else()
+ find_package(NvidiaCutlass REQUIRED)
+ endif()
+ endif()
+
+ # Get MAGMA dependency
+ option(DSA_XENGINE_ENABLE_MAGMA "Use MAGMA for GEMMs" OFF)
+ if(DSA_XENGINE_ENABLE_MAGMA)
+ add_compile_definitions(DSA_XENGINE_ENABLE_MAGMA)
+    option(DSA_XENGINE_DOWNLOAD_MAGMA "Download, build (only the required kernels), link, and install MAGMA" OFF)
+ if(DSA_XENGINE_DOWNLOAD_MAGMA)
+ # Custom MAGMA build
+ ExternalProject_Add(Magma
+ URL https://icl.utk.edu/projectsfiles/magma/downloads/magma-2.8.0.tar.gz
+ CMAKE_ARGS
+ "-DMAGMA_ENABLE_CUDA=ON"
+ "-DGPU_TARGET=sm_80"
+ "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
+ )
+ else()
+ find_package(Magma REQUIRED)
+ endif()
+ endif()
+
+ # Get XGPU dependency (fix install)
+  option(DSA_XENGINE_ENABLE_XGPU "Use xGPU for correlators" OFF)
+ if(DSA_XENGINE_ENABLE_XGPU)
+ add_compile_definitions(DSA_XENGINE_ENABLE_XGPU)
+ option(DSA_XENGINE_DOWNLOAD_XGPU "Download and build xGPU" OFF)
+ if(DSA_XENGINE_DOWNLOAD_XGPU)
+ # Download, build and install
+ FetchContent_Declare(
+ xGPU
+ GIT_REPOSITORY https://github.com/cpviolator/xGPU.git
+ #GIT_TAG 13b7fff1eac497236eb9c38e179aed3b532a88f2
+ )
+ FetchContent_MakeAvailable(XGPU)
+ else()
+ # Find and link to local install
+ find_package(xGPU REQUIRED)
+ endif()
+ endif()
+
+endif() # CUDA functionality
+
+# Get CPU based dependencies
+# Get OPENBLAS dependency
+option(DSA_XENGINE_ENABLE_OPENBLAS "Use OPENBLAS for GEMMs" OFF)
+if(DSA_XENGINE_ENABLE_OPENBLAS)
+ add_compile_definitions(DSA_XENGINE_ENABLE_OPENBLAS)
+ option(DSA_XENGINE_DOWNLOAD_OPENBLAS "Download, build, link, and install OPENBLAS" OFF)
+ if(DSA_XENGINE_DOWNLOAD_OPENBLAS)
+ # Custom OPENBLAS build
+ ExternalProject_Add(Openblas
+ GIT_REPOSITORY https://github.com/OpenMathLib/OpenBLAS.git
+ GIT_TAG ce3f668
+ CMAKE_ARGS
+ #"-DOPENBLAS_ENABLE_CUDA=ON"
+ #"-DGPU_TARGET=sm_80"
+ "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
+ )
+ else()
+ find_package(Openblas REQUIRED)
+ endif()
+endif()
+
+# Get psrdada dependency
+option(DSA_XENGINE_ENABLE_PSRDADA "Use PSRDada for IO" ON)
+if(DSA_XENGINE_ENABLE_PSRDADA)
+  option(DSA_XENGINE_DOWNLOAD_PSRDADA "Download and build PSRDada" ON)
+  if(DSA_XENGINE_DOWNLOAD_PSRDADA)
+    # Download, build and install
+    FetchContent_Declare(
+      PSRDada
+      GIT_REPOSITORY git://git.code.sf.net/p/psrdada/code
+      )
+    FetchContent_MakeAvailable(PSRDada)
+  else()
+    # Find and link to local install
+    find_package(PSRDada REQUIRED)
+  endif()
+endif()
+
+# Get HDF5 dependency
+option(DSA_XENGINE_ENABLE_HDF5 "Use HDF5 for data IO" OFF)
+if(DSA_XENGINE_ENABLE_HDF5)
+  option(DSA_XENGINE_DOWNLOAD_HDF5 "Download and build HDF5" OFF)
+ if(DSA_XENGINE_DOWNLOAD_HDF5)
+ # Download, build and install
+ FetchContent_Declare(
+ HDF5
+ GIT_REPOSITORY https://github.com/HDFGroup/hdf5.git
+ GIT_TAG 5794814
+ )
+ FetchContent_MakeAvailable(HDF5)
+ else()
+ # Find and link to local install
+ find_package(HDF5 REQUIRED)
+ endif()
+endif()
+
+# Get CLI11 dependency
+# FIX ME: get static .hpp version and ship with package
+option(DSA_XENGINE_ENABLE_CLI11 "Enable CLI11 (required)" ON)
+if(DSA_XENGINE_ENABLE_CLI11)
+ option(DSA_XENGINE_DOWNLOAD_CLI11 "Download and build CLI11" ON)
+ if(DSA_XENGINE_DOWNLOAD_CLI11)
+ # Download, build and install
+ FetchContent_Declare(
+ CLI11
+ GIT_REPOSITORY https://github.com/CLIUtils/CLI11.git
+ GIT_TAG main
+ )
+ FetchContent_MakeAvailable(CLI11)
+ else()
+ # Find and link to local install
+ find_package(CLI11 REQUIRED)
+ endif()
+endif()
+
+
+# Get ZFP dependency
+option(DSA_XENGINE_ENABLE_ZFP "Enable ZFP" OFF)
+if(DSA_XENGINE_ENABLE_ZFP)
+ option(DSA_XENGINE_DOWNLOAD_ZFP "Download and build ZFP" OFF)
+ if(DSA_XENGINE_DOWNLOAD_ZFP)
+ # Download, build and install
+ FetchContent_Declare(
+ ZFP
+ GIT_REPOSITORY https://github.com/LLNL/zfp.git
+ GIT_TAG f40868a
+ )
+ FetchContent_MakeAvailable(ZFP)
+ else()
+ # Find and link to local install
+ find_package(ZFP REQUIRED)
+ endif()
+endif()
+
+# Add src, include, tests, and legacy
+add_subdirectory(src)
+add_subdirectory(include)
+add_subdirectory(tests)
+option(DSA_XENGINE_BUILD_LEGACY "Build legacy code (will not install if built)" OFF)
+if(DSA_XENGINE_BUILD_LEGACY)
+ add_subdirectory(legacy)
+endif()
+
+# Install project cmake targets
+include(GNUInstallDirs)
+include(CMakePackageConfigHelpers)
+write_basic_package_version_file(
+ ${PROJECT_NAME}-config-version.cmake
+ VERSION ${DSA_XENGINE_VERSION}
+ COMPATIBILITY AnyNewerVersion
+ )
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake
+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
+ )
diff --git a/README.md b/README.md
index 03fe5e3..f771017 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,9 @@
# dsa110-xengine
+
+
+
+
This repo contains code used for the DSA X-engine. The requirements are to:
- capture SNAP F-engine packets on an ethernet interface, and place them in a psrdada buffer
@@ -67,11 +71,4 @@ Finally, `dsaX_dbnic` and `dsaX_nicdb` implement the corner turn to feed `mbheim
### scripts and utils
-The "scripts" dir contains some useful scripts to test various aspects of the system (corr, bf, cornerturn). The "utils" dir includes functionality to generate fake data and beamforming weights.
-
-
-
-
-
-
-
+The "scripts" dir contains some useful scripts to test various aspects of the system (corr, bf, cornerturn). The "utils" dir includes functionality to generate fake data and beamforming weights.
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
new file mode 100644
index 0000000..9a7cbbd
--- /dev/null
+++ b/include/CMakeLists.txt
@@ -0,0 +1,21 @@
+enable_language(CUDA)
+
+# install step for header files
+#------------------------------
+set(DSA_XENGINE_HEADERS
+ # cmake-format: sortable
+ dsaX.h
+ dsaX_def.h
+ dsaX_malloc.h
+ dsaX_ptr.h
+ fast_time_domain.h
+ cuda_interface.h
+ cuda_handles.h
+ cuda_headers.h
+ dsaX_capture.h
+ dsaX_capture_manythread.h
+ dsaX_capture_pcap.h
+ cutlass_interface.h
+ )
+install(FILES ${DSA_XENGINE_HEADERS} DESTINATION include)
+#------------------------------
diff --git a/include/blas_interface.h b/include/blas_interface.h
new file mode 100644
index 0000000..d643e08
--- /dev/null
+++ b/include/blas_interface.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "interface.h"
+
+void dsaXHgemmStridedBatched(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param, int stream = 0);
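+
+// The planar-complex batched product above can be served by four real GEMMs.
+// A hedged sketch of the usual decomposition (not necessarily the exact
+// implementation in src): with A = A_r + i*A_i and B = B_r + i*B_i,
+//
+//   C_r = A_r * B_r - A_i * B_i
+//   C_i = A_r * B_i + A_i * B_r
+//
+// so each call maps to four cublasHgemmStridedBatched invocations on the
+// separate real/imaginary planes, with alpha = +/-1 and beta = 1 to accumulate.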
diff --git a/include/cublas_interface.h b/include/cublas_interface.h
new file mode 100644
index 0000000..f68eea3
--- /dev/null
+++ b/include/cublas_interface.h
@@ -0,0 +1,4 @@
+#pragma once
+#include "dsaX.h"
+
+void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param, int stream);
diff --git a/include/cuda_handles.h b/include/cuda_handles.h
new file mode 100644
index 0000000..eeaf706
--- /dev/null
+++ b/include/cuda_handles.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include <vector>
+
+#include "utils.h"
+
+#ifdef DSA_XENGINE_TARGET_CUDA
+#include "cuda_headers.h"
+
+static std::vector<cudaStream_t> streams;
+static cublasHandle_t cublasH = NULL;
+
+static bool cublas_init = false;
+static bool stream_init = false;
+
+cudaStream_t get_stream(unsigned int i);
+#endif
+
+void init_streams(unsigned int n_streams);
+void destroy_streams();
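+
+// Example usage (a hedged sketch; assumes the CUDA target is active):
+//
+//   init_streams(4);                   // create four CUDA streams
+//   cudaStream_t s = get_stream(0);    // fetch stream 0 for async work
+//   ...                                // issue copies/kernels on s
+//   destroy_streams();                 // release all streams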
diff --git a/include/cuda_headers.h b/include/cuda_headers.h
new file mode 100644
index 0000000..333a5bc
--- /dev/null
+++ b/include/cuda_headers.h
@@ -0,0 +1,8 @@
+#pragma once
+
+#if defined (DSA_XENGINE_TARGET_CUDA)
+#include <cuda_runtime.h>
+#include "cuda_fp16.h"
+#include <cublas_v2.h>
+#include <cuda.h>
+#endif
diff --git a/include/cuda_interface.h b/include/cuda_interface.h
new file mode 100644
index 0000000..42043e2
--- /dev/null
+++ b/include/cuda_interface.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include <vector>
+
+#include "dsaX_def.h"
+#include "enums.h"
+#include "dsaX.h"
+
+void dsaXInitCuda(int dev);
+void dsaXDestroyCuda();
+
+void initBLASCuda();
+void destroyBLASCuda();
+
+void initStreamsCuda(unsigned int n);
+void destroyStreamsCuda();
+
+void promoteComplexCharToPlanarHalfCuda(corr_handle *d, unsigned int stream);
+
+void initializeCorrCudaMemory(corr_handle *d, unsigned int n_streams);
+
+void initializeBFCudaMemory(bf_handle *d);
+
+void deallocateCorrCudaMemory(corr_handle *d);
+
+void deallocateBFCudaMemory(bf_handle *d);
+
+void dsaXmemsetCuda(void *array, int ch, size_t n);
+
+void dsaXmemcpyCuda(void *array_device, void *array_host, size_t n, dsaXMemcpyKind kind, int stream);
+
+void *dsaXHostRegisterCuda(size_t size);
+
+void dsaXDeviceSynchronizeCuda();
+
+void reorderCorrOutputCuda(corr_handle *d, int stream);
+
+void computeIndicesCuda(corr_handle *d);
+
+void reorderCorrInputCuda(corr_handle *d, int stream);
+
+void calcWeightsCuda(bf_handle *d);
+
+template <typename in_prec, typename out_prec> void transposeMatrixCuda(in_prec *idata, out_prec *odata);
+
+void transposeInputBeamformerCuda(double *idata, double *odata, std::vector<int> &dim_block_in, std::vector<int> &dim_grid_in);
+
+void transposeScaleBeamformerCuda(void *real, void *imag, unsigned char *output, std::vector<int> &dim_block_in, std::vector<int> &dim_grid_in);
+
+void fluffInputBeamformerCuda(char *input, void *b_real, void *b_imag, int blocks, int tpb);
+
+void sumBeamCuda(unsigned char *input, float *output, int blocks, int tpb);
diff --git a/include/cuda_kernels.h b/include/cuda_kernels.h
new file mode 100644
index 0000000..d57a11b
--- /dev/null
+++ b/include/cuda_kernels.h
@@ -0,0 +1,340 @@
+#pragma once
+
+#include "cuda_headers.h"
+
+__global__ void inspectPackedDataInKernel(char input, int i) {
+ float re = (float)((char)(( (unsigned char)(input) & (unsigned char)(15) ) << 4) >> 4);
+ float im = (float)((char)(( (unsigned char)(input) & (unsigned char)(240))) >> 4);
+
+ if(re != 0 || im != 0) printf("K val[%d] = (%f,%f)\n", i, re, im);
+}
+
+// KERNELS
+// DMH: Abstract hardcoded launch parameters
+__global__ void transpose_input_beamformer(double *idata, double *odata) {
+
+ __shared__ double tile[16][17][4];
+
+ int x = blockIdx.x * 16 + threadIdx.x;
+ int y = blockIdx.y * 16 + threadIdx.y;
+ int width = gridDim.x * 16;
+
+ for (int j = 0; j < 16; j += 8) {
+ tile[threadIdx.y+j][threadIdx.x][0] = idata[4*((y+j)*width + x)];
+ tile[threadIdx.y+j][threadIdx.x][1] = idata[4*((y+j)*width + x)+1];
+ tile[threadIdx.y+j][threadIdx.x][2] = idata[4*((y+j)*width + x)+2];
+ tile[threadIdx.y+j][threadIdx.x][3] = idata[4*((y+j)*width + x)+3];
+ }
+
+ __syncthreads();
+
+ x = blockIdx.y * 16 + threadIdx.x; // transpose block offset
+ y = blockIdx.x * 16 + threadIdx.y;
+ width = gridDim.y * 16;
+
+ for (int j = 0; j < 16; j += 8) {
+ odata[4*((y+j)*width + x)] = tile[threadIdx.x][threadIdx.y + j][0];
+ odata[4*((y+j)*width + x)+1] = tile[threadIdx.x][threadIdx.y + j][1];
+ odata[4*((y+j)*width + x)+2] = tile[threadIdx.x][threadIdx.y + j][2];
+ odata[4*((y+j)*width + x)+3] = tile[threadIdx.x][threadIdx.y + j][3];
+ }
+}
+
+// kernel to help with reordering output
+// outr and outi are [NANTS, NANTS, NCHAN_PER_PACKET, 2time, 2pol, halfFac]
+// run with NCHAN_PER_PACKET*2*NBASE/128 blocks of 128 threads
+__global__ void corr_output_copy(half *outr, half *outi, float *output, int *indices_lookup) {
+
+ int bidx = blockIdx.x; // assume NCHAN_PER_PACKET*2*NBASE/128
+ int tidx = threadIdx.x; // assume 128
+ int idx = blockDim.x * bidx + tidx;
+
+ int baseline = (int)(idx / (NCHAN_PER_PACKET * 2));
+ int chpol = (int)(idx % (NCHAN_PER_PACKET * 2));
+ int ch = (int)(chpol / 2);
+ int base_idx = indices_lookup[baseline];
+ int iidx = base_idx * NCHAN_PER_PACKET + ch;
+ int pol = (int)(chpol % 2);
+
+ float v1=0., v2=0.;
+
+ //if(idx<1) printf("output pre (%f, %f)\n", output[2*idx], output[2*idx+1]);
+
+ // Use CUDA casting intrinsic __half2float
+  for (int i=0;i<halfFac;i++) {
+    v1 += __half2float(outr[(4*iidx+pol)*halfFac+i]) + __half2float(outr[(4*iidx+2+pol)*halfFac+i]);
+    v2 += __half2float(outi[(4*iidx+pol)*halfFac+i]) + __half2float(outi[(4*iidx+2+pol)*halfFac+i]);
+  }
+
+  output[2*idx] += v1;
+  output[2*idx+1] += v2;
+}
+
+// transpose kernel
+// assume breakdown into tiles of 32x32, and run with 32x8 threads per block
+// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32)
+// here, width is the dimension of the fastest index
+template <typename in_prec, typename out_prec> __global__ void transpose_matrix(in_prec * idata, out_prec * odata) {
+
+ __shared__ in_prec tile[32][33];
+
+ int x = blockIdx.x * 32 + threadIdx.x;
+ int y = blockIdx.y * 32 + threadIdx.y;
+ int width = gridDim.x * 32;
+
+ for (int j = 0; j < 32; j += 8) {
+ tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
+ //inspectPackedDataInKernel(idata[(y+j)*width + x], (y+j)*width + x);
+ }
+
+ __syncthreads();
+
+ x = blockIdx.y * 32 + threadIdx.x; // transpose block offset
+ y = blockIdx.x * 32 + threadIdx.y;
+ width = gridDim.y * 32;
+
+ for (int j = 0; j < 32; j += 8) {
+ odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
+ //inspectPackedDataInKernel(odata[(y+j)*width + x], (y+j)*width + x);
+ }
+}
+
+// transpose kernel
+// assume breakdown into tiles of 32x32, and run with 32x8 threads per block
+// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32)
+// here, width is the dimension of the fastest index
+__global__ void transpose_matrix_float(half * idata, half * odata) {
+
+ __shared__ float tile[32][33];
+
+ int x = blockIdx.x * 32 + threadIdx.x;
+ int y = blockIdx.y * 32 + threadIdx.y;
+ int width = gridDim.x * 32;
+
+ for (int j = 0; j < 32; j += 8) {
+ tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
+ //printf("K transpose_matrix_float_in[%d] = %f\n", (y+j)*width + x, __half2float(idata[(y+j)*width + x]));
+ }
+
+ __syncthreads();
+
+ x = blockIdx.y * 32 + threadIdx.x; // transpose block offset
+ y = blockIdx.x * 32 + threadIdx.y;
+ width = gridDim.y * 32;
+
+ for (int j = 0; j < 32; j += 8) {
+ odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
+ //printf("K transpose_matrix_float_out[%d] = %f\n", (y+j)*width + x, __half2float(odata[(y+j)*width + x]));
+ }
+}
+
+
+// DMH: TUNABLE
+// transpose kernel
+// assume breakdown into tiles of 32x32, and run with 32x8 threads per block
+// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32)
+// here, width is the dimension of the fastest index
+__global__ void transpose_matrix_char(char * idata, char * odata) {
+
+ __shared__ char tile[32][33];
+ //extern __shared__ char tile[];
+
+ int x = blockIdx.x * blockDim.x + threadIdx.x;
+ int y = blockIdx.y * blockDim.x + threadIdx.y;
+ int width = gridDim.x * blockDim.x;
+
+ for (int j = 0; j < blockDim.x; j += blockDim.y) {
+ tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
+ //tile[(threadIdx.y+j)*blockDim.x + threadIdx.x] = idata[(y+j)*width + x];
+ //inspectPackedDataInKernel(idata[(y+j)*width + x], (y+j)*width + x);
+ }
+
+ __syncthreads();
+
+ x = blockIdx.y * blockDim.x + threadIdx.x; // transpose block offset
+ y = blockIdx.x * blockDim.x + threadIdx.y;
+ width = gridDim.y * blockDim.x;
+
+ for (int j = 0; j < blockDim.x; j += blockDim.y) {
+ odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
+ //odata[(y+j)*width + x] = tile[threadIdx.x + blockDim.x*(threadIdx.y + j)];
+ //inspectPackedDataInKernel(idata[(y+j)*width + x], (y+j)*width + x);
+ }
+}
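+
+// Illustrative host-side launch of the char transpose above, following the
+// "32x32 tiles, 32x8 threads" comments. A hedged sketch: the helper name is
+// ours, and width/height are assumed to be multiples of 32.
+inline void launch_transpose_matrix_char(char *d_in, char *d_out, int width, int height) {
+  dim3 dimBlock(32, 8);
+  dim3 dimGrid(width / 32, height / 32);
+  transpose_matrix_char<<<dimGrid, dimBlock>>>(d_in, d_out);
+}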
+
+
+/**
+ * Promote complex char riri... data to planar half rr.. ii..
+ *
+ * @param[out] inr Half precision real array
+ * @param[out] ini Half precision imag array
+ * @param[in] input Char precision complex array
+ */
+__global__ void promoteComplexCharToPlanarHalf(char *input, half *inr, half *ini) {
+
+ int bidx = blockIdx.x;
+ int tidx = threadIdx.x;
+ int iidx = blockDim.x * bidx + tidx;
+
+ // 15 in unsigned char binary is 00001111. Perform bitwise & on 15 and input char data iiiirrrr
+ // to get real part 4 bit data.
+ // 0000rrrr
+ // Bit shift this result by 4 to the left.
+ // rrrr0000
+ // Cast to signed char.
+ // +-rrr0000
+  // Bitshift mantissa only to the right by 4 bits
+ // +-0000rrr
+ // Cast to float and use CUDA intrinsic to cast to signed half
+ inr[iidx] = __float2half((float)((char)(( (unsigned char)(input[iidx]) & (unsigned char)(15) ) << 4) >> 4));
+
+ // 240 in unsigned char binary is 11110000. Perform bitwise & on 240 and input char data iiiirrrr
+ // to get imag part 4 bit data
+ // iiii0000.
+ // Cast to signed char
+ // +-iii0000
+  // Bitshift mantissa only to the right by 4 bits
+ // +-0000iii
+ // Cast to float and use CUDA intrinsic to cast to signed half
+ ini[iidx] = __float2half((float)((char)(( (unsigned char)(input[iidx]) & (unsigned char)(240) )) >> 4));
+
+ //good
+ //if(__half2float(inr[iidx]) != 0 || __half2float(ini[iidx]) != 0) printf("corr_input_copy %i = (%f,%f)\n", iidx, __half2float(inr[iidx]), __half2float(ini[iidx]));
+}
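+
+// Host/device reference for the packed (4b,4b) format documented above: the
+// low nibble holds the real part, the high nibble the imaginary part, both
+// signed 4-bit values in [-8, 7]. Useful for validating kernel output on the
+// host (the helper name is ours).
+__host__ __device__ inline void unpack4b4b(char packed, float &re, float &im) {
+  re = (float)((char)(((unsigned char)packed & (unsigned char)15) << 4) >> 4);
+  im = (float)((char)((unsigned char)packed & (unsigned char)240) >> 4);
+}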
+
+// kernel to populate an instance of weights matrix
+// [2, (NCHAN_PER_PACKET/8), NBEAMS/2, 4times*(NANTS/2)*8chan*2tim*2pol]
+// run with 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128 blocks of 128 threads
+// TUNABLE
+__global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, float * calibs, half * wr, half * wi, float * fqs) {
+
+ int bidx = blockIdx.x;
+ int tidx = threadIdx.x;
+ int inidx = 128 * bidx + tidx;
+
+ // 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)
+
+ // get indices
+ int iArm = (int)(inidx / ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)));
+ int iidx = (int)(inidx % ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)));
+ int fq = (int)(iidx / (128*(NANTS/2)*(NBEAMS/2)));
+ int idx = (int)(iidx % (128*(NANTS/2)*(NBEAMS/2)));
+ int bm = (int)(idx / (128*(NANTS/2)));
+ int tactp = (int)(idx % (128*(NANTS/2)));
+ //int t = (int)(tactp / (32*(NANTS/2)));
+ int actp = (int)(tactp % (32*(NANTS/2)));
+ int a = (int)(actp / 32);
+ int ctp = (int)(actp % 32);
+ //int c = (int)(ctp / 4);
+ int tp = (int)(ctp % 4);
+ //int t2 = (int)(tp / 2);
+ int pol = (int)(tp % 2);
+ int widx = (a+48*iArm)*(NCHAN_PER_PACKET/8)*2*2 + fq*2*2 + pol*2;
+
+ // calculate weights
+ float theta, afac, twr, twi;
+ if (iArm==0) {
+ theta = sep*(127.-bm*1.)*PI/10800.; // radians
+ afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate
+ twr = cos(afac*antpos_e[a+48*iArm]);
+ twi = sin(afac*antpos_e[a+48*iArm]);
+ wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1]));
+ wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1]));
+ //wr[inidx] = __float2half(calibs[widx]);
+ //wi[inidx] = __float2half(calibs[widx+1]);
+ }
+ if (iArm==1) {
+ theta = sep*(127.-bm*1.)*PI/10800.; // radians
+ afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate
+ twr = cos(afac*antpos_n[a+48*iArm]);
+ twi = sin(afac*antpos_n[a+48*iArm]);
+ wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1]));
+ wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1]));
+ //wr[inidx] = __float2half(calibs[widx]);
+ //wi[inidx] = __float2half(calibs[widx+1]);
+ }
+}
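+
+// Written out, the weight applied in both arms is the steering phase times
+// the calibration solution:
+//   theta_b    = sep * (127 - b) * pi / 10800            (beam offset, radians)
+//   w_{b,a}(f) = exp(-2*pi*i * f * theta_b * x_a / CVAC) * cal_a(f)
+// where x_a is the east (arm 0) or north (arm 1) antenna position, and the
+// complex multiply by cal_a(f) is expanded into the wr/wi expressions above.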
+
+// kernel to fluff input bf data
+// run with NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128 blocks of 128 threads
+__global__ void fluff_input_beamformer(char * input, half * dr, half * di) {
+
+ int bidx = blockIdx.x;
+ int tidx = threadIdx.x;
+ int idx = blockDim.x * bidx + tidx;
+
+ dr[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(15)) << 4) >> 4)));
+ di[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(240))) >> 4)));
+
+  // Both results are signed 4-bit integers in [-8, 7], scaled by 1/64 (0.015625) and stored as FP16.
+ //half re = dr[idx];
+ //half im = di[idx];
+ //half lim = 0;
+ //if( (re > lim || re < -lim) || (im > lim || im < -lim)) {
+ //printf("re = %f, im = %f\n", __half2float(re), __half2float(im));
+ //}
+}
+
+// transpose, add and scale kernel for bf
+// assume breakdown into tiles of 16x16, and run with 16x8 threads per block
+// launch with dim3 dimBlock(16, 8) and dim3 dimGrid((NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16, (NCHAN_PER_PACKET/8)/16)
+// a fixed 1/128 scale factor enables recasting the summed power as unsigned char
+__global__ void transpose_scale_beamformer(half * ir, half * ii, unsigned char * odata) {
+
+ __shared__ float tile[16][17];
+
+ int x = blockIdx.x * 16 + threadIdx.x;
+ int y = blockIdx.y * 16 + threadIdx.y;
+ int width = gridDim.x * 16;
+ float dr, di;
+
+ for (int j = 0; j < 16; j += 8) {
+ dr = (float)(ir[(y+j)*width + x]);
+ di = (float)(ii[(y+j)*width + x]);
+ tile[threadIdx.y+j][threadIdx.x] = (dr*dr+di*di);
+ }
+
+ __syncthreads();
+
+ x = blockIdx.y * 16 + threadIdx.x; // transpose block offset
+ y = blockIdx.x * 16 + threadIdx.y;
+ width = gridDim.y * 16;
+
+ for (int j = 0; j < 16; j += 8)
+ odata[(y+j)*width + x] = (unsigned char)(tile[threadIdx.x][threadIdx.y + j]/128.);
+
+}
+
+// sum over all times in output beam array
+// run with (NCHAN_PER_PACKET/8)*(NBEAMS/2) blocks of (NPACKETS_PER_BLOCK/4) threads
+__global__ void sum_beam(unsigned char *input, float *output) {
+
+ __shared__ float summ[512];
+ int bidx = blockIdx.x;
+ int tidx = threadIdx.x;
+ //int idx = bidx*256+tidx;
+ int bm = (int)(bidx/48);
+ int ch = (int)(bidx % 48);
+
+ summ[tidx] = (float)(input[bm*256*48 + tidx*48 + ch]);
+
+ __syncthreads();
+
+  // Tree reduction in shared memory, synchronizing between strides to
+  // avoid read-after-write races across warps
+  for (int s = 256; s > 0; s >>= 1) {
+    if (tidx < s) summ[tidx] += summ[tidx + s];
+    __syncthreads();
+  }
+
+ if (tidx==0) output[bidx] = summ[tidx];
+}
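+
+// Illustrative launch for sum_beam, following the comment above: one block
+// per (beam, channel) pair, NPACKETS_PER_BLOCK/4 = 512 threads each. A hedged
+// sketch (the helper name is ours); constants come from dsaX_def.h.
+inline void launch_sum_beam(unsigned char *d_in, float *d_out) {
+  dim3 blocks((NCHAN_PER_PACKET / 8) * (NBEAMS / 2)); // 48 * 256 blocks
+  dim3 threads(NPACKETS_PER_BLOCK / 4);               // 512 threads
+  sum_beam<<<blocks, threads>>>(d_in, d_out);
+}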
diff --git a/include/cutlass_interface.h b/include/cutlass_interface.h
new file mode 100644
index 0000000..f95eeaa
--- /dev/null
+++ b/include/cutlass_interface.h
@@ -0,0 +1,172 @@
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/device_memory.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/host_tensor_planar_complex.h"
+#include "cutlass/util/reference/device/tensor_fill.h"
+#include "cutlass/util/reference/device/gemm_planar_complex.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/library/handle.h"
+
+using namespace cutlass;
+using namespace gemm;
+using namespace library;
+using namespace layout;
+using namespace reference;
+using namespace device;
+
+// Result structure
+struct Result {
+
+ double runtime_ms;
+ double gflops;
+ Status status;
+ cudaError_t error;
+ bool passed;
+
+ Result(double runtime_ms = 0, double gflops = 0, Status status = Status::kSuccess, cudaError_t error = cudaSuccess):
+ runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { }
+};
+
+// Command line options parsing (testing)
+struct Options {
+
+ bool help;
+ GemmCoord problem_size;
+ int batch_count;
+  complex<float> alpha;
+  complex<float> beta;
+ bool reference_check;
+ int iterations;
+
+  Options():
+    help(false),
+    problem_size({1024, 1024, 1024}),
+    batch_count(256),
+    alpha(1),
+    beta(0),
+    reference_check(false),
+    iterations(2) { }
+
+ // Parses the command line
+ void parse(int argc, char const **args) {
+
+ CommandLine cmd(argc, args);
+ if (cmd.check_cmd_line_flag("help")) {
+ help = true;
+ }
+
+ cmd.get_cmd_line_argument("m", problem_size.m());
+ cmd.get_cmd_line_argument("n", problem_size.n());
+ cmd.get_cmd_line_argument("k", problem_size.k());
+ cmd.get_cmd_line_argument("batch", batch_count);
+
+ cmd.get_cmd_line_argument("alpha", alpha.real());
+ cmd.get_cmd_line_argument("alpha_i", alpha.imag());
+ cmd.get_cmd_line_argument("beta", beta.real());
+ cmd.get_cmd_line_argument("beta_i", beta.imag());
+
+ cmd.get_cmd_line_argument("iterations", iterations);
+ }
+
+ /// Prints the usage statement.
+ std::ostream & print_usage(std::ostream &out) const {
+
+    out << "dsaX_cutlass_interface\n\n"
+        << "  This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n"
+        << "Options:\n\n"
+        << "  --help                      If specified, displays this usage statement.\n\n"
+        << "  --m=<int>                   GEMM M dimension\n"
+        << "  --n=<int>                   GEMM N dimension\n"
+        << "  --k=<int>                   GEMM K dimension\n"
+        << "  --batch=<int>               Number of GEMM operations executed in one batch\n"
+        << "  --alpha=<f32>               Epilogue scalar alpha (real part)\n"
+        << "  --alpha_i=<f32>             Epilogue scalar alpha (imaginary part)\n"
+        << "  --beta=<f32>                Epilogue scalar beta (real part)\n"
+        << "  --beta_i=<f32>              Epilogue scalar beta (imaginary part)\n\n"
+        << "  --iterations=<int>          Number of profiling iterations to perform.\n";
+
+ return out;
+ }
+
+ /// Compute performance in GFLOP/s
+ double gflops(double runtime_s) const {
+
+ // Number of real-valued multiply-adds
+ int64_t fmas = problem_size.product() * batch_count * 4;
+
+ // Two flops per multiply-add
+ return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
+ }
+};
+
+/// Performance test environment for planar complex
+class DSA_FTD_ComplexGEMM_CUTLASS {
+
+ // Half-precision input and output
+ using Element = half_t;
+
+ // Configurations for layouts and internal computation
+ using LayoutA = ColumnMajor;
+ using LayoutB = ColumnMajor;
+ using LayoutC = ColumnMajor;
+ using ElementCompute = float;
+ using ElementAccumulator = float;
+
+ Handle handle;
+
+ GemmCoord problem_size;
+ int batch_count;
+  DeviceAllocation<Element> tensor_A;
+  DeviceAllocation<Element> tensor_B;
+  DeviceAllocation<Element> tensor_C;
+  DeviceAllocation<Element> tensor_D;
+  DeviceAllocation<Element> tensor_D_ref;
+
+  DeviceAllocation<void *> ptr_A_real;
+  DeviceAllocation<void *> ptr_A_imag;
+  DeviceAllocation<void *> ptr_B_real;
+  DeviceAllocation<void *> ptr_B_imag;
+  DeviceAllocation<void *> ptr_C_real;
+  DeviceAllocation<void *> ptr_C_imag;
+  DeviceAllocation<void *> ptr_D_real;
+  DeviceAllocation<void *> ptr_D_imag;
+
+ Element *ptr_A;
+ Element *ptr_B;
+ Element *ptr_C;
+ Element *ptr_D;
+
+ int64_t batch_stride_A;
+ int64_t batch_stride_B;
+ int64_t batch_stride_C;
+ int64_t batch_stride_D;
+
+ typename LayoutA::Stride::Index lda;
+ typename LayoutB::Stride::Index ldb;
+ typename LayoutC::Stride::Index ldc;
+ typename LayoutC::Stride::Index ldd;
+
+ int64_t imag_stride_A;
+ int64_t imag_stride_B;
+ int64_t imag_stride_C;
+ int64_t imag_stride_D;
+
+public:
+ // Constructors
+ DSA_FTD_ComplexGEMM_CUTLASS(Options const &options);
+ DSA_FTD_ComplexGEMM_CUTLASS();
+
+ // Methods
+ void initialize();
+ Result run(Options const &options);
+
+ bool testing;
+};
+
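+// Assumed driver flow for this harness, based on the declared methods (the
+// definitions live in the implementation file; this is a sketch only):
+//
+//   Options options;
+//   options.parse(argc, argv);
+//   if (options.help) { options.print_usage(std::cout); return 0; }
+//
+//   DSA_FTD_ComplexGEMM_CUTLASS gemm(options);
+//   gemm.initialize();                 // allocate and fill tensors
+//   Result result = gemm.run(options); // execute and time the batched GEMM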
diff --git a/include/dsaX.h b/include/dsaX.h
new file mode 100644
index 0000000..f370bc0
--- /dev/null
+++ b/include/dsaX.h
@@ -0,0 +1,50 @@
+#pragma once
+
+// Expose the user to compile-time definitions,
+// enums, parameters, and classes
+#include "dsaX_def.h"
+#include "enums.h"
+#include "params.h"
+#include "fast_time_domain.h"
+
+// Uncomment to use the manual transpose (old BLAS) route
+// instead of the new pure cuBLAS path
+//#define OLD_BLAS
+
+/**
+ * Initialize the library. This function will initialize
+ * a device if using CUDA and any BLAS libraries that are
+ * enabled, such as cublas.
+ * @param[in] device_ordinal The GPU device to init
+ */
+void dsaXInit(int device_ordinal = -1);
+
+/**
+ * Finalize the library. This function will finalize
+ * a device if using CUDA and any BLAS libraries that are
+ * enabled, such as cublas. It will also dump any statistics
+ * collected, such as performance metrics.
+ */
+void dsaXEnd();
+
+/**
+ * This function will allocate pinned host memory of the
+ * given size in bytes, and return a void pointer to that
+ * memory. The user may delete the memory safely in their
+ * application code.
+ * @param[in] size The byte size of pinned memory to be allocated
+ * by dsaX.
+ */
+void *dsaXHostRegister(size_t size);
+
+/**
+ * This function allows the user to inspect the (4b,4b) char sized
+ * complex data at byte address i on the host. If 'non-zero' is true
+ * then the complex element will print only if either the real
+ * or imaginary element is non-zero. Useful for checking if
+ * an array is populated.
+ * @param[in] input The (4b,4b) char input array
+ * @param[in] i The ith element of the array
+ * @param[in] non_zero If true, print only elements with non-zero values
+ */
+void inspectPackedData(char input, int i, bool non_zero = false);
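+
+// Typical lifecycle, as a hedged sketch of the calls documented above
+// (the buffer size is illustrative):
+//
+//   dsaXInit(0);                                   // init device 0 and BLAS handles
+//   char *buf = (char *)dsaXHostRegister(1 << 20); // 1 MiB pinned host buffer
+//   ...                                            // fill with packed (4b,4b) data
+//   inspectPackedData(buf[0], 0, true);            // print element 0 if non-zero
+//   dsaXEnd();                                     // dump stats, release device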
diff --git a/include/dsaX_api.h b/include/dsaX_api.h
new file mode 100644
index 0000000..3767600
--- /dev/null
+++ b/include/dsaX_api.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <cuda_runtime.h>
+
+#include "enums.h"
+
+#define STRINGIFY__(x) #x
+#define __STRINGIFY__(x) STRINGIFY__(x)
+
+/**
+ @brief Wrapper around cudaMemcpy or driver API equivalent
+ @param[out] dst Destination pointer
+ @param[in] src Source pointer
+ @param[in] count Size of transfer
+ @param[in] kind Type of memory copy
+*/
+void dsaXMemcpy_(void *dst, const void *src, size_t count, dsaXMemcpyKind kind, const char *func, const char *file,
+ const char *line);
+
+/**
+ @brief Wrapper around cudaMemcpyAsync or driver API equivalent
+ @param[out] dst Destination pointer
+ @param[in] src Source pointer
+ @param[in] count Size of transfer
+ @param[in] kind Type of memory copy
+ @param[in] stream Stream to issue copy
+*/
+void dsaXMemcpyAsync_(void *dst, const void *src, size_t count, dsaXMemcpyKind kind, const cudaStream_t &stream,
+ const char *func, const char *file, const char *line);
+
+
+#define dsaXMemcpy(dst, src, count, kind) \
+ ::dsaXMemcpy_(dst, src, count, kind, __func__, file_name(__FILE__), __STRINGIFY__(__LINE__))
+
+#define dsaXMemcpyAsync(dst, src, count, kind, stream) \
+ ::dsaXMemcpyAsync_(dst, src, count, kind, stream, __func__, file_name(__FILE__), __STRINGIFY__(__LINE__))
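+
+// Example (hedged): the macros expand to the underlying functions with
+// __func__, file, and line appended, so errors report the call site. The
+// dsaXMemcpyHostToDevice enumerator is assumed to be defined in enums.h.
+//
+//   dsaXMemcpy(d_buf, h_buf, bytes, dsaXMemcpyHostToDevice);
+//   dsaXMemcpyAsync(d_buf, h_buf, bytes, dsaXMemcpyHostToDevice, stream);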
diff --git a/src/dsaX_capture.h b/include/dsaX_capture.h
similarity index 100%
rename from src/dsaX_capture.h
rename to include/dsaX_capture.h
diff --git a/src/dsaX_capture_manythread.h b/include/dsaX_capture_manythread.h
similarity index 100%
rename from src/dsaX_capture_manythread.h
rename to include/dsaX_capture_manythread.h
diff --git a/src/dsaX_capture_pcap.h b/include/dsaX_capture_pcap.h
similarity index 100%
rename from src/dsaX_capture_pcap.h
rename to include/dsaX_capture_pcap.h
diff --git a/include/dsaX_def.h b/include/dsaX_def.h
new file mode 100644
index 0000000..5b3af78
--- /dev/null
+++ b/include/dsaX_def.h
@@ -0,0 +1,100 @@
+#pragma once
+
+// default dada block keys
+#define TEST_BLOCK_KEY 0x0000aada // for capture program.
+// 128*3*384*32*2=9437184 for 1 CHANG 1 SNAP 1 REORDER
+// 128*3*384*32*2*4=37748736 for 4 CHANG 1 SNAP 1 REORDER
+// 128*3*384*32*2*8=75497472 for 1 CHANG 1 SNAP 8 REORDER
+#define CAPTURE_BLOCK_KEY 0x0000dada // for capture program.
+// 128*3*384*32*2=9437184 for 1 CHANG 1 SNAP 1 REORDER
+// 150994944 for doSnap
+#define REORDER_BLOCK_KEY 0x0000eada // for reorder program.
+// 589824 for doSnap
+#define REORDER_BLOCK_KEY2 0x0000bada // for reorder program 2.
+// 128*32*1536*16*2*2=402653184 1 REORDER
+// 3221225472 for 8 REORDERS
+#define XGPU_BLOCK_KEY 0x0000fada // for xgpu program.
+// 136*1536*2*8=3342336
+#define COPY_BLOCK_KEY 0x0000dbda // for split off data
+#define BF_BLOCK_KEY 0x0000dcda // for beamformed data
+#define BF_BLOCK_KEY2 0x0000bcda // for beamformed data testing
+#define CAPTURED_BLOCK_KEY 0x0000abda // for capture program.
+#define BEAMCAPTURE_BLOCK_KEY 0x0000bbda // for capture bf program.
+
+// constants
+#define PI 3.14159265359
+#define CVAC 299792458.0
+
+// default number of XGPU ints
+#define NCORRINTS 128
+#define NNATINTS 32 // native number of integrations
+#define NREORDERS 1 // number of ints per reorder
+
+// size of xgpu output
+// TODO
+#define XGPU_SIZE 835584 // size of single output vector (post-GPU)
+#define XGPU_IN_INC 1 // size of input increment
+#define NBASE 4656 // nant*(nant+1)/2
+#define NPOL 2
+#define NCOMPLEX 2 // two reals per complex
+#define NCHAN 1536 // regardless of NCHANG
+
+// default port for packet capture
+#define CAPTURE_PORT 4011
+
+// default UDP packet dims
+#define UDP_HEADER 8 // size of header/sequence number
+#define UDP_DATA 4608 // obs bytes per packet
+#define UDP_PAYLOAD 4616 // header + datasize
+
+// number of channel groups to expect
+#define NCHANG 1
+
+// number of SNAPs to expect
+#define NSNAPS 32
+
+/* expect consecutive channel groups */
+#define CHOFF 1024 // offset in channels of first group
+
+// default control ports
+#define CAPTURE_CONTROL_PORT 11223
+#define REORDER_CONTROL_PORT 11224
+#define XGPU_CONTROL_PORT 11225
+#define WRITEVIS_CONTROL_PORT 11226
+#define TRIGGER_CONTROL_PORT 11227
+
+#define NPACKETS_PER_CALL 2048
+#define NPACKETS_PER_BLOCK 2048
+#define NPACKETS_INTS 2048 // number of packets per xgpu int
+#define NPACKETS_PER_FIL 2
+#define NPACKETS 2048
+#define NOUTBLOCKS 15 // number of input blocks stored by trigger
+#define NANTS 96
+#define NCHAN_PER_PACKET 384
+#define NBEAMS 512
+
+// for beamformer
+//#define sep 1.0 // arcmin
+#define NW 48 // number of weights per 384 chans. Also the number of channels formed
+#define NANT 63
+#define BEAM_OUT 23
+#define NSTREAMS 4
+#define NBP 8 // number of previous BPs to average
+
+// for second corner turn
+#define FIL_PORT0 6625 // port for first chan group
+#define NCLIENTS 16 // number of client dbnic processes to expect
+#define NSAMPS_PER_BLOCK 16384 // number of samples per block
+#define NCHAN_FIL 1024 // final number of filterband chans
+#define NBEAMS_PER_BLOCK 64 // number of beams to expect
+#define NSAMPS_PER_TRANSMIT 512 // number of samples transmitted at one time
+#define NBMS 256
+#define P_SIZE 4108
+#define NWAIT 100000
+
+// required to prevent overflow in corr matrix multiply
+#define halfFac 4
+
+// beam sep
+#define sep 1.0 // arcmin
+
diff --git a/include/dsaX_malloc.h b/include/dsaX_malloc.h
new file mode 100644
index 0000000..04d24b0
--- /dev/null
+++ b/include/dsaX_malloc.h
@@ -0,0 +1,113 @@
+#pragma once
+
+#include <cstdlib>
+#include <cstdio>
+#include <unistd.h> // for getpagesize()
+#include <execinfo.h> // for backtrace
+#include <map>