diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..440c6f9
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,321 @@
+####################################################################################
+# START 1. Basic setup for cmake
+####################################################################################
+# basic setup for cmake
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
+
+if(POLICY CMP0074)
+ cmake_policy(SET CMP0074 NEW)
+endif()
+
+set(CMAKE_INCLUDE_CURRENT_DIR ON)
+set(CMAKE_INCLUDE_DIRECTORIES_PROJECT_BEFORE ON)
+set(CMAKE_COLOR_MAKEFILE ON)
+set(CMAKE_CXX_STANDARD_REQUIRED True)
+# Allow GNU extensions
+set(CMAKE_CXX_EXTENSIONS ON)
+
+# Define the project
+project("DSA_XENGINE" VERSION 1.0.0 LANGUAGES C CXX)
+
+# For GCC 8 and lower, add the -pthread flag manually
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
+
+# add a directory for cmake modules
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
+
+# DSA_XENGINE may be built to run using CUDA or CPU. Future versions may be
+# written for HIP or SYCL. We call this the
+# target type. By default, the target is CUDA.
+#---------------------------------------------
+
+# Set from the environment variable, if present
+if(DEFINED ENV{DSA_XENGINE_TARGET})
+ set(DEFTARGET $ENV{DSA_XENGINE_TARGET})
+else()
+ set(DEFTARGET "CUDA")
+endif()
+
+set(VALID_TARGET_TYPES CUDA CPU) #HIP SYCL
+set(DSA_XENGINE_TARGET_TYPE "${DEFTARGET}" CACHE STRING "Choose the type of target, options are: ${VALID_TARGET_TYPES}")
+set_property(CACHE DSA_XENGINE_TARGET_TYPE PROPERTY STRINGS "CUDA" "CPU") # HIP SYCL
+
+string(TOUPPER ${DSA_XENGINE_TARGET_TYPE} CHECK_TARGET_TYPE)
+list(FIND VALID_TARGET_TYPES ${CHECK_TARGET_TYPE} TARGET_TYPE_VALID)
+
+if(TARGET_TYPE_VALID LESS 0)
+  message(SEND_ERROR "Please specify a valid DSA_XENGINE_TARGET_TYPE! Valid target types are: ${VALID_TARGET_TYPES}")
+endif()
+
+# Git helpers
+#------------
+find_package(Git)
+if(GIT_FOUND)
+ execute_process(
+ COMMAND ${GIT_EXECUTABLE} show
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+    RESULT_VARIABLE IS_GIT_REPOSITORY
+    OUTPUT_QUIET ERROR_QUIET)
+  if(${IS_GIT_REPOSITORY} EQUAL 0)
+ execute_process(
+ COMMAND ${GIT_EXECUTABLE} describe --abbrev=0
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+ OUTPUT_VARIABLE GITTAG
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+    # we use git rev-list and pipe that through wc here. Newer git versions support --count as an option
+    # to rev-list, but that might not always be available
+ execute_process(
+ COMMAND ${GIT_EXECUTABLE} rev-list ${GITTAG}..HEAD
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+ COMMAND wc -l
+ OUTPUT_VARIABLE GITCOUNT
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+ execute_process(
+ COMMAND ${GIT_EXECUTABLE} describe --match 1 --always --long --dirty
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+ OUTPUT_VARIABLE GITVERSION
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+ endif()
+endif(GIT_FOUND)
+
+
+option(DSA_XENGINE_BUILD_ALL_TESTS "build tests by default" ON)
+option(DSA_XENGINE_INSTALL_ALL_TESTS "install tests by default" ON)
+option(DSA_XENGINE_BUILD_SHAREDLIB "build dsaXengine as a shared lib" ON)
+
+
+# Use ExternalProject_Add for libtcc (borks with FetchContent)
+# Use ExternalProject_Add for CUTLASS (long build time, version 2.11.0 for sm_8x arch)
+include(ExternalProject)
+
+# Use FetchContent for lightweight dependencies
+include(FetchContent)
+
+# CUDA based dependencies and options
+#------------------------------------
+if(DSA_XENGINE_TARGET_TYPE STREQUAL CUDA)
+
+ # CUDA specific part of CMakeLists
+ enable_language(CUDA)
+ find_package(CUDAToolkit REQUIRED)
+
+  # Get GPU architecture from the environment, or set the default (sm_80)
+ if(DEFINED ENV{DSA_XENGINE_GPU_ARCH})
+ set(DSA_XENGINE_DEFAULT_GPU_ARCH $ENV{DSA_XENGINE_GPU_ARCH})
+ else()
+ set(DSA_XENGINE_DEFAULT_GPU_ARCH sm_80)
+ endif()
+
+ if(NOT DSA_XENGINE_GPU_ARCH)
+ message(STATUS "Building DSA_XENGINE for GPU ARCH " "${DSA_XENGINE_DEFAULT_GPU_ARCH}")
+ endif()
+
+ set(DSA_XENGINE_GPU_ARCH
+ ${DSA_XENGINE_DEFAULT_GPU_ARCH}
+    CACHE STRING "set the GPU architecture (sm_60, sm_70, sm_80, sm_90)")
+ set_property(CACHE DSA_XENGINE_GPU_ARCH PROPERTY STRINGS sm_60 sm_70 sm_80 sm_90)
+ set(DSA_XENGINE_GPU_ARCH_SUFFIX
+ ""
+ CACHE STRING "set the GPU architecture suffix (virtual, real). Leave empty for no suffix.")
+ set_property(CACHE DSA_XENGINE_GPU_ARCH_SUFFIX PROPERTY STRINGS "real" "virtual" " ")
+ #set(CMAKE_CUDA_ARCHITECTURES ${DSA_XENGINE_GPU_ARCH})
+ #mark_as_advanced(DSA_XENGINE_GPU_ARCH_SUFFIX)
+ #mark_as_advanced(CMAKE_CUDA_ARCHITECTURES)
+
+ # Set CUDA based methods and dependencies
+ #----------------------------------------
+
+ # This is the default GPU method
+  option(DSA_XENGINE_ENABLE_CUBLAS "Use cuBLAS for correlators" ON)
+
+ # All other GPU methods can be enabled at compile time and
+ # toggled for use at run time, if enabled.
+
+ # Get TCC dependency
+  option(DSA_XENGINE_ENABLE_TCC "Use TensorCoreCorrelators for correlators" OFF)
+ if(DSA_XENGINE_ENABLE_TCC)
+ add_compile_definitions(DSA_XENGINE_ENABLE_TCC)
+ option(DSA_XENGINE_DOWNLOAD_TCC "Download, build, link (and install) TCC" OFF)
+ if(DSA_XENGINE_DOWNLOAD_TCC)
+ ExternalProject_Add(TCC
+ GIT_REPOSITORY https://git.astron.nl/RD/tensor-core-correlator
+ #GIT_TAG 11d8a4a504d7073a2a33b81e1e387b12e58a420c
+ CMAKE_ARGS "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
+ )
+ else()
+ find_package(libtcc REQUIRED)
+ endif()
+ endif()
+
+ # Get CUTLASS dependency
+ option(DSA_XENGINE_ENABLE_CUTLASS "Use CUTLASS for GEMMs" OFF)
+ if(DSA_XENGINE_ENABLE_CUTLASS)
+ add_compile_definitions(DSA_XENGINE_ENABLE_CUTLASS)
+    option(DSA_XENGINE_DOWNLOAD_CUTLASS "Download, build (only the required kernels), link, and install CUTLASS" OFF)
+ if(DSA_XENGINE_DOWNLOAD_CUTLASS)
+ # Custom CUTLASS build
+ ExternalProject_Add(NvidiaCutlass
+ GIT_REPOSITORY https://github.com/NVIDIA/cutlass.git
+ GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
+ CMAKE_ARGS
+ "-DCUTLASS_NVCC_ARCHS_ENABLED=89"
+ "-DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_*gemm_planar_complex"
+ "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
+ )
+ else()
+ find_package(NvidiaCutlass REQUIRED)
+ endif()
+ endif()
+
+ # Get MAGMA dependency
+ option(DSA_XENGINE_ENABLE_MAGMA "Use MAGMA for GEMMs" OFF)
+ if(DSA_XENGINE_ENABLE_MAGMA)
+ add_compile_definitions(DSA_XENGINE_ENABLE_MAGMA)
+    option(DSA_XENGINE_DOWNLOAD_MAGMA "Download, build (only the required kernels), link, and install MAGMA" OFF)
+ if(DSA_XENGINE_DOWNLOAD_MAGMA)
+ # Custom MAGMA build
+ ExternalProject_Add(Magma
+ URL https://icl.utk.edu/projectsfiles/magma/downloads/magma-2.8.0.tar.gz
+ CMAKE_ARGS
+ "-DMAGMA_ENABLE_CUDA=ON"
+ "-DGPU_TARGET=sm_80"
+ "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
+ )
+ else()
+ find_package(Magma REQUIRED)
+ endif()
+ endif()
+
+ # Get XGPU dependency (fix install)
+  option(DSA_XENGINE_ENABLE_XGPU "Use xGPU for correlators" OFF)
+ if(DSA_XENGINE_ENABLE_XGPU)
+ add_compile_definitions(DSA_XENGINE_ENABLE_XGPU)
+ option(DSA_XENGINE_DOWNLOAD_XGPU "Download and build xGPU" OFF)
+ if(DSA_XENGINE_DOWNLOAD_XGPU)
+ # Download, build and install
+ FetchContent_Declare(
+ xGPU
+ GIT_REPOSITORY https://github.com/cpviolator/xGPU.git
+ #GIT_TAG 13b7fff1eac497236eb9c38e179aed3b532a88f2
+ )
+ FetchContent_MakeAvailable(XGPU)
+ else()
+ # Find and link to local install
+ find_package(xGPU REQUIRED)
+ endif()
+ endif()
+
+endif() # CUDA functionality
+
+# Get CPU based dependencies
+# Get OPENBLAS dependency
+option(DSA_XENGINE_ENABLE_OPENBLAS "Use OPENBLAS for GEMMs" OFF)
+if(DSA_XENGINE_ENABLE_OPENBLAS)
+ add_compile_definitions(DSA_XENGINE_ENABLE_OPENBLAS)
+ option(DSA_XENGINE_DOWNLOAD_OPENBLAS "Download, build, link, and install OPENBLAS" OFF)
+ if(DSA_XENGINE_DOWNLOAD_OPENBLAS)
+ # Custom OPENBLAS build
+ ExternalProject_Add(Openblas
+ GIT_REPOSITORY https://github.com/OpenMathLib/OpenBLAS.git
+ GIT_TAG ce3f668
+ CMAKE_ARGS
+ #"-DOPENBLAS_ENABLE_CUDA=ON"
+ #"-DGPU_TARGET=sm_80"
+ "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}"
+ )
+ else()
+ find_package(Openblas REQUIRED)
+ endif()
+endif()
+
+# Get psrdada dependency
+option(DSA_XENGINE_ENABLE_PSRDADA "Use PSRDada for IO" ON)
+if(DSA_XENGINE_ENABLE_PSRDADA)
+  option(DSA_XENGINE_DOWNLOAD_PSRDADA "Download and build PSRDada" ON)
+  if(DSA_XENGINE_DOWNLOAD_PSRDADA)
+    # Download, build and install
+    FetchContent_Declare(
+      PSRDada
+      GIT_REPOSITORY git://git.code.sf.net/p/psrdada/code
+      )
+    FetchContent_MakeAvailable(PSRDada)
+  else()
+    # Find and link to local install
+    find_package(PSRDada REQUIRED)
+  endif()
+endif()
+
+# Get HDF5 dependency
+option(DSA_XENGINE_ENABLE_HDF5 "Use HDF5 for data IO" OFF)
+if(DSA_XENGINE_ENABLE_HDF5)
+  option(DSA_XENGINE_DOWNLOAD_HDF5 "Download and build HDF5" OFF)
+ if(DSA_XENGINE_DOWNLOAD_HDF5)
+ # Download, build and install
+ FetchContent_Declare(
+ HDF5
+ GIT_REPOSITORY https://github.com/HDFGroup/hdf5.git
+ GIT_TAG 5794814
+ )
+ FetchContent_MakeAvailable(HDF5)
+ else()
+ # Find and link to local install
+ find_package(HDF5 REQUIRED)
+ endif()
+endif()
+
+# Get CLI11 dependency
+# FIX ME: get static .hpp version and ship with package
+option(DSA_XENGINE_ENABLE_CLI11 "Enable CLI11 (required)" ON)
+if(DSA_XENGINE_ENABLE_CLI11)
+ option(DSA_XENGINE_DOWNLOAD_CLI11 "Download and build CLI11" ON)
+ if(DSA_XENGINE_DOWNLOAD_CLI11)
+ # Download, build and install
+ FetchContent_Declare(
+ CLI11
+ GIT_REPOSITORY https://github.com/CLIUtils/CLI11.git
+ GIT_TAG main
+ )
+ FetchContent_MakeAvailable(CLI11)
+ else()
+ # Find and link to local install
+ find_package(CLI11 REQUIRED)
+ endif()
+endif()
+
+
+# Get ZFP dependency
+option(DSA_XENGINE_ENABLE_ZFP "Enable ZFP" OFF)
+if(DSA_XENGINE_ENABLE_ZFP)
+ option(DSA_XENGINE_DOWNLOAD_ZFP "Download and build ZFP" OFF)
+ if(DSA_XENGINE_DOWNLOAD_ZFP)
+ # Download, build and install
+ FetchContent_Declare(
+ ZFP
+ GIT_REPOSITORY https://github.com/LLNL/zfp.git
+ GIT_TAG f40868a
+ )
+ FetchContent_MakeAvailable(ZFP)
+ else()
+ # Find and link to local install
+ find_package(ZFP REQUIRED)
+ endif()
+endif()
+
+# Add src, include, tests, and legacy
+add_subdirectory(src)
+add_subdirectory(include)
+add_subdirectory(tests)
+option(DSA_XENGINE_BUILD_LEGACY "Build legacy code (will not install if built)" OFF)
+if(DSA_XENGINE_BUILD_LEGACY)
+ add_subdirectory(legacy)
+endif()
+
+# Install project cmake targets
+include(GNUInstallDirs)
+include(CMakePackageConfigHelpers)
+write_basic_package_version_file(
+ ${PROJECT_NAME}-config-version.cmake
+ VERSION ${DSA_XENGINE_VERSION}
+ COMPATIBILITY AnyNewerVersion
+ )
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake
+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
+ )
diff --git a/README.md b/README.md
index 03fe5e3..f771017 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,9 @@
# dsa110-xengine
+
+
+
+
This repo contains code used for the DSA X-engine. The requirements are to:
- capture SNAP F-engine packets on an ethernet interface, and place them in a psrdada buffer
@@ -67,11 +71,4 @@ Finally, `dsaX_dbnic` and `dsaX_nicdb` implement the corner turn to feed `mbheim
### scripts and utils
-The "scripts" dir contains some useful scripts to test various aspects of the system (corr, bf, cornerturn). The "utils" dir includes functionality to generate fake data and beamforming weights.
-
-
-
-
-
-
-
+The "scripts" dir contains some useful scripts to test various aspects of the system (corr, bf, cornerturn). The "utils" dir includes functionality to generate fake data and beamforming weights.
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
new file mode 100644
index 0000000..9a7cbbd
--- /dev/null
+++ b/include/CMakeLists.txt
@@ -0,0 +1,21 @@
+enable_language(CUDA)
+
+# install step for header files
+#------------------------------
+set(DSA_XENGINE_HEADERS
+ # cmake-format: sortable
+ dsaX.h
+ dsaX_def.h
+ dsaX_malloc.h
+ dsaX_ptr.h
+ fast_time_domain.h
+ cuda_interface.h
+ cuda_handles.h
+ cuda_headers.h
+ dsaX_capture.h
+ dsaX_capture_manythread.h
+ dsaX_capture_pcap.h
+ cutlass_interface.h
+ )
+install(FILES ${DSA_XENGINE_HEADERS} DESTINATION include)
+#------------------------------
diff --git a/include/blas_interface.h b/include/blas_interface.h
new file mode 100644
index 0000000..d643e08
--- /dev/null
+++ b/include/blas_interface.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "interface.h"
+
+void dsaXHgemmStridedBatched(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param, int stream = 0);
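+
+// The planar-complex batched product above can be served by four real GEMMs.
+// A hedged sketch of the usual decomposition (not necessarily the exact
+// implementation in src): with A = A_r + i*A_i and B = B_r + i*B_i,
+//
+//   C_r = A_r * B_r - A_i * B_i
+//   C_i = A_r * B_i + A_i * B_r
+//
+// so each call maps to four cublasHgemmStridedBatched invocations on the
+// separate real/imaginary planes, with alpha = +/-1 and beta = 1 to accumulate.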
diff --git a/include/cublas_interface.h b/include/cublas_interface.h
new file mode 100644
index 0000000..f68eea3
--- /dev/null
+++ b/include/cublas_interface.h
@@ -0,0 +1,4 @@
+#pragma once
+#include "dsaX.h"
+
+void dsaXHgemmStridedBatchedCuda(void *real_a, void *imag_a, void *real_b, void *imag_b, void *real_c, void *imag_c, dsaXBLASParam param, int stream);
diff --git a/include/cuda_handles.h b/include/cuda_handles.h
new file mode 100644
index 0000000..eeaf706
--- /dev/null
+++ b/include/cuda_handles.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include <vector>
+
+#include "utils.h"
+
+#ifdef DSA_XENGINE_TARGET_CUDA
+#include "cuda_headers.h"
+
+static std::vector<cudaStream_t> streams;
+static cublasHandle_t cublasH = NULL;
+
+static bool cublas_init = false;
+static bool stream_init = false;
+
+cudaStream_t get_stream(unsigned int i);
+#endif
+
+void init_streams(unsigned int n_streams);
+void destroy_streams();
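+
+// Example usage (a hedged sketch; assumes the CUDA target is active):
+//
+//   init_streams(4);                   // create four CUDA streams
+//   cudaStream_t s = get_stream(0);    // fetch stream 0 for async work
+//   ...                                // issue copies/kernels on s
+//   destroy_streams();                 // release all streams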
diff --git a/include/cuda_headers.h b/include/cuda_headers.h
new file mode 100644
index 0000000..333a5bc
--- /dev/null
+++ b/include/cuda_headers.h
@@ -0,0 +1,8 @@
+#pragma once
+
+#if defined (DSA_XENGINE_TARGET_CUDA)
+#include <cuda_runtime.h>
+#include "cuda_fp16.h"
+#include <cublas_v2.h>
+#include <cuda.h>
+#endif
diff --git a/include/cuda_interface.h b/include/cuda_interface.h
new file mode 100644
index 0000000..42043e2
--- /dev/null
+++ b/include/cuda_interface.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include <vector>
+
+#include "dsaX_def.h"
+#include "enums.h"
+#include "dsaX.h"
+
+void dsaXInitCuda(int dev);
+void dsaXDestroyCuda();
+
+void initBLASCuda();
+void destroyBLASCuda();
+
+void initStreamsCuda(unsigned int n);
+void destroyStreamsCuda();
+
+void promoteComplexCharToPlanarHalfCuda(corr_handle *d, unsigned int stream);
+
+void initializeCorrCudaMemory(corr_handle *d, unsigned int n_streams);
+
+void initializeBFCudaMemory(bf_handle *d);
+
+void deallocateCorrCudaMemory(corr_handle *d);
+
+void deallocateBFCudaMemory(bf_handle *d);
+
+void dsaXmemsetCuda(void *array, int ch, size_t n);
+
+void dsaXmemcpyCuda(void *array_device, void *array_host, size_t n, dsaXMemcpyKind kind, int stream);
+
+void *dsaXHostRegisterCuda(size_t size);
+
+void dsaXDeviceSynchronizeCuda();
+
+void reorderCorrOutputCuda(corr_handle *d, int stream);
+
+void computeIndicesCuda(corr_handle *d);
+
+void reorderCorrInputCuda(corr_handle *d, int stream);
+
+void calcWeightsCuda(bf_handle *d);
+
+template <typename in_prec, typename out_prec> void transposeMatrixCuda(in_prec *idata, out_prec *odata);
+
+void transposeInputBeamformerCuda(double *idata, double *odata, std::vector<int> &dim_block_in, std::vector<int> &dim_grid_in);
+
+void transposeScaleBeamformerCuda(void *real, void *imag, unsigned char *output, std::vector<int> &dim_block_in, std::vector<int> &dim_grid_in);
+
+void fluffInputBeamformerCuda(char *input, void *b_real, void *b_imag, int blocks, int tpb);
+
+void sumBeamCuda(unsigned char *input, float *output, int blocks, int tpb);
diff --git a/include/cuda_kernels.h b/include/cuda_kernels.h
new file mode 100644
index 0000000..d57a11b
--- /dev/null
+++ b/include/cuda_kernels.h
@@ -0,0 +1,340 @@
+#pragma once
+
+#include "cuda_headers.h"
+
+__global__ void inspectPackedDataInKernel(char input, int i) {
+ float re = (float)((char)(( (unsigned char)(input) & (unsigned char)(15) ) << 4) >> 4);
+ float im = (float)((char)(( (unsigned char)(input) & (unsigned char)(240))) >> 4);
+
+ if(re != 0 || im != 0) printf("K val[%d] = (%f,%f)\n", i, re, im);
+}
+
+// KERNELS
+// DMH: Abstract hardcoded launch parameters
+__global__ void transpose_input_beamformer(double *idata, double *odata) {
+
+ __shared__ double tile[16][17][4];
+
+ int x = blockIdx.x * 16 + threadIdx.x;
+ int y = blockIdx.y * 16 + threadIdx.y;
+ int width = gridDim.x * 16;
+
+ for (int j = 0; j < 16; j += 8) {
+ tile[threadIdx.y+j][threadIdx.x][0] = idata[4*((y+j)*width + x)];
+ tile[threadIdx.y+j][threadIdx.x][1] = idata[4*((y+j)*width + x)+1];
+ tile[threadIdx.y+j][threadIdx.x][2] = idata[4*((y+j)*width + x)+2];
+ tile[threadIdx.y+j][threadIdx.x][3] = idata[4*((y+j)*width + x)+3];
+ }
+
+ __syncthreads();
+
+ x = blockIdx.y * 16 + threadIdx.x; // transpose block offset
+ y = blockIdx.x * 16 + threadIdx.y;
+ width = gridDim.y * 16;
+
+ for (int j = 0; j < 16; j += 8) {
+ odata[4*((y+j)*width + x)] = tile[threadIdx.x][threadIdx.y + j][0];
+ odata[4*((y+j)*width + x)+1] = tile[threadIdx.x][threadIdx.y + j][1];
+ odata[4*((y+j)*width + x)+2] = tile[threadIdx.x][threadIdx.y + j][2];
+ odata[4*((y+j)*width + x)+3] = tile[threadIdx.x][threadIdx.y + j][3];
+ }
+}
+
+// kernel to help with reordering output
+// outr and outi are [NANTS, NANTS, NCHAN_PER_PACKET, 2time, 2pol, halfFac]
+// run with NCHAN_PER_PACKET*2*NBASE/128 blocks of 128 threads
+__global__ void corr_output_copy(half *outr, half *outi, float *output, int *indices_lookup) {
+
+ int bidx = blockIdx.x; // assume NCHAN_PER_PACKET*2*NBASE/128
+ int tidx = threadIdx.x; // assume 128
+ int idx = blockDim.x * bidx + tidx;
+
+ int baseline = (int)(idx / (NCHAN_PER_PACKET * 2));
+ int chpol = (int)(idx % (NCHAN_PER_PACKET * 2));
+ int ch = (int)(chpol / 2);
+ int base_idx = indices_lookup[baseline];
+ int iidx = base_idx * NCHAN_PER_PACKET + ch;
+ int pol = (int)(chpol % 2);
+
+ float v1=0., v2=0.;
+
+ //if(idx<1) printf("output pre (%f, %f)\n", output[2*idx], output[2*idx+1]);
+
+ // Use CUDA casting intrinsic __half2float
+  for (int i=0;i<halfFac;i++) {
+    v1 += __half2float(outr[(4*iidx+pol)*halfFac+i]) + __half2float(outr[(4*iidx+2+pol)*halfFac+i]);
+    v2 += __half2float(outi[(4*iidx+pol)*halfFac+i]) + __half2float(outi[(4*iidx+2+pol)*halfFac+i]);
+  }
+
+  output[2*idx] += v1;
+  output[2*idx+1] += v2;
+}
+
+// transpose kernel
+// assume breakdown into tiles of 32x32, and run with 32x8 threads per block
+// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32)
+// here, width is the dimension of the fastest index
+template <typename in_prec, typename out_prec> __global__ void transpose_matrix(in_prec * idata, out_prec * odata) {
+
+ __shared__ in_prec tile[32][33];
+
+ int x = blockIdx.x * 32 + threadIdx.x;
+ int y = blockIdx.y * 32 + threadIdx.y;
+ int width = gridDim.x * 32;
+
+ for (int j = 0; j < 32; j += 8) {
+ tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
+ //inspectPackedDataInKernel(idata[(y+j)*width + x], (y+j)*width + x);
+ }
+
+ __syncthreads();
+
+ x = blockIdx.y * 32 + threadIdx.x; // transpose block offset
+ y = blockIdx.x * 32 + threadIdx.y;
+ width = gridDim.y * 32;
+
+ for (int j = 0; j < 32; j += 8) {
+ odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
+ //inspectPackedDataInKernel(odata[(y+j)*width + x], (y+j)*width + x);
+ }
+}
+
+// transpose kernel
+// assume breakdown into tiles of 32x32, and run with 32x8 threads per block
+// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32)
+// here, width is the dimension of the fastest index
+__global__ void transpose_matrix_float(half * idata, half * odata) {
+
+ __shared__ float tile[32][33];
+
+ int x = blockIdx.x * 32 + threadIdx.x;
+ int y = blockIdx.y * 32 + threadIdx.y;
+ int width = gridDim.x * 32;
+
+ for (int j = 0; j < 32; j += 8) {
+ tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
+ //printf("K transpose_matrix_float_in[%d] = %f\n", (y+j)*width + x, __half2float(idata[(y+j)*width + x]));
+ }
+
+ __syncthreads();
+
+ x = blockIdx.y * 32 + threadIdx.x; // transpose block offset
+ y = blockIdx.x * 32 + threadIdx.y;
+ width = gridDim.y * 32;
+
+ for (int j = 0; j < 32; j += 8) {
+ odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
+ //printf("K transpose_matrix_float_out[%d] = %f\n", (y+j)*width + x, __half2float(odata[(y+j)*width + x]));
+ }
+}
+
+
+// DMH: TUNABLE
+// transpose kernel
+// assume breakdown into tiles of 32x32, and run with 32x8 threads per block
+// launch with dim3 dimBlock(32, 8) and dim3 dimGrid(Width/32, Height/32)
+// here, width is the dimension of the fastest index
+__global__ void transpose_matrix_char(char * idata, char * odata) {
+
+ __shared__ char tile[32][33];
+ //extern __shared__ char tile[];
+
+ int x = blockIdx.x * blockDim.x + threadIdx.x;
+ int y = blockIdx.y * blockDim.x + threadIdx.y;
+ int width = gridDim.x * blockDim.x;
+
+ for (int j = 0; j < blockDim.x; j += blockDim.y) {
+ tile[threadIdx.y+j][threadIdx.x] = idata[(y+j)*width + x];
+ //tile[(threadIdx.y+j)*blockDim.x + threadIdx.x] = idata[(y+j)*width + x];
+ //inspectPackedDataInKernel(idata[(y+j)*width + x], (y+j)*width + x);
+ }
+
+ __syncthreads();
+
+ x = blockIdx.y * blockDim.x + threadIdx.x; // transpose block offset
+ y = blockIdx.x * blockDim.x + threadIdx.y;
+ width = gridDim.y * blockDim.x;
+
+ for (int j = 0; j < blockDim.x; j += blockDim.y) {
+ odata[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
+ //odata[(y+j)*width + x] = tile[threadIdx.x + blockDim.x*(threadIdx.y + j)];
+ //inspectPackedDataInKernel(idata[(y+j)*width + x], (y+j)*width + x);
+ }
+}
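+
+// Illustrative host-side launch of the char transpose above, following the
+// "32x32 tiles, 32x8 threads" comments. A hedged sketch: the helper name is
+// ours, and width/height are assumed to be multiples of 32.
+inline void launch_transpose_matrix_char(char *d_in, char *d_out, int width, int height) {
+  dim3 dimBlock(32, 8);
+  dim3 dimGrid(width / 32, height / 32);
+  transpose_matrix_char<<<dimGrid, dimBlock>>>(d_in, d_out);
+}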
+
+
+/**
+ * Promote complex char riri... data to planar half rr.. ii..
+ *
+ * @param[out] inr Half precision real array
+ * @param[out] ini Half precision imag array
+ * @param[in] input Char precision complex array
+ */
+__global__ void promoteComplexCharToPlanarHalf(char *input, half *inr, half *ini) {
+
+ int bidx = blockIdx.x;
+ int tidx = threadIdx.x;
+ int iidx = blockDim.x * bidx + tidx;
+
+ // 15 in unsigned char binary is 00001111. Perform bitwise & on 15 and input char data iiiirrrr
+ // to get real part 4 bit data.
+ // 0000rrrr
+ // Bit shift this result by 4 to the left.
+ // rrrr0000
+ // Cast to signed char.
+ // +-rrr0000
+  // Bitshift mantissa only to the right by 4 bits
+ // +-0000rrr
+ // Cast to float and use CUDA intrinsic to cast to signed half
+ inr[iidx] = __float2half((float)((char)(( (unsigned char)(input[iidx]) & (unsigned char)(15) ) << 4) >> 4));
+
+ // 240 in unsigned char binary is 11110000. Perform bitwise & on 240 and input char data iiiirrrr
+ // to get imag part 4 bit data
+ // iiii0000.
+ // Cast to signed char
+ // +-iii0000
+  // Bitshift mantissa only to the right by 4 bits
+ // +-0000iii
+ // Cast to float and use CUDA intrinsic to cast to signed half
+ ini[iidx] = __float2half((float)((char)(( (unsigned char)(input[iidx]) & (unsigned char)(240) )) >> 4));
+
+ //good
+ //if(__half2float(inr[iidx]) != 0 || __half2float(ini[iidx]) != 0) printf("corr_input_copy %i = (%f,%f)\n", iidx, __half2float(inr[iidx]), __half2float(ini[iidx]));
+}
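+
+// Host/device reference for the packed (4b,4b) format documented above: the
+// low nibble holds the real part, the high nibble the imaginary part, both
+// signed 4-bit values in [-8, 7]. Useful for validating kernel output on the
+// host (the helper name is ours).
+__host__ __device__ inline void unpack4b4b(char packed, float &re, float &im) {
+  re = (float)((char)(((unsigned char)packed & (unsigned char)15) << 4) >> 4);
+  im = (float)((char)((unsigned char)packed & (unsigned char)240) >> 4);
+}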
+
+// kernel to populate an instance of weights matrix
+// [2, (NCHAN_PER_PACKET/8), NBEAMS/2, 4times*(NANTS/2)*8chan*2tim*2pol]
+// run with 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)/128 blocks of 128 threads
+// TUNABLE
+__global__ void populate_weights_matrix(float * antpos_e, float * antpos_n, float * calibs, half * wr, half * wi, float * fqs) {
+
+ int bidx = blockIdx.x;
+ int tidx = threadIdx.x;
+ int inidx = 128 * bidx + tidx;
+
+ // 2*(NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)
+
+ // get indices
+ int iArm = (int)(inidx / ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)));
+ int iidx = (int)(inidx % ((NCHAN_PER_PACKET/8)*(NBEAMS/2)*128*(NANTS/2)));
+ int fq = (int)(iidx / (128*(NANTS/2)*(NBEAMS/2)));
+ int idx = (int)(iidx % (128*(NANTS/2)*(NBEAMS/2)));
+ int bm = (int)(idx / (128*(NANTS/2)));
+ int tactp = (int)(idx % (128*(NANTS/2)));
+ //int t = (int)(tactp / (32*(NANTS/2)));
+ int actp = (int)(tactp % (32*(NANTS/2)));
+ int a = (int)(actp / 32);
+ int ctp = (int)(actp % 32);
+ //int c = (int)(ctp / 4);
+ int tp = (int)(ctp % 4);
+ //int t2 = (int)(tp / 2);
+ int pol = (int)(tp % 2);
+ int widx = (a+48*iArm)*(NCHAN_PER_PACKET/8)*2*2 + fq*2*2 + pol*2;
+
+ // calculate weights
+ float theta, afac, twr, twi;
+ if (iArm==0) {
+ theta = sep*(127.-bm*1.)*PI/10800.; // radians
+ afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate
+ twr = cos(afac*antpos_e[a+48*iArm]);
+ twi = sin(afac*antpos_e[a+48*iArm]);
+ wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1]));
+ wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1]));
+ //wr[inidx] = __float2half(calibs[widx]);
+ //wi[inidx] = __float2half(calibs[widx+1]);
+ }
+ if (iArm==1) {
+ theta = sep*(127.-bm*1.)*PI/10800.; // radians
+ afac = -2.*PI*fqs[fq]*theta/CVAC; // factor for rotate
+ twr = cos(afac*antpos_n[a+48*iArm]);
+ twi = sin(afac*antpos_n[a+48*iArm]);
+ wr[inidx] = __float2half((twr*calibs[widx] - twi*calibs[widx+1]));
+ wi[inidx] = __float2half((twi*calibs[widx] + twr*calibs[widx+1]));
+ //wr[inidx] = __float2half(calibs[widx]);
+ //wi[inidx] = __float2half(calibs[widx+1]);
+ }
+}
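+
+// Written out, the weight applied in both arms is the steering phase times
+// the calibration solution:
+//   theta_b    = sep * (127 - b) * pi / 10800            (beam offset, radians)
+//   w_{b,a}(f) = exp(-2*pi*i * f * theta_b * x_a / CVAC) * cal_a(f)
+// where x_a is the east (arm 0) or north (arm 1) antenna position, and the
+// complex multiply by cal_a(f) is expanded into the wr/wi expressions above.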
+
+// kernel to fluff input bf data
+// run with NPACKETS_PER_BLOCK*(NANTS/2)*NCHAN_PER_PACKET*2*2/128 blocks of 128 threads
+__global__ void fluff_input_beamformer(char * input, half * dr, half * di) {
+
+ int bidx = blockIdx.x;
+ int tidx = threadIdx.x;
+ int idx = blockDim.x * bidx + tidx;
+
+ dr[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(15)) << 4) >> 4)));
+ di[idx] = __float2half(0.015625*((float)((char)(((unsigned char)(input[idx]) & (unsigned char)(240))) >> 4)));
+
+  // Both results are signed 4-bit integers in [-8, 7], scaled by 1/64 (0.015625) and stored as FP16.
+ //half re = dr[idx];
+ //half im = di[idx];
+ //half lim = 0;
+ //if( (re > lim || re < -lim) || (im > lim || im < -lim)) {
+ //printf("re = %f, im = %f\n", __half2float(re), __half2float(im));
+ //}
+}
+
+// transpose, add and scale kernel for bf
+// assume breakdown into tiles of 16x16, and run with 16x8 threads per block
+// launch with dim3 dimBlock(16, 8) and dim3 dimGrid((NBEAMS/2)*(NPACKETS_PER_BLOCK/4)/16, (NCHAN_PER_PACKET/8)/16)
+// a fixed 1/128 scale factor enables recasting the summed power as unsigned char
+__global__ void transpose_scale_beamformer(half * ir, half * ii, unsigned char * odata) {
+
+ __shared__ float tile[16][17];
+
+ int x = blockIdx.x * 16 + threadIdx.x;
+ int y = blockIdx.y * 16 + threadIdx.y;
+ int width = gridDim.x * 16;
+ float dr, di;
+
+ for (int j = 0; j < 16; j += 8) {
+ dr = (float)(ir[(y+j)*width + x]);
+ di = (float)(ii[(y+j)*width + x]);
+ tile[threadIdx.y+j][threadIdx.x] = (dr*dr+di*di);
+ }
+
+ __syncthreads();
+
+ x = blockIdx.y * 16 + threadIdx.x; // transpose block offset
+ y = blockIdx.x * 16 + threadIdx.y;
+ width = gridDim.y * 16;
+
+ for (int j = 0; j < 16; j += 8)
+ odata[(y+j)*width + x] = (unsigned char)(tile[threadIdx.x][threadIdx.y + j]/128.);
+
+}
+
+// sum over all times in output beam array
+// run with (NCHAN_PER_PACKET/8)*(NBEAMS/2) blocks of (NPACKETS_PER_BLOCK/4) threads
+__global__ void sum_beam(unsigned char *input, float *output) {
+
+ __shared__ float summ[512];
+ int bidx = blockIdx.x;
+ int tidx = threadIdx.x;
+ //int idx = bidx*256+tidx;
+ int bm = (int)(bidx/48);
+ int ch = (int)(bidx % 48);
+
+ summ[tidx] = (float)(input[bm*256*48 + tidx*48 + ch]);
+
+ __syncthreads();
+
+  // Tree reduction in shared memory, synchronizing between strides to
+  // avoid read-after-write races across warps
+  for (int s = 256; s > 0; s >>= 1) {
+    if (tidx < s) summ[tidx] += summ[tidx + s];
+    __syncthreads();
+  }
+
+ if (tidx==0) output[bidx] = summ[tidx];
+}
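+
+// Illustrative launch for sum_beam, following the comment above: one block
+// per (beam, channel) pair, NPACKETS_PER_BLOCK/4 = 512 threads each. A hedged
+// sketch (the helper name is ours); constants come from dsaX_def.h.
+inline void launch_sum_beam(unsigned char *d_in, float *d_out) {
+  dim3 blocks((NCHAN_PER_PACKET / 8) * (NBEAMS / 2)); // 48 * 256 blocks
+  dim3 threads(NPACKETS_PER_BLOCK / 4);               // 512 threads
+  sum_beam<<<blocks, threads>>>(d_in, d_out);
+}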
diff --git a/include/cutlass_interface.h b/include/cutlass_interface.h
new file mode 100644
index 0000000..f95eeaa
--- /dev/null
+++ b/include/cutlass_interface.h
@@ -0,0 +1,172 @@
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/device_memory.h"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/host_tensor_planar_complex.h"
+#include "cutlass/util/reference/device/tensor_fill.h"
+#include "cutlass/util/reference/device/gemm_planar_complex.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/library/handle.h"
+
+using namespace cutlass;
+using namespace gemm;
+using namespace library;
+using namespace layout;
+using namespace reference;
+using namespace device;
+
+// Result structure
+struct Result {
+
+ double runtime_ms;
+ double gflops;
+ Status status;
+ cudaError_t error;
+ bool passed;
+
+ Result(double runtime_ms = 0, double gflops = 0, Status status = Status::kSuccess, cudaError_t error = cudaSuccess):
+ runtime_ms(runtime_ms), gflops(gflops), status(status), error(error), passed(true) { }
+};
+
+// Command line options parsing (testing)
+struct Options {
+
+ bool help;
+ GemmCoord problem_size;
+ int batch_count;
+  complex<float> alpha;
+  complex<float> beta;
+ bool reference_check;
+ int iterations;
+
+  Options():
+    help(false),
+    problem_size({1024, 1024, 1024}),
+    batch_count(256),
+    alpha(1),
+    beta(0),
+    reference_check(false),
+    iterations(2) { }
+
+ // Parses the command line
+ void parse(int argc, char const **args) {
+
+ CommandLine cmd(argc, args);
+ if (cmd.check_cmd_line_flag("help")) {
+ help = true;
+ }
+
+ cmd.get_cmd_line_argument("m", problem_size.m());
+ cmd.get_cmd_line_argument("n", problem_size.n());
+ cmd.get_cmd_line_argument("k", problem_size.k());
+ cmd.get_cmd_line_argument("batch", batch_count);
+
+ cmd.get_cmd_line_argument("alpha", alpha.real());
+ cmd.get_cmd_line_argument("alpha_i", alpha.imag());
+ cmd.get_cmd_line_argument("beta", beta.real());
+ cmd.get_cmd_line_argument("beta_i", beta.imag());
+
+ cmd.get_cmd_line_argument("iterations", iterations);
+ }
+
+ /// Prints the usage statement.
+ std::ostream & print_usage(std::ostream &out) const {
+
+    out << "dsaX_cutlass_interface\n\n"
+        << "  This example uses the CUTLASS Library to execute Planar Complex Array GEMM computations.\n\n"
+        << "Options:\n\n"
+        << "  --help                      If specified, displays this usage statement.\n\n"
+        << "  --m=<int>                   GEMM M dimension\n"
+        << "  --n=<int>                   GEMM N dimension\n"
+        << "  --k=<int>                   GEMM K dimension\n"
+        << "  --batch=<int>               Number of GEMM operations executed in one batch\n"
+        << "  --alpha=<f32>               Epilogue scalar alpha (real part)\n"
+        << "  --alpha_i=<f32>             Epilogue scalar alpha (imaginary part)\n"
+        << "  --beta=<f32>                Epilogue scalar beta (real part)\n"
+        << "  --beta_i=<f32>              Epilogue scalar beta (imaginary part)\n\n"
+        << "  --iterations=<int>          Number of profiling iterations to perform.\n";
+
+ return out;
+ }
+
+ /// Compute performance in GFLOP/s
+ double gflops(double runtime_s) const {
+
+ // Number of real-valued multiply-adds
+ int64_t fmas = problem_size.product() * batch_count * 4;
+
+ // Two flops per multiply-add
+ return 2.0 * double(fmas) / double(1.0e9) / runtime_s;
+ }
+};
+
+/// Performance test environment for planar complex
+class DSA_FTD_ComplexGEMM_CUTLASS {
+
+ // Half-precision input and output
+ using Element = half_t;
+
+ // Configurations for layouts and internal computation
+ using LayoutA = ColumnMajor;
+ using LayoutB = ColumnMajor;
+ using LayoutC = ColumnMajor;
+ using ElementCompute = float;
+ using ElementAccumulator = float;
+
+ Handle handle;
+
+ GemmCoord problem_size;
+ int batch_count;
+  DeviceAllocation<Element> tensor_A;
+  DeviceAllocation<Element> tensor_B;
+  DeviceAllocation<Element> tensor_C;
+  DeviceAllocation<Element> tensor_D;
+  DeviceAllocation<Element> tensor_D_ref;
+
+  DeviceAllocation<void *> ptr_A_real;
+  DeviceAllocation<void *> ptr_A_imag;
+  DeviceAllocation<void *> ptr_B_real;
+  DeviceAllocation<void *> ptr_B_imag;
+  DeviceAllocation<void *> ptr_C_real;
+  DeviceAllocation<void *> ptr_C_imag;
+  DeviceAllocation<void *> ptr_D_real;
+  DeviceAllocation<void *> ptr_D_imag;
+
+ Element *ptr_A;
+ Element *ptr_B;
+ Element *ptr_C;
+ Element *ptr_D;
+
+ int64_t batch_stride_A;
+ int64_t batch_stride_B;
+ int64_t batch_stride_C;
+ int64_t batch_stride_D;
+
+ typename LayoutA::Stride::Index lda;
+ typename LayoutB::Stride::Index ldb;
+ typename LayoutC::Stride::Index ldc;
+ typename LayoutC::Stride::Index ldd;
+
+ int64_t imag_stride_A;
+ int64_t imag_stride_B;
+ int64_t imag_stride_C;
+ int64_t imag_stride_D;
+
+public:
+ // Constructors
+ DSA_FTD_ComplexGEMM_CUTLASS(Options const &options);
+ DSA_FTD_ComplexGEMM_CUTLASS();
+
+ // Methods
+ void initialize();
+ Result run(Options const &options);
+
+ bool testing;
+};
+
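+// Assumed driver flow for this harness, based on the declared methods (the
+// definitions live in the implementation file; this is a sketch only):
+//
+//   Options options;
+//   options.parse(argc, argv);
+//   if (options.help) { options.print_usage(std::cout); return 0; }
+//
+//   DSA_FTD_ComplexGEMM_CUTLASS gemm(options);
+//   gemm.initialize();                 // allocate and fill tensors
+//   Result result = gemm.run(options); // execute and time the batched GEMM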
diff --git a/include/dsaX.h b/include/dsaX.h
new file mode 100644
index 0000000..f370bc0
--- /dev/null
+++ b/include/dsaX.h
@@ -0,0 +1,50 @@
+#pragma once
+
+// Expose the user to compile-time definitions,
+// enums, parameters, and classes
+#include "dsaX_def.h"
+#include "enums.h"
+#include "params.h"
+#include "fast_time_domain.h"
+
+// Uncomment to use the manual transpose (old BLAS) route
+// instead of the new pure cuBLAS path
+//#define OLD_BLAS
+
+/**
+ * Initialize the library. This function will initialize
+ * a device if using CUDA and any BLAS libraries that are
+ * enabled, such as cublas.
+ * @param[in] device_ordinal The GPU device to init
+ */
+void dsaXInit(int device_ordinal = -1);
+
+/**
+ * Finalize the library. This function will finalize
+ * a device if using CUDA and any BLAS libraries that are
+ * enabled, such as cublas. It will also dump any statistics
+ * collected, such as performance metrics.
+ */
+void dsaXEnd();
+
+/**
+ * This function will allocate pinned host memory of the
+ * given size in bytes, and return a void pointer to that
+ * memory. The user may delete the memory safely in their
+ * application code.
+ * @param[in] size The byte size of pinned memory to be allocated
+ * by dsaX.
+ */
+void *dsaXHostRegister(size_t size);
+
+/**
+ * This function allows the user to inspect the (4b,4b) char sized
+ * complex data at byte address i on the host. If 'non-zero' is true
+ * then the complex element will print only if either the real
+ * or imaginary element is non-zero. Useful for checking if
+ * an array is populated.
+ * @param[in] input The (4b,4b) char input array
+ * @param[in] i The ith element of the array
+ * @param[in] non_zero If true, print only elements with non-zero values
+ */
+void inspectPackedData(char input, int i, bool non_zero = false);
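+
+// Typical lifecycle, as a hedged sketch of the calls documented above
+// (the buffer size is illustrative):
+//
+//   dsaXInit(0);                                   // init device 0 and BLAS handles
+//   char *buf = (char *)dsaXHostRegister(1 << 20); // 1 MiB pinned host buffer
+//   ...                                            // fill with packed (4b,4b) data
+//   inspectPackedData(buf[0], 0, true);            // print element 0 if non-zero
+//   dsaXEnd();                                     // dump stats, release device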
diff --git a/include/dsaX_api.h b/include/dsaX_api.h
new file mode 100644
index 0000000..3767600
--- /dev/null
+++ b/include/dsaX_api.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <cuda_runtime.h>
+
+#include "enums.h"
+
+#define STRINGIFY__(x) #x
+#define __STRINGIFY__(x) STRINGIFY__(x)
+
+/**
+ @brief Wrapper around cudaMemcpy or driver API equivalent
+ @param[out] dst Destination pointer
+ @param[in] src Source pointer
+ @param[in] count Size of transfer
+ @param[in] kind Type of memory copy
+*/
+void dsaXMemcpy_(void *dst, const void *src, size_t count, dsaXMemcpyKind kind, const char *func, const char *file,
+ const char *line);
+
+/**
+ @brief Wrapper around cudaMemcpyAsync or driver API equivalent
+ @param[out] dst Destination pointer
+ @param[in] src Source pointer
+ @param[in] count Size of transfer
+ @param[in] kind Type of memory copy
+ @param[in] stream Stream to issue copy
+*/
+void dsaXMemcpyAsync_(void *dst, const void *src, size_t count, dsaXMemcpyKind kind, const cudaStream_t &stream,
+ const char *func, const char *file, const char *line);
+
+
+#define dsaXMemcpy(dst, src, count, kind) \
+ ::dsaXMemcpy_(dst, src, count, kind, __func__, file_name(__FILE__), __STRINGIFY__(__LINE__))
+
+#define dsaXMemcpyAsync(dst, src, count, kind, stream) \
+ ::dsaXMemcpyAsync_(dst, src, count, kind, stream, __func__, file_name(__FILE__), __STRINGIFY__(__LINE__))
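+
+// Example (hedged): the macros expand to the underlying functions with
+// __func__, file, and line appended, so errors report the call site. The
+// dsaXMemcpyHostToDevice enumerator is assumed to be defined in enums.h.
+//
+//   dsaXMemcpy(d_buf, h_buf, bytes, dsaXMemcpyHostToDevice);
+//   dsaXMemcpyAsync(d_buf, h_buf, bytes, dsaXMemcpyHostToDevice, stream);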
diff --git a/src/dsaX_capture.h b/include/dsaX_capture.h
similarity index 100%
rename from src/dsaX_capture.h
rename to include/dsaX_capture.h
diff --git a/src/dsaX_capture_manythread.h b/include/dsaX_capture_manythread.h
similarity index 100%
rename from src/dsaX_capture_manythread.h
rename to include/dsaX_capture_manythread.h
diff --git a/src/dsaX_capture_pcap.h b/include/dsaX_capture_pcap.h
similarity index 100%
rename from src/dsaX_capture_pcap.h
rename to include/dsaX_capture_pcap.h
diff --git a/include/dsaX_def.h b/include/dsaX_def.h
new file mode 100644
index 0000000..5b3af78
--- /dev/null
+++ b/include/dsaX_def.h
@@ -0,0 +1,100 @@
+#pragma once
+
+// default dada block keys
+#define TEST_BLOCK_KEY 0x0000aada // for capture program.
+// 128*3*384*32*2=9437184 for 1 CHANG 1 SNAP 1 REORDER
+// 128*3*384*32*2*4=37748736 for 4 CHANG 1 SNAP 1 REORDER
+// 128*3*384*32*2*8=75497472 for 1 CHANG 1 SNAP 8 REORDER
+#define CAPTURE_BLOCK_KEY 0x0000dada // for capture program.
+// 128*3*384*32*2=9437184 for 1 CHANG 1 SNAP 1 REORDER
+// 150994944 for doSnap
+#define REORDER_BLOCK_KEY 0x0000eada // for reorder program.
+// 589824 for doSnap
+#define REORDER_BLOCK_KEY2 0x0000bada // for reorder program 2.
+// 128*32*1536*16*2*2=402653184 1 REORDER
+// 3221225472 for 8 REORDERS
+#define XGPU_BLOCK_KEY 0x0000fada // for xgpu program.
+// 136*1536*2*8=3342336
+#define COPY_BLOCK_KEY 0x0000dbda // for split off data
+#define BF_BLOCK_KEY 0x0000dcda // for beamformed data
+#define BF_BLOCK_KEY2 0x0000bcda // for beamformed data testing
+#define CAPTURED_BLOCK_KEY 0x0000abda // for capture program.
+#define BEAMCAPTURE_BLOCK_KEY 0x0000bbda // for capture bf program.
+
+// constants
+#define PI 3.14159265359
+#define CVAC 299792458.0
+
+// default number of XGPU ints
+#define NCORRINTS 128
+#define NNATINTS 32 // native number of integrations
+#define NREORDERS 1 // number of ints per reorder
+
+// size of xgpu output
+// TODO
+#define XGPU_SIZE 835584 // size of single output vector (post-GPU)
+#define XGPU_IN_INC 1 // size of input increment
+#define NBASE 4656 // nant*(nant+1)/2
+#define NPOL 2
+#define NCOMPLEX 2 // two reals per complex
+#define NCHAN 1536 // regardless of NCHANG
+
+// default port for packet capture
+#define CAPTURE_PORT 4011
+
+// default UDP packet dims
+#define UDP_HEADER 8 // size of header/sequence number
+#define UDP_DATA 4608 // obs bytes per packet
+#define UDP_PAYLOAD 4616 // header + datasize
+
+// number of channel groups to expect
+#define NCHANG 1
+
+// number of SNAPs to expect
+#define NSNAPS 32
+
+/* expect consecutive channel groups */
+#define CHOFF 1024 // offset in channels of first group
+
+// default control ports
+#define CAPTURE_CONTROL_PORT 11223
+#define REORDER_CONTROL_PORT 11224
+#define XGPU_CONTROL_PORT 11225
+#define WRITEVIS_CONTROL_PORT 11226
+#define TRIGGER_CONTROL_PORT 11227
+
+#define NPACKETS_PER_CALL 2048
+#define NPACKETS_PER_BLOCK 2048
+#define NPACKETS_INTS 2048 // number of packets per xgpu int
+#define NPACKETS_PER_FIL 2
+#define NPACKETS 2048
+#define NOUTBLOCKS 15 // number of input blocks stored by trigger
+#define NANTS 96
+#define NCHAN_PER_PACKET 384
+#define NBEAMS 512
+
+// for beamformer
+//#define sep 1.0 // arcmin
+#define NW 48 // number of weights per 384 chans. Also the number of channels formed
+#define NANT 63
+#define BEAM_OUT 23
+#define NSTREAMS 4
+#define NBP 8 // number of previous BPs to average
+
+// for second corner turn
+#define FIL_PORT0 6625 // port for first chan group
+#define NCLIENTS 16 // number of client dbnic processes to expect
+#define NSAMPS_PER_BLOCK 16384 // number of samples per block
+#define NCHAN_FIL 1024 // final number of filterband chans
+#define NBEAMS_PER_BLOCK 64 // number of beams to expect
+#define NSAMPS_PER_TRANSMIT 512 // number of samples transmitted at one time
+#define NBMS 256
+#define P_SIZE 4108
+#define NWAIT 100000
+
+// required to prevent overflow in corr matrix multiply
+#define halfFac 4
+
+// beam sep
+#define sep 1.0 // arcmin
+
diff --git a/include/dsaX_malloc.h b/include/dsaX_malloc.h
new file mode 100644
index 0000000..04d24b0
--- /dev/null
+++ b/include/dsaX_malloc.h
@@ -0,0 +1,113 @@
+#pragma once
+
+#include <cstdlib>
+#include <cstdio>
+#include <unistd.h> // for getpagesize()
+#include <execinfo.h> // for backtrace
+#include <map>